Skip to content

Commit

Permalink
working jsonl
Browse files Browse the repository at this point in the history
  • Loading branch information
daanelson committed Jul 11, 2023
1 parent 0aa7f78 commit 622f3f3
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 7 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
example-data/**
parsed_data/**
checkpoints/**
test-data/**
16 changes: 9 additions & 7 deletions process_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ def make_tarfile(output_filename, source_dir):
tar.add(full_path, arcname=os.path.relpath(full_path, source_dir))
return Path(output_filename)

def download_file(url, folder, index):
    """Run ``pget`` for ``url`` with ``folder`` as its target and return the URL's final segment.

    ``index`` is accepted for the caller's convenience but is not used.
    The returned value is whatever follows the last ``/`` in ``url``.
    ``subprocess.check_call`` raises ``CalledProcessError`` when ``pget``
    exits with a non-zero status.
    """
    pget_command = ["pget", url, folder]
    subprocess.check_call(pget_command)
    return url.rsplit('/', 1)[-1]
def download_file(url, folder):
    """Download ``url`` with ``pget`` and return the name it is stored under.

    The file name is the last segment of the URL's path. Any query string or
    fragment is ignored when naming the file, so a URL such as
    ``https://host/a/clip.wav?token=abc`` is stored as ``clip.wav``; the
    complete, unmodified URL is still what gets handed to ``pget``.

    Args:
        url: Address of the file to fetch.
        folder: Directory that the destination path is built under.

    Returns:
        The file name; ``os.path.join(folder, file_name)`` is the destination
        path passed to ``pget``.

    Raises:
        ValueError: If no file name can be derived from ``url`` (for example
            when its path is empty or ends with ``/``).
        subprocess.CalledProcessError: If ``pget`` exits with a non-zero
            status.
    """
    # Imported locally so the function does not rely on a module-level import.
    from urllib.parse import urlsplit

    # Name the file from the path component only: query parameters (e.g. the
    # signature on a pre-signed URL) must not leak into the on-disk name.
    file_name = urlsplit(url).path.split('/')[-1]
    if not file_name:
        # An empty name would make the destination the folder itself.
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")
    subprocess.check_call(["pget", url, os.path.join(folder, file_name)])
    return file_name

def write_file(text, folder, audio_fname):
"""Writes text file consistent with audio filename"""
Expand Down Expand Up @@ -90,6 +90,8 @@ def predict(self,
untar(audio_files, self.audio_folder)
untar(text_files, self.text_folder)
elif jsonl_data:
os.mkdir(self.audio_folder)
os.mkdir(self.text_folder)
self.parse_jsonl(jsonl_data)
else:
raise ValueError("You need to pass either audio & text or a jsonl of files")
Expand All @@ -104,9 +106,9 @@ def parse_jsonl(self, jsonl_path):
json_object = json.loads(line)
data.append(json_object)
for ind, row in enumerate(data):
audio_fname = download_file(row['audio'], self.audio_folder, ind)
audio_fname = download_file(row['audio'], self.audio_folder)
if 'https:' in row['sentence']:
download_file(row['sentence'], self.text_folder, ind)
download_file(row['sentence'], self.text_folder)
else:
write_file(row['sentence'], self.text_folder, audio_fname)

Expand Down

0 comments on commit 622f3f3

Please sign in to comment.