This repository has been archived by the owner on Nov 28, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 14
/
time_aligned_text.py
132 lines (110 loc) · 4.49 KB
/
time_aligned_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python
"""
Class for holding time_aligned text
"""
import hashlib
import importlib
import os
from asrtoolkit.file_utils.name_cleaners import (generate_segmented_file_name,
sanitize_hyphens)
class time_aligned_text(object):
    """
    Class for storing time-aligned text and converting between formats

    Attributes:
        location: path of the file the transcript was read from ("" if none)
        segments: list of segment objects that make up the transcript
        file_extension: extension naming the data handler module used to
            format segments (None until data has been loaded)
    """

    # Class-level defaults are kept for backwards compatibility. __init__
    # rebinds `segments` per instance so the mutable list is never shared.
    location = ""
    segments = []
    file_extension = None

    def __init__(self, input_data=None):
        """
        Instantiates a time_aligned text object

        If 'input_data' is a string naming an existing path, that file is
        read. Any other string is parsed in memory with the "txt" handler and
        a dict is parsed in memory with the "json" handler. Any other value
        (including None) yields an empty transcript.

        >>> transcript = time_aligned_text()
        """
        # Fresh list per instance: without this, every empty transcript would
        # alias the single class-level list and share appended segments
        self.segments = []

        if isinstance(input_data, str) and os.path.exists(input_data):
            self.read(input_data)
        elif isinstance(input_data, (str, dict)):
            self.file_extension = ("txt" if isinstance(input_data, str) else
                                   "json")
            data_handler = self._data_handler(self.file_extension)
            self.segments = data_handler.read_in_memory(input_data)

    @staticmethod
    def _data_handler(file_extension):
        """
        Imports and returns the asrtoolkit data handler module for an extension

        Raises ImportError if no handler module exists for 'file_extension'.
        """
        return importlib.import_module(
            "asrtoolkit.data_handlers.{:}".format(file_extension))

    def hash(self):
        """
        Returns a sha1 hash of the file

        If no location is set, returns the sha1 hash of the empty string.
        """
        if not self.location:
            return hashlib.sha1("".encode()).hexdigest()

        # Decode as UTF-8 explicitly (the encoding `write` uses) so the hash
        # does not depend on the platform's default locale encoding
        with open(self.location, encoding="utf-8") as f:
            return hashlib.sha1(f.read().encode()).hexdigest()

    def __str__(self):
        """
        Returns string representation of formatted segments in the
        corresponding format, one segment per line
        By default, use the extension of the file you loaded
        (falls back to "txt" when no extension is set)

        >>> transcript = time_aligned_text()
        >>> print(transcript.__str__()=="")
        True
        """
        data_handler = self._data_handler(
            self.file_extension if self.file_extension else "txt")
        return "\n".join(seg.__str__(data_handler) for seg in self.segments)

    def __add__(self, other):
        """
        Add two transcripts

        Returns a new transcript holding the segments of both operands sorted
        by start time then stop time; it takes this transcript's
        file_extension and has no location.
        Set the location after adding if you want to save this!
        """
        # List concatenation builds a new list, so neither operand is mutated
        new_segments = self.segments + other.segments
        # Sort the segments by their start time then stop time
        new_segments.sort(key=lambda s: (float(s.start), float(s.stop)))

        out_transcript = time_aligned_text()
        out_transcript.file_extension = self.file_extension
        out_transcript.segments = new_segments
        return out_transcript

    def text(self):
        """
        Returns unformatted text from all segments, joined by single spaces
        """
        data_handler = self._data_handler("txt")
        return " ".join(seg.__str__(data_handler) for seg in self.segments)

    def read(self, file_name):
        """
        Read a file using class-specific read function

        The text after the last "." in 'file_name' selects the data handler;
        'location', 'file_extension' and 'segments' are all replaced.
        """
        self.file_extension = file_name.split(".")[-1]
        self.location = file_name
        data_handler = self._data_handler(self.file_extension)
        self.segments = data_handler.read_file(file_name)

    def write(self, file_name):
        """
        Output to file using segment-specific __str__ function

        The output format is chosen from the text after the last "." in
        'file_name', defaulting to "stm" when there is no ".". The name is
        passed through sanitize_hyphens before the file is opened.

        Returns a new time_aligned_text built from the written file name.
        """
        file_extension = file_name.split(
            ".")[-1] if "." in file_name else "stm"

        file_name = sanitize_hyphens(file_name)

        data_handler = self._data_handler(file_extension)
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(data_handler.header())
            # The joined body is a single string, so write it in one call
            f.write(
                data_handler.separator.join(
                    seg.__str__(data_handler) for seg in self.segments))
            f.write(data_handler.footer())

        # return back new object in case we are updating a list in place
        # NOTE(review): a name without "." is written with the "stm" handler,
        # but the re-read below infers the handler from the name via read();
        # confirm whether extension-less names are meant to be supported
        return time_aligned_text(file_name)

    def split(self, target_dir):
        """
        Split transcript into many pieces based on valid segments of transcript

        Creates 'target_dir' if needed and writes one single-segment
        transcript per segment, named by generate_segmented_file_name from
        the target directory, this transcript's location and the segment index.
        """
        os.makedirs(target_dir, exist_ok=True)
        for iseg, seg in enumerate(self.segments):
            new_seg = time_aligned_text()
            new_seg.file_extension = self.file_extension
            new_seg.location = generate_segmented_file_name(
                target_dir, self.location, iseg)
            new_seg.segments = [seg]
            new_seg.write(new_seg.location)
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module
    from doctest import testmod

    testmod()