In [2]:
%matplotlib inline

In [3]:
import json, glob, os
from pathlib import Path
import string

## Remove Theme Song words from Assembly AI speech-to-text transcripts

This notebook was used to extract words from the Friends' them song, which were picked up somewhat inconsistently by the Assembly AI speech-to-text universal model. The goal was to produce cleaner transcripts and to improve the alignment with the fan-made transcripts to improve the assignment of speaker tags to the AI transcript.

In [224]:
"""
Some support functions
"""

def prep_aa(fpath):
    """."""
    with open(fpath, 'r') as file:
        aa_transcript = json.load(file)

    return aa_transcript

def get_paths(s, e, ext=""):
    """
    Returns paths to all .json files that match
    an episode number, just in case theme song is 
    not in first half for some odd reason
    """
    return sorted(
        glob.glob(
            "/home/mstlaure/Documents/Marie"
            "/neuromod/algonauts/dev_scripts_features_tseries_ridge"
            "/speech2text_friends_data/json_aa_wUtter/edited"
            f"/s{s}/friends_s0{s}e{e}*_aaUt{ext}.json"               
        )
    )


#key_phrase = "one told you"
key_phrase = "be this way"
#key_phrase = "there for you"

def show_transcripts(s, e, verbose=True, ext=""):
    """
    Flags transcripts that contain theme wong words,
    and return their content

    Prints a version split into halfs (before and 
    after theme song) to ease manual edits
    """
    jfiles = []
    for p in get_paths(s, e, ext):
        jfile = prep_aa(p)
        tr_text = jfile['transcript']
        if key_phrase in tr_text.lower():
            print(os.path.basename(p))
            seg_sizes = []
            for seg in tr_text.split(key_phrase):
                seg_sizes.append(len(seg))
                if verbose:
                    print(f"Segment length in chars: {len(seg)}")
                    print(f"{seg} \n")
            assert len(seg_sizes) ==2
            jfiles.append((p, jfile, seg_sizes))
    return jfiles


#### Keep track of edited episodes

s7 : 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 (complete)

s6 : 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15a, (15c has no theme song), (no 16), 17, 18, 19, 20, 21, 22, 23, 24 (complete)

s5 : 01, 02, (03 has theme song removed), 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 (complete)

s4: only 23d has the theme song at the end

In [235]:
s = "4"
e = "23"

jf = show_transcripts(s, e)

friends_s04e23d_aaUt.json
Segment length in chars: 4954
Well, I've never done that with you before. Nope. So how are you? You okay? Yep. Yep. You? Oh, yes, yes. Uh huh. You. We did you. Well, I better get going. Oh, yeah, absolutely. Could you not look? I don't want to look. Hello? Hey. Where the hell have you been? Hey. I spent the night out. I met this cute bridesmaid. She is so. I. I don't want to hear about her. Aw, Pheeves, you know you're still my number one girl. No, no. We have an emergency. Okay. Rachel's coming to London. Oh, great. Well, it's not great. No, she's coming to tell Ross that she loves him. But he loves Emily. I know that. You have to stop her. She's gonna ruin the wedding. Okay. All right. So. Okay, hold on, hold on. Rachel Cummings, do something. Okay, so then I've done my part. Okay? It's your responsibility now. The burden is off me, right? Right. So tell me about this girl. And so then I realized all the stuff that I had been doing. Proposing to Joshua, lyin

In [236]:
# Check how many transcripts need fixing for that episode
print(len(jf))  

1


In [184]:
# For an episode, process each transcript that has the theme song in it (typically transcript a)
t = 0
out_path = jf[t][0]
td = jf[t][1]

print(os.path.basename(out_path))
print(td.keys())

friends_s04e23d_aaUt.json
dict_keys(['transcript', 'words', 'sentences'])


In [185]:
# Copy transcript segments before and after them song to remove it from the transcript text
seg_1 = "Well, I've never done that with you before. Nope. So how are you? You okay? Yep. Yep. You? Oh, yes, yes. Uh huh. You. We did you. Well, I better get going. Oh, yeah, absolutely. Could you not look? I don't want to look. Hello? Hey. Where the hell have you been? Hey. I spent the night out. I met this cute bridesmaid. She is so. I. I don't want to hear about her. Aw, Pheeves, you know you're still my number one girl. No, no. We have an emergency. Okay. Rachel's coming to London. Oh, great. Well, it's not great. No, she's coming to tell Ross that she loves him. But he loves Emily. I know that. You have to stop her. She's gonna ruin the wedding. Okay. All right. So. Okay, hold on, hold on. Rachel Cummings, do something. Okay, so then I've done my part. Okay? It's your responsibility now. The burden is off me, right? Right. So tell me about this girl. And so then I realized all the stuff that I had been doing. Proposing to Joshua, lying to Ross about why I couldn't come to the wedding was all just a way of. Oh, oh, oh, oh. I'm sorry, can I interrupt here? I just want to say that you are a horrible, horrible person. Pardon me? You say you love this man and yet you're about to ruin the happiest day of his life. I'm afraid I have to agree with your friend. This is a terrible, terrible plan. But he has to know how I feel. But why? He loves this. This Emily person. No good can come of this. Well, I think you're wrong. Oh, no. Well, he doesn't really love her. I mean, it's just a rebound thing from me. You'll see. Fortunately, I won't. By the. It seems to be perfectly clear that you were on a break. Do something. Something. Hey. Hey. Have you seen Monica? I'm not seeing Monica. Bu. What? What? Look, we. We gotta find her. Phoebe just called. Rachel's coming to tell Ross she loves him. Oh, my God. I know. That's why we gotta find Monica. Do you know where she is? No. Okay, what's with the third degree? Why don't you shine a light in my eyes? Oh, my God. It's like a fairyland. I know. It's horrible, isn't it? Well, I love it. I only hope my wedding looks this good. I just hope you can let some of them go by. Psst. Rick. All right, we really need to start looking out for Rachel. I'll cover the front door. You watch that big hole at the Back of the building. And I got Chandler covering Ross. Why would I care where Chandler is? You know? You know, sometimes I don't even like Chandler. Okay, I do. I do. I do. Oh, yeah, you're right. It's the second one. Right? Hey, Joey. Hey, Felicity. I thought about you all day. Yeah? Talk New York to me again. Forget about it. How you doing? If there's nothing to discuss, we're not paying for your wine cellar. You have to meet me in the middle here. Hey, you keep pushing me on this, my foot's gonna meet the middle of your ass. What's going on? Nothing. Nothing. Everything's under control. You want a piece of me, sir? Is that what you're saying? Want a piece of me? Okay, that's it. Parents. Parents. Back away, all right? This is our wedding day. From now on, everyone gets along. And if I hear one more word, no grandchildren. That's right. Okay, okay. Sorry, old boy. Sorry. Sorry. I could kill you with my thumb, you know. What was all that about? It was this disagreement or. My God, You. You look beautiful. Oh. Oh. You're not meant to see me before the wedding's bad luck. You know what? I think we've had all the bad luck. We're going to it. Thank God. Rachel, you're here. I can't believe it. What happened? Why are you here? Well, I just came. I just needed to tell you congratulations. It's. Hello, Waltham Interiors. Mrs. Waltham, hi, it's Phoebe again. Why? Yeah, can I please, please, please talk to one of the best men? This is gonna be the last time. I promise. Joey, there's a girl on the phone for you. Oh, great. Hello? Joey, it's Phoebe. Did you stop Rachel? No, but it's okay. She just came in and gave him a hug. That's it? Nothing got ruined? No. Oh, that's so great. Oh, so, what's going on now? I'm walking down the aisle. Still walking. I'm about to pass the bridesmaid I hooked up with last night. Hey, I just told her, hey. Okay, now I'm at the front with Ross. It's Phoebe. He looks pretty mad. I better go. No, wait, wait, wait. Oh, please hold. Hold it up so I can listen. What we did last night was stupid. Totally crazy stupid. What were we thinking? I'm coming over tonight though, right? Oh, yeah, Definitely. Friends, family are gathered to celebrate here today the joyous union of Ross and Emily. Be the happiness we share with them today. Be with them always. Now, Emily, repeat after me. I, Emily. I, Emily. Take thee, Ross. Take thee, Ross. As my lawfully wedded husband, in sickness and in health, till death parts us. As my lawfully wedded husband, in sickness and in health, until death parts us. Our Ross. Repeat after me. Ay. Ross. I, Ross, Take thee, Emily. Take thee, Rachel. Emily. Emily. Shall I go on?"
seg_2 = "Mother. Mother. Peter. Peter. Take one. Oh, the band's ready. Well, we gotta do what the band says. I don't care about the stupid band. Well, it's getting late. Here we go. Actually, you guys would mind staying here with me for a while. You know, we gotta get up early and catch that plane to New York. Yeah, it's a very large plane. Why? What's the matter? Sa."

# Sanity check
new_transcript = seg_1 + " " + seg_2
#print("be this way" in new_transcript)
print("no one told" in new_transcript)
print(new_transcript)


False
Well, I've never done that with you before. Nope. So how are you? You okay? Yep. Yep. You? Oh, yes, yes. Uh huh. You. We did you. Well, I better get going. Oh, yeah, absolutely. Could you not look? I don't want to look. Hello? Hey. Where the hell have you been? Hey. I spent the night out. I met this cute bridesmaid. She is so. I. I don't want to hear about her. Aw, Pheeves, you know you're still my number one girl. No, no. We have an emergency. Okay. Rachel's coming to London. Oh, great. Well, it's not great. No, she's coming to tell Ross that she loves him. But he loves Emily. I know that. You have to stop her. She's gonna ruin the wedding. Okay. All right. So. Okay, hold on, hold on. Rachel Cummings, do something. Okay, so then I've done my part. Okay? It's your responsibility now. The burden is off me, right? Right. So tell me about this girl. And so then I realized all the stuff that I had been doing. Proposing to Joshua, lying to Ross about why I couldn't come to the wedding

In [186]:
# Modify transcript with corrected version 
td['transcript'] = new_transcript 


In [136]:
"""
Single word manual spelling fixes
"""

"""
for i in range(192, 196):
    print(i, td["words"][i])

td["words"][193]['word'] = "I'm"

temp_w = {
    'word': 'good.',
    'start': 82.07,
    'end': 82.27,
    'speaker': None,
    'confidence': 0.9873047
}

td["words"] = td["words"][:194] + [temp_w] + td["words"][194:]

print()
for i in range(192, 196):
    print(i, td["words"][i])
"""

#print(td["words"][194:196])

#td["words"][194]['word'] = 'Hey,'
#td["words"][195]['word'] = 'hey.'

#td["words"][194:196]

[{'word': 'hey.', 'start': 118.34, 'end': 118.58, 'speaker': None, 'confidence': 0.84643555}, {'word': 'Hey.', 'start': 118.58, 'end': 118.94, 'speaker': None, 'confidence': 0.98046875}]


[{'word': 'Hey,',
  'start': 118.34,
  'end': 118.58,
  'speaker': None,
  'confidence': 0.84643555},
 {'word': 'hey.',
  'start': 118.58,
  'end': 118.94,
  'speaker': None,
  'confidence': 0.98046875}]

In [187]:
# Remove theme song words

# Pick a rare-ish key word close to theme song onset
key_word = "Shall"

# get key word index
for j, w in enumerate(td["words"]):
    if key_word in w['word']:
        print(j, w['word'])


938 Shall


In [188]:
# copy key word index from above
kw = 938

for j in range(kw, kw+80):
    print(j, td["words"][j]["word"])

938 Shall
939 I
940 go
941 on?
942 No
943 one
944 told
945 you
946 life
947 was
948 gonna
949 be
950 this
951 way.
952 Your
953 job's
954 a
955 joke.
956 You
957 broke
958 your
959 love.
960 Life's
961 the
962 old
963 way.
964 It's
965 like
966 you're
967 always
968 stuck
969 in
970 second
971 gear.
972 When
973 it
974 hasn't
975 been
976 your
977 day,
978 week,
979 month.
980 I'll
981 be
982 there
983 for
984 you.
985 I'll
986 be
987 there
988 for
989 you.
990 Mother.
991 Mother.
992 Peter.
993 Peter.
994 Take
995 one.
996 Oh,
997 the
998 band's
999 ready.
1000 Well,
1001 we
1002 gotta
1003 do
1004 what
1005 the
1006 band
1007 says.
1008 I
1009 don't
1010 care
1011 about
1012 the
1013 stupid
1014 band.
1015 Well,
1016 it's
1017 getting


In [189]:
# Pick boundary indices to remove theme song words
e_1 = 942  # First word of theme song
e_2 = 990  # First word of dialog AFTER the theme song

# sanity check: make sure the theme song words are removed, and nothing else is missing
new_words = td["words"][:e_1] + td["words"][e_2:]

for j in range(kw, kw+80):    
    print(j, new_words[j]["word"])

938 Shall
939 I
940 go
941 on?
942 Mother.
943 Mother.
944 Peter.
945 Peter.
946 Take
947 one.
948 Oh,
949 the
950 band's
951 ready.
952 Well,
953 we
954 gotta
955 do
956 what
957 the
958 band
959 says.
960 I
961 don't
962 care
963 about
964 the
965 stupid
966 band.
967 Well,
968 it's
969 getting
970 late.
971 Here
972 we
973 go.
974 Actually,
975 you
976 guys
977 would
978 mind
979 staying
980 here
981 with
982 me
983 for
984 a
985 while.
986 You
987 know,
988 we
989 gotta
990 get
991 up
992 early
993 and
994 catch
995 that
996 plane
997 to
998 New
999 York.
1000 Yeah,
1001 it's
1002 a
1003 very
1004 large
1005 plane.
1006 Why?
1007 What's
1008 the
1009 matter?
1010 Sa.


IndexError: list index out of range

In [190]:
# Modify words with corrected version 
td["words"] = new_words



In [18]:
"""
Manual sentences edits
"""

#print(td["sentences"][17], td["sentences"][17]['words'][-1])
#print()

#td["sentences"][17]['text'] = 'Goodbye muscles.'
#td["sentences"][17]['words'][-1]['word'] = 'muscles.' 

#print(td["sentences"][17], td["sentences"][17]['words'][-1])

#print(td["sentences"][29])
#print()
#td["sentences"][29]['text'] = "We still need a tip."
#td["sentences"][29]['words'] = td["sentences"][29]['words'][-5:]

#td["sentences"][29]['start'] = td["sentences"][29]['words'][0]['start']
#td["sentences"][29]['words'][0]['word'] = "We"

#print(td["sentences"][29])

"""
print(td["sentences"][44])

td["sentences"][44]['text'] = "I'm good."
td["sentences"][44]['words'] = td["sentences"][44]['words'] + [temp_w] 
td["sentences"][44]['words'][0]['word'] = "I'm"
td["sentences"][44]['end'] = td["sentences"][44]['words'][-1]['end']

print()
print(td["sentences"][44])
"""

"""
for i in range(37, 39):
    print(i, td["sentences"][i])

td["sentences"][37]['text'] = 'Hey, hey.'
td["sentences"][37]['words'] = td["sentences"][37]['words'][1:] + td["sentences"][38]['words']
td["sentences"][37]['start'] = td["sentences"][37]['words'][0]['start']
td["sentences"][37]['end'] = td["sentences"][37]['words'][-1]['end']
td["sentences"][37]['words'][0]['word'] = 'Hey,'
td["sentences"][37]['words'][-1]['word'] = 'hey.'

td["sentences"] = td["sentences"][:38] + td["sentences"][39:]

print()
for i in range(37, 39):
    print(i, td["sentences"][i])
"""

"""
print(td["sentences"][60])

td["sentences"][60]['text'] = "Monica, I'm sorry I'm late."
td["sentences"][60]['words'] = td["sentences"][60]['words'][-5:]
td["sentences"][60]['start'] = td["sentences"][60]['words'][0]['start']

print()
print(td["sentences"][60])
"""

{'text': "It's like you're always stuck in second gear when it hasn't been your day, week, your month We've been your year But I'll be there for you when race starts to pour I'll be there for you Like I've been there before I'll be there for you There for me too Monica, I'm sorry I'm late.", 'start': 122.41, 'end': 157.35, 'speaker': None, 'confidence': 0.9873047, 'words': [{'word': "It's", 'start': 122.41, 'end': 122.77, 'speaker': None, 'confidence': 0.9873047}, {'word': 'like', 'start': 122.77, 'end': 123.01, 'speaker': None, 'confidence': 0.9980469}, {'word': "you're", 'start': 123.01, 'end': 123.45, 'speaker': None, 'confidence': 0.98567706}, {'word': 'always', 'start': 123.45, 'end': 123.77, 'speaker': None, 'confidence': 0.9995117}, {'word': 'stuck', 'start': 123.85, 'end': 124.29, 'speaker': None, 'confidence': 0.91748047}, {'word': 'in', 'start': 124.29, 'end': 124.65, 'speaker': None, 'confidence': 0.9995117}, {'word': 'second', 'start': 124.65, 'end': 125.05, 'speaker': None

In [191]:
# Identify theme song boundaries
for i, s in enumerate(td["sentences"]):
    print(i, s["text"])

0 Well, I've never done that with you before.
1 Nope.
2 So how are you?
3 You okay?
4 Yep.
5 Yep.
6 You?
7 Oh, yes, yes.
8 Uh huh.
9 You.
10 We did you.
11 Well, I better get going.
12 Oh, yeah, absolutely.
13 Could you not look?
14 I don't want to look.
15 Hello?
16 Hey.
17 Where the hell have you been?
18 Hey.
19 I spent the night out.
20 I met this cute bridesmaid.
21 She is so.
22 I. I don't want to hear about her.
23 Aw, Pheeves, you know you're still my number one girl.
24 No, no.
25 We have an emergency.
26 Okay.
27 Rachel's coming to London.
28 Oh, great.
29 Well, it's not great.
30 No, she's coming to tell Ross that she loves him.
31 But he loves Emily.
32 I know that.
33 You have to stop her.
34 She's gonna ruin the wedding.
35 Okay.
36 All right.
37 So.
38 Okay, hold on, hold on.
39 Rachel Cummings, do something.
40 Okay, so then I've done my part.
41 Okay?
42 It's your responsibility now.
43 The burden is off me, right?
44 Right.
45 So tell me about this girl.
46 And so the

In [192]:
# Set theme song boundaries
s_1 = 211  # first sentence of theme song
s_2 = 219   # first sentence AFTER end of theme song


new_sentences = td["sentences"][:s_1] + td["sentences"][s_2:]

# Sanity check: check that theme song sentences are removed
for i in range(s_1-5, s_2):
    print(i, new_sentences[i]["text"])
    

206 I, Ross, Take thee, Emily.
207 Take thee, Rachel.
208 Emily.
209 Emily.
210 Shall I go on?
211 Mother.
212 Mother.
213 Peter.
214 Peter.
215 Take one.
216 Oh, the band's ready.
217 Well, we gotta do what the band says.
218 I don't care about the stupid band.


In [193]:
# Modify sentences with corrected version 
td["sentences"] = new_sentences



In [194]:
# When all looks good, export results to new .json file

with open(out_path.replace('.json', '_ed.json'), "w") as outfile:
    json.dump(td, outfile)
    

In [195]:
# Final sanity check: load edited 
jf = show_transcripts(s, e, "_ed")
len(jf)==0

True

## QCed edited transcripts

s7: 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 (complete and QCed)

s6: 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15a, 17, 18, 19, 20, 21, 22, 23, 24 (complete and QCed)

s5: 01, 02, (03 had the theme song removed), 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23

s4: only 23d has the theme song at the end

In [196]:
# Final sanity checks: load episode w theme song (before edits)
s = "4"
e = "23"
jf = show_transcripts(s, e, verbose=False)
print(f"{len(jf)} segments for s0{s}e{e}: ")  
#assert len(jf) == 1

# For an episode, flag each transcript that has the theme song in it (typically transcript a)
t = 0
out_path = jf[t][0]
td = jf[t][1]
len_seg_1, len_seg_2 = jf[t][2]
print(os.path.basename(out_path))

# load edited transcript for comparison
e_td = prep_aa(out_path.replace('.json', '_ed.json'))


friends_s04e23d_aaUt.json
1 segments for s04e23: 
friends_s04e23d_aaUt.json


In [197]:
# Compare raw transcript segments

"""
print(f"{td['transcript'][:len_seg_1]} \n")
print(f"{e_td['transcript'][:len_seg_1]} \n \n")

print(f"{td['transcript'][len_seg_1:]} \n")
print(f"{e_td['transcript'][len_seg_1:]} \n \n")

"""
print(f"{td['transcript'][:len(seg_1)]} \n")
print(f"{e_td['transcript'][:len(seg_1)]} \n \n")

print(f"{td['transcript'][len(seg_1):]} \n")
print(f"{e_td['transcript'][len(seg_1):]} \n \n")



Well, I've never done that with you before. Nope. So how are you? You okay? Yep. Yep. You? Oh, yes, yes. Uh huh. You. We did you. Well, I better get going. Oh, yeah, absolutely. Could you not look? I don't want to look. Hello? Hey. Where the hell have you been? Hey. I spent the night out. I met this cute bridesmaid. She is so. I. I don't want to hear about her. Aw, Pheeves, you know you're still my number one girl. No, no. We have an emergency. Okay. Rachel's coming to London. Oh, great. Well, it's not great. No, she's coming to tell Ross that she loves him. But he loves Emily. I know that. You have to stop her. She's gonna ruin the wedding. Okay. All right. So. Okay, hold on, hold on. Rachel Cummings, do something. Okay, so then I've done my part. Okay? It's your responsibility now. The burden is off me, right? Right. So tell me about this girl. And so then I realized all the stuff that I had been doing. Proposing to Joshua, lying to Ross about why I couldn't come to the wedding was a

In [198]:
# Check words
skip_words = {"comic", "comet", "kiss", "pa", "mus"}#, "weebs", "brock", "where", "taylor", "shanine", "oil"}
clean_string = str.maketrans('', '', string.punctuation + string.whitespace)

i_0 = 0
i_n = len(td["words"])
j = 0
j_n = len(e_td["words"])

unmatched_words = []
for i in range(i_n-i_0):
    td_w = td["words"][i+i_0]["word"].lower().translate(clean_string)
    if j < j_n:
        etd_w = e_td["words"][j]["word"].lower().translate(clean_string)
        if td_w != etd_w and td_w not in skip_words:
            unmatched_words.append((i+i_0, td_w))
        else: 
            j += 1

for i in range(len(unmatched_words)):
    print(unmatched_words[i])

(942, 'no')
(943, 'one')
(944, 'told')
(945, 'you')
(946, 'life')
(947, 'was')
(948, 'gonna')
(949, 'be')
(950, 'this')
(951, 'way')
(952, 'your')
(953, 'jobs')
(954, 'a')
(955, 'joke')
(956, 'you')
(957, 'broke')
(958, 'your')
(959, 'love')
(960, 'lifes')
(961, 'the')
(962, 'old')
(963, 'way')
(964, 'its')
(965, 'like')
(966, 'youre')
(967, 'always')
(968, 'stuck')
(969, 'in')
(970, 'second')
(971, 'gear')
(972, 'when')
(973, 'it')
(974, 'hasnt')
(975, 'been')
(976, 'your')
(977, 'day')
(978, 'week')
(979, 'month')
(980, 'ill')
(981, 'be')
(982, 'there')
(983, 'for')
(984, 'you')
(985, 'ill')
(986, 'be')
(987, 'there')
(988, 'for')
(989, 'you')


In [199]:
start_k = unmatched_words[0][0] - 5
end_k = unmatched_words[-1][0] + 6
for k in range(start_k, end_k):
    print(td["words"][k]["word"])

Emily.
Shall
I
go
on?
No
one
told
you
life
was
gonna
be
this
way.
Your
job's
a
joke.
You
broke
your
love.
Life's
the
old
way.
It's
like
you're
always
stuck
in
second
gear.
When
it
hasn't
been
your
day,
week,
month.
I'll
be
there
for
you.
I'll
be
there
for
you.
Mother.
Mother.
Peter.
Peter.
Take


In [None]:
#print(td["words"][290:295])

In [200]:
# Check sentences


# Check words
skip_sentences = {
    "ross whens this comic thing start", "ross whens this comet thing start",
    "youre in a great place in your life", "so that guy there straight or gay",
    "but ill be there for the wedding is so close", "there are no words",
    "hello", "mrs brock", "where are you", "come on weebs",
    "its like youre always st why are we in bed together",
    "so is shanine around", "taylor come on", "look in ross defense okay no one told you life was gonna be this way",
    "if it says boil 2 cups of salt you just oil 2 cups of salt",
    "its like youre always stuck in second gear when it hasnt been your day a week a month weve been your year but ill be there for you cause youre there for me too hi",
    "hi", "cuz youre there for you too monica", "if sure", 'my office finally got wrinkle free fax pa',
    'he thought you said go ned', 'goodbye mus', 'hes the one doing your',
    'when it hasnt been your day week month weve been your year be there for you ill be there for you like ive been there before ill be there for you we still need a tip',
    "im"
}
clean_string = str.maketrans('', '', string.punctuation)

i_0 = 0
i_n = len(td["sentences"])
j = 0
j_n = len(e_td["sentences"])

unmatched_sentences = []
for i in range(i_n-i_0):
    td_s = td["sentences"][i+i_0]["text"].lower().translate(clean_string)
    if j < j_n:
        etd_s = e_td["sentences"][j]["text"].lower().translate(clean_string)
        if td_s != etd_s and td_s not in skip_sentences:
            unmatched_sentences.append((i+i_0, td_s))
        else: 
            j += 1

for i in range(len(unmatched_sentences)):
    print(unmatched_sentences[i])

(211, 'no one told you life was gonna be this way')
(212, 'your jobs a joke')
(213, 'you broke your love')
(214, 'lifes the old way')
(215, 'its like youre always stuck in second gear')
(216, 'when it hasnt been your day week month')
(217, 'ill be there for you')
(218, 'ill be there for you')


In [201]:
start_k = unmatched_sentences[0][0] - 2
end_k = unmatched_sentences[-1][0] + 3
for k in range(start_k, end_k):
    print(td["sentences"][k]["text"])

Emily.
Shall I go on?
No one told you life was gonna be this way.
Your job's a joke.
You broke your love.
Life's the old way.
It's like you're always stuck in second gear.
When it hasn't been your day, week, month.
I'll be there for you.
I'll be there for you.
Mother.
Mother.


In [202]:
start_k = unmatched_words[0][0] - 5
end_k = unmatched_words[-1][0] + 6
for k in range(start_k, end_k):
    print(e_td["words"][k]["word"])

Emily.
Shall
I
go
on?
Mother.
Mother.
Peter.
Peter.
Take
one.
Oh,
the
band's
ready.
Well,
we
gotta
do
what
the
band
says.
I
don't
care
about
the
stupid
band.
Well,
it's
getting
late.
Here
we
go.
Actually,
you
guys
would
mind
staying
here
with
me
for
a
while.
You
know,
we
gotta
get
up
early
and
catch


In [203]:
start_k = unmatched_sentences[0][0] - 2
end_k = unmatched_sentences[-1][0] + 3
for k in range(start_k, end_k):
    print(e_td["sentences"][k]["text"])

Emily.
Shall I go on?
Mother.
Mother.
Peter.
Peter.
Take one.
Oh, the band's ready.
Well, we gotta do what the band says.
I don't care about the stupid band.
Well, it's getting late.
Here we go.
