In [1]:
import captions, task
import json, spacy
from itertools import accumulate
from spacy import displacy, Docs
nlp = spacy.load("en_core_web_sm")

In [2]:
stt_task = task.get_tasks()[0]
stt_json_path = f'{task.get_task_path(stt_task)}/{task.query(stt_task)[1]}'
stt = {}
with open(stt_json_path) as f_stt:
    stt = json.loads(f_stt.read())

In [3]:
[i['DisplayText'] for i in stt[:5]]

['Much more thank you very much for joining us today.',
 "That's good to be here. This is a broadcast first of course, the first interview with the serving chief of MI 6 and it wasn't that long ago that your name was classified let alone where you actually work that was also a state secret, so I suppose my first question has to be.",
 'What are you doing here in a radio studio doing an interview?',
 "Well, it's a good question, and the secret in the title of Secret Intelligence Service is going to remain. There are clearly. Some some things that II won't be able to discuss but over the last 10 years or so my predecessors have gradually tiptoed into into the light a bit and the reason they've done it and why I'm keen to.",
 "Carry on with it and develop it a bit by coming on to the radio with you today is because I think it's important that people understand the role that we play in so far as we can talk about it in keeping people safe on the streets of the UK and in promoting our inter

In [4]:
stt[1]['NBest'][0]['Words']

[{'Word': "that's", 'Offset': 33500000, 'Duration': 2700000},
 {'Word': 'good', 'Offset': 36300000, 'Duration': 1400000},
 {'Word': 'to', 'Offset': 37800000, 'Duration': 800000},
 {'Word': 'be', 'Offset': 38700000, 'Duration': 1500000},
 {'Word': 'here', 'Offset': 40300000, 'Duration': 2300000},
 {'Word': 'this', 'Offset': 43900000, 'Duration': 2700000},
 {'Word': 'is', 'Offset': 46700000, 'Duration': 1100000},
 {'Word': 'a', 'Offset': 47900000, 'Duration': 2100000},
 {'Word': 'broadcast', 'Offset': 50300000, 'Duration': 6200000},
 {'Word': 'first', 'Offset': 56600000, 'Duration': 3000000},
 {'Word': 'of', 'Offset': 59700000, 'Duration': 700000},
 {'Word': 'course', 'Offset': 60500000, 'Duration': 2700000},
 {'Word': 'the', 'Offset': 63300000, 'Duration': 800000},
 {'Word': 'first', 'Offset': 64200000, 'Duration': 2100000},
 {'Word': 'interview', 'Offset': 66400000, 'Duration': 2700000},
 {'Word': 'with', 'Offset': 69200000, 'Duration': 1000000},
 {'Word': 'the', 'Offset': 70300000, 'D

In [5]:
def get_word_at_charidx(words: list[dict], charidx: str) -> str:
    cum_lens = [0] + list(accumulate([len(word['Word']) + 1 for word in words[:-1]] + [len(words[-1])]))
    word = [word for idx, word in enumerate(words) if cum_lens[idx] <= charidx < cum_lens[idx+1]]
    return word

In [6]:
def get_sentences_with_offsets(stt: dict):
    docs_words = [(nlp(d['DisplayText']), d['NBest'][0]['Words'] if 'Words' in d['NBest'][0] else []) for d in stt]
    sents_words = [(s, dw[1]) for dw in docs_words for s in list(dw[0].sents)]
    return [[(t, get_word_at_charidx(sw[1], t.idx)) for t in sw[0]] for sw in sents_words]

In [7]:
sents = get_sentences_with_offsets(stt)

In [8]:
def sent_to_frags(sents: list):
    for s in sents:
        frag = []
        for idx, (token, offset) in enumerate(s):
            frag.append((token, offset))
            split = False
            if token.dep_ is 'punct':
                split = True
            elif idx != len(s) - 1 and s[idx+1][0].pos_ in ['CCONJ', 'SCONJ']:
                split = True
            elif idx != len(s) - 1 and s[idx+1][0].pos_ in ['AUX', 'PART', 'PROPN', 'PRON'] and len(frag) > 6:
                split = True
            elif idx != len(s) - 1 and s[idx+1][0].pos_ in ['ADP'] and len(frag) > 10:
                split = True
            if token.dep_ is 'det' or token.pos_ is 'DET':
                split = False
            if split:
                yield frag
                frag = []
        if len(frag):
            yield frag


In [9]:
def frags_to_clauses(frags: list):
    MIN_LEN = 5
    MAX_LEN = 16
    completed_loop = False
    while not completed_loop:
        for idx, f in enumerate(frags):
            completed_loop = idx == len(frags) - 1
            if len(frags[idx]) < MIN_LEN:
                frag_text = [t.text for (t,o) in frags[idx]]
                frag_len = len(frags[idx])
                punct_in_prev = any([t.dep_ is 'punct' for (t,o) in frags[idx-1]]) if idx > 0 else True
                frag_len_prev = len(frags[idx-1][0]) if idx > 0 else MAX_LEN + 1
                frag_len_next = len(frags[idx+1][0]) if idx < len(frags) -1 else MAX_LEN + 1
                prev_available = idx > 0 and (frag_len_prev + frag_len <= MAX_LEN) and not punct_in_prev
                next_available = frag_len + frag_len_next <= MAX_LEN and '.' not in frag_text
                if idx is 0 and frags[idx][-1][0].dep_ is 'punct':
                    pass
                elif prev_available:
                    frags[idx-1] = frags[idx-1] + frags[idx]
                    del frags[idx]
                    break
                elif next_available:
                    frags[idx+1] = frags[idx] + frags[idx+1]
                    del frags[idx]
                    break  
    return frags



In [10]:
def clauses_to_captions(clauses: list):
    for c in clauses:
        caption_nlp = nlp([t for (t,o) in c])
        caption_text = caption_nlp.text
        caption_start = c[0][1]['Offset']
        caption_end = c[-1][1]['Offset'] + c[-1][1]['Duration']
        yield {
            'text': caption_text,
            'start': caption_start,
            'end': caption_end
        }

In [11]:
frags = list(sent_to_frags(sents))
[(idx, ' '.join([token.text for (token, offset) in f])) for idx, f in enumerate(frags)]

[(0, 'Much more thank you very much for joining'),
 (1, 'us today .'),
 (2, "That 's good to be here ."),
 (3, 'This is a broadcast first of course ,'),
 (4, 'the first interview with the serving chief of'),
 (5, 'MI 6'),
 (6, "and it was n't that long ago"),
 (7, 'that your name was classified let alone'),
 (8, 'where you actually work that was also a state secret ,'),
 (9, 'so I suppose my first question has'),
 (10, 'to be .'),
 (11, 'What are you doing here in a radio studio doing an interview ?'),
 (12, 'Well ,'),
 (13, "it 's a good question ,"),
 (14, 'and the secret in the title of'),
 (15, 'Secret Intelligence Service is going to remain .'),
 (16, 'There are clearly .'),
 (17, "Some some things that II wo n't"),
 (18, 'be able to discuss'),
 (19, 'but over the last 10 years'),
 (20,
  'or so my predecessors have gradually tiptoed into into the light a bit'),
 (21, "and the reason they 've done it"),
 (22, 'and'),
 (23, "why I 'm keen to ."),
 (24, 'Carry on with it'),
 (25, 'a

In [12]:
frags[1]

[(us, [{'Word': 'us', 'Offset': 17000000, 'Duration': 3700000}]),
 (today, [{'Word': 'today', 'Offset': 21000000, 'Duration': 5400000}]),
 (., [])]

In [13]:
clauses = frags_to_clauses(frags)
[(idx, ' '.join([token.text for (token, offset) in f])) for idx, f in enumerate(clauses)]

[(0, 'Much more thank you very much for joining us today .'),
 (1, "That 's good to be here ."),
 (2, 'This is a broadcast first of course ,'),
 (3, 'the first interview with the serving chief of MI 6'),
 (4, "and it was n't that long ago"),
 (5, 'that your name was classified let alone'),
 (6, 'where you actually work that was also a state secret ,'),
 (7, 'so I suppose my first question has to be .'),
 (8, 'What are you doing here in a radio studio doing an interview ?'),
 (9, "Well , it 's a good question ,"),
 (10, 'and the secret in the title of'),
 (11, 'Secret Intelligence Service is going to remain .'),
 (12, 'There are clearly .'),
 (13, "Some some things that II wo n't be able to discuss"),
 (14, 'but over the last 10 years'),
 (15,
  'or so my predecessors have gradually tiptoed into into the light a bit'),
 (16, "and the reason they 've done it and"),
 (17, "why I 'm keen to ."),
 (18,
  'Carry on with it and develop it a bit by coming on to the radio with you today is'),
 

In [14]:
captions = clauses_to_captions(clauses)
list(captions)

ValueError: [E866] Expected a string or 'Doc' as input, but got: <class 'list'>.

In [None]:
clause = ' '.join([t.text for (t,o) in frags[17] + frags[18]])
clause_doc = nlp(clause)
displacy.serve(clause_doc, style='dep', options = { 'compact': True, 'jupyter': True })# 


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [None]:
long_clauses = [f for f in frags if len(f) > 14]
len(long_clauses)

1

In [None]:
clause = ' '.join([t.text for (t,o) in long_clauses[0]])
clause_doc = nlp(clause)
displacy.serve(clause_doc, style='dep', options = { 'compact': True, 'jupyter': True })# 


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [None]:
clause = ' '.join([t.text for (t,o) in long_clauses[0]])
clause_doc = nlp(clause)
displacy.serve(clause_doc, style='dep', options = { 'compact': True, 'jupyter': True })# 


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [None]:
clause = ' '.join([t.text for (t,o) in long_clauses[0]])
clause_doc = nlp(clause)
displacy.serve(clause_doc, style='dep', options = { 'compact': True, 'jupyter': True })# 


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
