# Level A : get the most common lexical bundles

In [1]:
import re
from collections import Counter
import numpy as np

In [2]:
# Load the whole corpus file into memory as a single string.
with open('ocw.1m', mode='r', encoding='utf8') as corpus_file:
    text = corpus_file.read()

In [3]:
def words(text):
    """Return the lowercased runs of word characters in *text*, in order.

    Anything that is not a word character (spaces, punctuation,
    apostrophes, ...) acts as a separator and is discarded.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]

In [4]:
def four_gram(text):
    """Return every run of four consecutive words in *text*.

    The text is tokenized with ``words``; each run is returned as one
    space-joined string, in order of appearance. A text with fewer than
    four words yields an empty list.
    """
    tokens = words(text)
    return [' '.join(tokens[start:start + 4]) for start in range(len(tokens) - 3)]

In [5]:
# Count how many times each four-word sequence occurs in the corpus.
lexical_bundles = Counter(four_gram(text))

In [6]:
# Frequency threshold: 40 occurrences per million words, expressed as a ratio.
constraint = 40 / 1e6
# Corpus size in word tokens.
N = len(words(text))

# Bundles whose relative frequency is at or below the threshold get removed.
del_key = [
    bundle
    for bundle, freq in lexical_bundles.items()
    if freq / N <= constraint
]

In [7]:
# Delete every bundle collected in del_key from the counter.
for bundle in del_key:
    del lexical_bundles[bundle]

In [8]:
# Show the 20 most frequent bundles that remain, with their counts.
lexical_bundles.most_common(20)

[('i m going to', 586),
 ('we re going to', 486),
 ('you re going to', 204),
 ('it s going to', 200),
 ('s going to be', 186),
 ('the end of the', 167),
 ('is going to be', 162),
 ('i don t know', 150),
 ('they re going to', 137),
 ('00 00 00 professor', 124),
 ('at the end of', 117),
 ('end of transcript chapter', 116),
 ('of transcript chapter 1', 116),
 ('professor douglas w rae', 116),
 ('at the same time', 113),
 ('and you can see', 109),
 ('that s going to', 99),
 ('i think it s', 96),
 ('you can see that', 94),
 ('re going to be', 91)]

# Level B : get the better lexical bundles

In [9]:
# grammar: keep bundles whose part-of-speech pattern is listed in good_grammar

In [10]:
# Load the POS-annotated corpus, one entry per line.
# NOTE(review): the code that consumes `grammar` reads line i as words and
# line i+1 as the matching POS tags - confirm the file really alternates.
with open('ocw.pos.1m', 'r', encoding='utf8') as f:
    grammar = f.read().splitlines()

# Normalize each line to lowercase, space-separated word tokens and drop the
# `punct` token (presumably the PUNCT tag - confirm) so both kinds of line
# stay aligned once punctuation is stripped from the text.
# The token is removed as a WHOLE word: the previous substring replace also
# mangled real words, e.g. "the punctuation" -> "theuation".
grammar = [
    ' '.join(token for token in words(line) if token != 'punct')
    for line in grammar
]

In [11]:
# POS-tag sequences accepted for a four-word bundle; a bundle whose looked-up
# tag sequence is not in this list is discarded.
good_grammar = [
    'VERB VERB PART VERB',
    'ADP PRON VERB ADP',
    'DET NOUN ADP DET',
    'ADP DET NOUN ADP',
    'ADP NOUN ADP DET',
]

In [12]:
# Collect the bundles whose POS-tag pattern is not in good_grammar.
# For each bundle, the first corpus line containing it is located and the tag
# sequence at the same 4-gram position on the FOLLOWING line is looked up.
# NOTE(review): this relies on grammar[i + 1] holding the tags for grammar[i]
# with one tag per word - confirm against the ocw.pos.1m format.
del_key = []
for k in lexical_bundles:
    # Match on whole tokens only. A plain substring test also hit lines such
    # as 'is going to be' for the bundle 's going to be'; .index() then failed
    # and the bundle silently skipped the grammar check.
    pattern = re.compile(r'\b' + re.escape(k) + r'\b')
    first = next((i for i, s in enumerate(grammar) if pattern.search(s)), None)
    if first is None:
        # Bundle not found in the annotated corpus: leave it untouched.
        continue
    try:
        ind = four_gram(grammar[first]).index(k)
        part = four_gram(grammar[first + 1])[ind]
    except (ValueError, IndexError):
        # Bundle not locatable, or the tag line is missing/shorter than the
        # text line: keep the bundle, as before, rather than guess.
        continue
    if part.upper() not in good_grammar:
        del_key.append(k)

In [13]:
# Delete every bundle collected in del_key from the counter.
for bundle in del_key:
    del lexical_bundles[bundle]

In [14]:
# Show the 20 most frequent bundles that remain, with their counts.
lexical_bundles.most_common(20)

[('s going to be', 186),
 ('the end of the', 167),
 ('is going to be', 162),
 ('i don t know', 150),
 ('at the end of', 117),
 ('end of transcript chapter', 116),
 ('of transcript chapter 1', 116),
 ('re going to be', 91),
 ('you don t have', 89),
 ('the way in which', 88),
 ('don t want to', 87),
 ('don t have to', 83),
 ('if you look at', 80),
 ('i don t think', 65),
 ('the rest of the', 62),
 ('in the case of', 58),
 ('are going to be', 58),
 ('it s not a', 54),
 ('re going to have', 54),
 ('i don t want', 54)]

In [15]:
# idioms: remove bundles that are listed in oxford.4gram.txt

In [16]:
# Read the idiom file into a list of lines (line endings stripped).
with open('oxford.4gram.txt', mode='r', encoding='utf8') as idiom_file:
    raw_idioms = idiom_file.read()
idioms = raw_idioms.splitlines()

In [17]:
# Keep only the first tab-separated field of each idiom line
# (the whole line when it contains no tab).
idioms_list = [line.partition('\t')[0] for line in idioms]

In [18]:
# Collect the bundles that also appear in the idiom list.
# A set is built once so each membership test is O(1) instead of a scan of
# the whole list for every bundle.
idiom_set = set(idioms_list)
rm_key = [k for k in lexical_bundles if k in idiom_set]

In [19]:
# Delete every bundle collected in rm_key from the counter.
for bundle in rm_key:
    del lexical_bundles[bundle]

In [20]:
# Show the 20 most frequent bundles that remain, with their counts.
lexical_bundles.most_common(20)

[('s going to be', 186),
 ('is going to be', 162),
 ('i don t know', 150),
 ('end of transcript chapter', 116),
 ('of transcript chapter 1', 116),
 ('re going to be', 91),
 ('you don t have', 89),
 ('don t want to', 87),
 ('don t have to', 83),
 ('if you look at', 80),
 ('i don t think', 65),
 ('in the case of', 58),
 ('it s not a', 54),
 ('re going to have', 54),
 ('i don t want', 54),
 ('s one of the', 52),
 ('if you don t', 51),
 ('re going to get', 49),
 ('what s going to', 48),
 ('m going to do', 46)]

In [21]:
# distribution: keep bundles that appear in at least 5 of the ~100 equal-sized chunks of the corpus

In [22]:
# Split the corpus into consecutive chunks of N // 100 words each and join
# every chunk back into one space-separated string.
# When N is not a multiple of 100 there is one extra, shorter chunk at the end.
# max(1, ...) keeps range() from failing with a zero step when N < 100.
chunk_size = max(1, N // 100)
# Tokenize once; re-running words(text) for every chunk repeated the full
# tokenization about a hundred times.
tokens = words(text)
total_text = [' '.join(tokens[x:x + chunk_size]) for x in range(0, N, chunk_size)]

In [23]:
# Collect the bundles that occur in fewer than 5 of the corpus chunks.
del_key = []
for k in lexical_bundles:
    # Match on whole tokens only: a plain substring test would also count a
    # chunk containing 'is going to be' as a hit for 's going to be'.
    pattern = re.compile(r'\b' + re.escape(k) + r'\b')
    # Number of chunks in which the bundle appears at least once.
    count = sum(1 for t in total_text if pattern.search(t))
    if count < 5:
        del_key.append(k)

In [24]:
# Delete every bundle collected in del_key from the counter.
for bundle in del_key:
    del lexical_bundles[bundle]

In [25]:
# Show the 20 most frequent bundles that remain, with their counts.
lexical_bundles.most_common(20)

[('s going to be', 186),
 ('is going to be', 162),
 ('i don t know', 150),
 ('end of transcript chapter', 116),
 ('of transcript chapter 1', 116),
 ('re going to be', 91),
 ('you don t have', 89),
 ('don t want to', 87),
 ('don t have to', 83),
 ('if you look at', 80),
 ('i don t think', 65),
 ('in the case of', 58),
 ('it s not a', 54),
 ('re going to have', 54),
 ('i don t want', 54),
 ('s one of the', 52),
 ('if you don t', 51),
 ('re going to get', 49),
 ('what s going to', 48),
 ('m going to do', 46)]