In [61]:
import pysubs2
import spacy
import pandas as pd
import numpy as np
import os.path
import pycrfsuite

from collections import Counter

from spacy.tokens import Doc # to create empty Doc
from spacy.vocab import Vocab # to create empty Vocab

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
nlp_en = spacy.load('en')
nlp_fr = spacy.load('fr')

In [3]:
def load_data():
    if os.path.exists('bcbc_fr_parsed') and \
        os.path.exists('bcbc_en_parsed.bin'):
        return Doc(Vocab()).from_disk('bcbc_fr_parsed.bin'), \
                    Doc(Vocab()).from_disk('bcbc_en_parsed.bin')
    else:  
        subs_french = pysubs2.load('bon_french/Bon.Cop.Bad.Cop.2006.720p.BluRay.H264.AAC-RARBG.srt',
                            encoding='iso-8859-1')
        subs_eng = pysubs2.load('bon_eng/Bon.Cop.Bad.Cop.2006.720p.BluRay.x264-.YTS.AG.srt')
        french_subs = " ".join(line.text for line in subs_french)
        french_parsed = nlp_fr(french_subs)

        eng_subs = " ".join(line.text for line in subs_eng)
        eng_parsed = nlp_en(eng_subs)

        french_parsed.to_disk('bcbc_fr_parsed.bin')
        eng_parsed.to_disk('bcbc_en_parsed.bin')

        return french_subs, french_parsed, eng_subs, eng_parsed

In [4]:
french_subs, french_parsed, eng_subs, eng_parsed = load_data()

In [5]:
[x.tag_ for x in french_parsed]
# TODO: Get tokens from each script and align them somehow
#       Maybe they don't have to be aligned, just use indexing to get
#       "corresponding" token from other script
#       Keep track of current script and switch according to prob_matrix 
#       below, choose one monolingual script for starters

['PUNCT___',
 'PRON__Number=Sing|Person=3',
 'VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'ADP___',
 'DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Art',
 'ADJ__Number=Sing',
 'ADV___',
 'PRON__Number=Sing|Person=3',
 'VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'ADP___',
 'PROPN__Gender=Masc|Number=Sing',
 'PUNCT___',
 'PROPN__Gender=Masc|Number=Sing',
 'PUNCT___',
 'NOUN__Gender=Masc|Number=Sing',
 'PROPN___',
 'PUNCT___',
 'PROPN___',
 'PUNCT___',
 'ADV__PronType=Int',
 'PRON__Gender=Masc|Number=Sing|Person=3|PronType=Dem',
 'VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'PUNCT___',
 'PUNCT___',
 'NOUN__Number=Plur',
 'PUNCT___',
 'ADV__Polarity=Neg',
 'ADJ__Number=Sing',
 'PUNCT___',
 'ADV___',
 'ADV__Polarity=Neg',
 'ADJ__Number=Sing',
 'PUNCT___',
 'ADV__Polarity=Neg',
 'ADJ__Number=Sing',
 'PUNCT___',
 'ADV___',
 'PUNCT___',
 'PUNCT___',
 'PRON__PronType=Int',
 'PRON__Number=Sing|Person=3',
 'AUX__Mood=Ind|Number=Sing|P

In [6]:
bcbc_gs = pd.read_excel('BCBC_GS_annotator1.xlsx')

    eng frn
eng
frn 

In [7]:
bcbc_gs = bcbc_gs[bcbc_gs['Listened Language'] != 'French/English']
bcbc_gs = bcbc_gs[bcbc_gs['Listened Language'] != "None"]
bcbc_gs = bcbc_gs[bcbc_gs['Listened Language'].notnull()]
bcbc_gs['Listened Language'].value_counts()

French     7650
English    5803
Name: Listened Language, dtype: int64

In [8]:
def prob_matrix(tokens, langs):
    """
    tokens: a pandas series containing all the tokens
    """
    
    switches = {lang: {}  for lang in langs}
    counts = Counter(zip(tokens, tokens[1:]))

    for (x, y), c in counts.items():
        switches[x][y] = c / float(len(tokens) - 1)
        
    return switches

In [9]:
prob = prob_matrix(bcbc_gs['Listened Language'], \
                             set(bcbc_gs['Listened Language']))
prob

{'French': {'French': 0.5489146595301814, 'English': 0.01977401129943503},
 'English': {'French': 0.019699672911091287, 'English': 0.4116116562592923}}

In [10]:
fr_subs = french_subs.split(" ")
en_subs = eng_subs.split(" ")
len(en_subs), len(fr_subs)


(9589, 10225)

In [11]:
def lang_switcher(corpora, langs, start_lang):
    last_lang = start_lang
    last_index = langs.index(start_lang)
    cs_script = cs_script = corpora[last_lang][0] + " "
    
    for index, word in enumerate(en_subs):
        r = np.random.uniform(low=0.0, high=1.0)
        last_lang = langs[last_index]
    
        # Check if we switch
        if r > prob[last_lang][last_lang]:
            # Switch and get next word from other corpus
            last_index = last_index ^ 1
            last_lang = langs[last_index]
        
        cs_script += corpora[last_lang][index] + " "

    return cs_script

In [12]:
corpora = {"English": en_subs, "French": fr_subs}
langs = ["English", "French"]

eng_cs = lang_switcher(corpora, langs, "English")
fr_cs = lang_switcher(corpora, langs, "French")

# TODO: Pretty print these to remove all the slashes since it looks terrible
#       Maybe separate into sentences?
#       Try/catch block for last line since corpora are different lengths
#       Find itertools function or something instead of using XOR for index

In [13]:
targets = []

for lang in bcbc_gs["Listened Language"]:
    if lang == "French":
        targets.append(0)
    elif lang == "English":
        targets.append(1)
    else:
        targets.append(0)

In [14]:
sentences = []
targets = []

sent = []
sent_targets = []

for lang, token in zip(bcbc_gs["Listened Language"], bcbc_gs["Token"]):
    sent.append(token)
    
    if lang == "French":
        sent_targets.append(0)
    elif lang == "English":
        sent_targets.append(1)
    else:
        sent_targets.append(0)
        
    if token in (".", "?", "!"):
        sentences.append(sent)
        targets.append(sent_targets)
        sent = []
        sent_targets = []
        


In [15]:
import pycrfsuite

In [16]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [17]:
text = " ".join([str(tok) for tok in bcbc_gs["Token"]])

In [18]:
mapped_targets = [{w: t for w, t in zip(sentence, target)} for sentence, target in zip(sentences, targets)]

## Create inputs and outputs
- Get the pos tag for each word and make sure that each word has a target
- This might not be the best way of handling this 
- The current

In [19]:
y = []
x = []

for text, targets in zip(sentences, mapped_targets):
    _y = []
    sentence = []
    
    for word in nlp_fr(" ".join(list(map(lambda w: str(w), text)))):
        sentence.append((word.text, word.pos_))
        
        if word.text in targets:
            _y.append(targets[word.text])
        else:
            _y.append(0)        
    x.append(sentence)
    y.append(_y)

In [29]:
X = [sent2features(sent) for sent in x]
y = list(map(lambda _y: list(map(lambda x: str(x), _y)), y))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [33]:
%%time

trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 47.3 ms, sys: 2.27 ms, total: 49.5 ms
Wall time: 48.3 ms


In [34]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [35]:
%%time
trainer.train('bon_cop_bad_cop.crfsuite')

CPU times: user 115 ms, sys: 3.52 ms, total: 118 ms
Wall time: 117 ms


In [36]:
tagger = pycrfsuite.Tagger()
tagger.open('bon_cop_bad_cop.crfsuite')

<contextlib.closing at 0x10d6b3748>

## Test CRF

In [48]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 6.25 ms, sys: 240 µs, total: 6.49 ms
Wall time: 6.33 ms


In [88]:
mb = MultiLabelBinarizer()

In [89]:
mb.fit(y_test)

MultiLabelBinarizer(classes=None, sparse_output=False)

- Sklearn no longer lets you pass sequences of sequences so I had to binarize the test and preds

In [90]:
y_test = mb.transform(y_test)

In [91]:
y_pred = mb.transform(y_pred)

In [92]:
print(accuracy_score(y_test, y_pred))

0.9012345679012346


In [93]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
1      -> 1       1.242809
0      -> 0       1.046062
0      -> 1       0.227935
1      -> 0       -2.300002

Top unlikely transitions:
1      -> 1       1.242809
0      -> 0       1.046062
0      -> 1       0.227935
1      -> 0       -2.300002


In [94]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
8.888469 0      0
7.565034 0      t
7.343098 1      lf
7.209582 0      s
6.443917 1      We
5.935603 1      I
5.308081 0      ve
5.291185 1      He
5.255368 0      re
5.181789 1      ”
5.022645 1      you
4.833435 0      I'
4.223305 1      Mr
4.216633 1      You
4.204534 0      m
3.979353 0      don'
3.935316 1      here
3.774906 1      So
3.748283 0      -
3.728382 1      to

Top negative:
-0.697433 1      PUNCT
-0.726052 1      en
-0.785143 0      my
-0.815485 0      not
-0.821758 0      PROPN
-0.901163 0      out
-0.962907 0      it
-0.973459 0      X
-1.031253 1      PRON
-1.224747 0      on
-1.293551 1      DET
-1.310123 0      Fuck
-1.482677 0      me
-1.538801 0      Yeah
-1.596445 0      Oh
-1.635401 0      him
-1.660978 0      Hey
-1.715645 0      Okay
-1.839059 0      ...
-2.309248 0      up
