In [169]:
import pysubs2
import spacy
import pandas as pd
import numpy as np
import os.path
import pycrfsuite

from collections import Counter

from spacy.tokens import Doc # to create empty Doc
from spacy.vocab import Vocab # to create empty Vocab

from sklearn.metrics import accuracy_score, f1_score, classification_report, recall_score, precision_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MultiLabelBinarizer


In [11]:
nlp_en = spacy.load('en')
nlp_fr = spacy.load('fr')

In [12]:
def load_data():
    if os.path.exists('bcbc_fr_parsed') and \
        os.path.exists('bcbc_en_parsed.bin'):
        return Doc(Vocab()).from_disk('bcbc_fr_parsed.bin'), \
                    Doc(Vocab()).from_disk('bcbc_en_parsed.bin')
    else:  
        subs_french = pysubs2.load('bon_french/Bon.Cop.Bad.Cop.2006.720p.BluRay.H264.AAC-RARBG.srt',
                            encoding='iso-8859-1')
        subs_eng = pysubs2.load('bon_eng/Bon.Cop.Bad.Cop.2006.720p.BluRay.x264-.YTS.AG.srt')
        french_subs = " ".join(line.text for line in subs_french)
        french_parsed = nlp_fr(french_subs)

        eng_subs = " ".join(line.text for line in subs_eng)
        eng_parsed = nlp_en(eng_subs)

        french_parsed.to_disk('bcbc_fr_parsed.bin')
        eng_parsed.to_disk('bcbc_en_parsed.bin')

        return french_subs, french_parsed, eng_subs, eng_parsed

In [13]:
french_subs, french_parsed, eng_subs, eng_parsed = load_data()

In [14]:
[x.tag_ for x in french_parsed]
# TODO: Get tokens from each script and align them somehow
#       Maybe they don't have to be aligned, just use indexing to get
#       "corresponding" token from other script
#       Keep track of current script and switch according to prob_matrix 
#       below, choose one monolingual script for starters

['PUNCT___',
 'PRON__Number=Sing|Person=3',
 'VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'ADP___',
 'DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Art',
 'ADJ__Number=Sing',
 'ADV___',
 'PRON__Number=Sing|Person=3',
 'VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'ADP___',
 'PROPN__Gender=Masc|Number=Sing',
 'PUNCT___',
 'PROPN__Gender=Masc|Number=Sing',
 'PUNCT___',
 'NOUN__Gender=Masc|Number=Sing',
 'PROPN___',
 'PUNCT___',
 'PROPN___',
 'PUNCT___',
 'ADV__PronType=Int',
 'PRON__Gender=Masc|Number=Sing|Person=3|PronType=Dem',
 'VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'PUNCT___',
 'PUNCT___',
 'NOUN__Number=Plur',
 'PUNCT___',
 'ADV__Polarity=Neg',
 'ADJ__Number=Sing',
 'PUNCT___',
 'ADV___',
 'ADV__Polarity=Neg',
 'ADJ__Number=Sing',
 'PUNCT___',
 'ADV__Polarity=Neg',
 'ADJ__Number=Sing',
 'PUNCT___',
 'ADV___',
 'PUNCT___',
 'PUNCT___',
 'PRON__PronType=Int',
 'PRON__Number=Sing|Person=3',
 'AUX__Mood=Ind|Number=Sing|P

In [15]:
bcbc_gs = pd.read_excel('BCBC_GS_annotator1.xlsx')

    eng frn
eng
frn 

In [16]:
bcbc_gs = bcbc_gs[bcbc_gs['Listened Language'] != 'French/English']
bcbc_gs = bcbc_gs[bcbc_gs['Listened Language'] != "None"]
bcbc_gs = bcbc_gs[bcbc_gs['Listened Language'].notnull()]
bcbc_gs['Listened Language'].value_counts()

French     7650
English    5803
Name: Listened Language, dtype: int64

In [17]:
def prob_matrix(tokens, langs):
    """
    tokens: a pandas series containing all the tokens
    """
    
    switches = {lang: {}  for lang in langs}
    counts = Counter(zip(tokens, tokens[1:]))

    for (x, y), c in counts.items():
        switches[x][y] = c / float(len(tokens) - 1)
        
    return switches

In [18]:
prob = prob_matrix(bcbc_gs['Listened Language'], \
                             set(bcbc_gs['Listened Language']))
prob

{'French': {'French': 0.5489146595301814, 'English': 0.01977401129943503},
 'English': {'French': 0.019699672911091287, 'English': 0.4116116562592923}}

In [19]:
fr_subs = french_subs.split(" ")
en_subs = eng_subs.split(" ")
len(en_subs), len(fr_subs)


(9589, 10225)

In [28]:
def lang_switcher(corpora, langs, start_lang):
    last_lang = start_lang
    last_index = langs.index(start_lang)
    cs_script = cs_script = corpora[last_lang][0] + " "
    
    for index, word in enumerate(en_subs):
        r = np.random.uniform(low=0.0, high=1.0)
        last_lang = langs[last_index]
    
        # Check if we switch
        if r > prob[last_lang][last_lang]:
            # Switch and get next word from other corpus
            last_index = last_index ^ 1
            last_lang = langs[last_index]
        
        cs_script += corpora[last_lang][index] + " "

    return cs_script

In [29]:
corpora = {"English": en_subs, "French": fr_subs}
langs = ["English", "French"]

eng_cs = lang_switcher(corpora, langs, "English")
fr_cs = lang_switcher(corpora, langs, "French")

# TODO: Pretty print these to remove all the slashes since it looks terrible
#       Maybe separate into sentences?
#       Try/catch block for last line since corpora are different lengths
#       Find itertools function or something instead of using XOR for index

In [30]:
targets = []

for lang in bcbc_gs["Listened Language"]:
    if lang == "French":
        targets.append(0)
    elif lang == "English":
        targets.append(1)
    else:
        targets.append(0)

In [43]:
sentences = []
targets = []

sent = []
sent_targets = []

for lang, token in zip(bcbc_gs["Listened Language"], bcbc_gs["Token"]):
    sent.append(token)
    
    if lang == "French":
        sent_targets.append(0)
    elif lang == "English":
        sent_targets.append(1)
    else:
        sent_targets.append(0)
        
    if token in (".", "?", "!"):
        sentences.append(sent)
        targets.append(sent_targets)
        sent = []
        sent_targets = []
        


In [44]:
import pycrfsuite

In [45]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [46]:
text = " ".join([str(tok) for tok in bcbc_gs["Token"]])

In [47]:
mapped_targets = [{w: t for w, t in zip(sentence, target)} for sentence, target in zip(sentences, targets)]

## Create inputs and outputs
- Get the pos tag for each word and make sure that each word has a target
- This might not be the best way of handling this 
- The current

In [125]:
y = []
x = []

for text, targets in zip(sentences, mapped_targets):
    _y = []
    sentence = []
    
    for word in nlp_fr(" ".join(list(map(lambda w: str(w), text)))):
        sentence.append((word.text, word.pos_))
        
        if word.text in targets:
            _y.append(targets[word.text])
        else:
            _y.append(0)        
    x.append(sentence)
    y.append(_y)

In [172]:
X = [sent2features(sent) for sent in x]
y = list(map(lambda _y: list(map(lambda x: str(x), _y)), y))

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [196]:
kf = KFold(n_splits=5)

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print("Fold : {} ".format(i + 1))
    X = np.array(X)
    y = np.array(y)
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = X[train_index], y[test_index]
    X_train = [X[i] for i in train_index]
    y_train = [y[i] for i in train_index]

    X_test = [X[i] for i in test_index]

    y_test = [y[i] for i in test_index]


    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)
        
    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    
    %%time
    trainer.train('bon_cop_bad_cop.crfsuite')
    
    tagger = pycrfsuite.Tagger()
    tagger.open('bon_cop_bad_cop.crfsuite')
    
    %%time
    y_pred = [tagger.tag(xseq) for xseq in X_test]
    
    y_test_flat = list(map(lambda x: int(x), itertools.chain.from_iterable(y_test)))
    y_pred_flat = list(map(lambda x: int(x), itertools.chain.from_iterable(y_pred)))
    
    print(classification_report(y_test_flat, y_pred_flat))
    print(f1_score(y_test_flat, y_pred_flat))
    print(recall_score(y_test_flat, y_pred_flat))
    print(precision_score(y_test_flat, y_pred_flat))

Fold : 1 
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 7.15 µs
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.72 µs
             precision    recall  f1-score   support

          0       0.97      0.99      0.98      2473
          1       0.95      0.91      0.93       800

avg / total       0.97      0.97      0.97      3273

0.9302623160588612
0.90875
0.9528178243774574
Fold : 2 
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs
CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 12.2 µs
             precision    recall  f1-score   support

          0       0.99      0.98      0.98      2674
          1       0.85      0.92      0.88       358

avg / total       0.97      0.97      0.97      3032

0.8844086021505375
0.9189944134078212
0.8523316062176166
Fold : 3 
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs
CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.82 µs
             precision    recall  f1-score   

In [201]:
for x, y in zip(X_test, y_pred):
    for x_sent, y_sent in zip(x, y):
        print(x_sent[1], y_sent)

word.lower=le 0
word.lower=gars 0
word.lower=.. 0
word.lower=0 0
word.lower=c 0
word.lower=' 0
word.lower=est 0
word.lower=une 0
word.lower=fille 0
word.lower=0 0
word.lower=j 0
word.lower=' 0
word.lower=avais 0
word.lower=pas 0
word.lower=le 0
word.lower=match 0
word.lower=d 0
word.lower=' 0
word.lower=adn 0
word.lower=, 0
word.lower=mais 0
word.lower=j 0
word.lower=' 0
word.lower=ai 0
word.lower=reçu 0
word.lower=son 0
word.lower=dossier 0
word.lower=dentaire 0
word.lower=, 0
word.lower=puis 0
word.lower=c 0
word.lower=' 0
word.lower=est 0
word.lower=ça 0
word.lower=0 0
word.lower=je 0
word.lower=lui 0
word.lower=ai 0
word.lower=donné 0
word.lower=une 0
word.lower=brosse 0
word.lower=à 0
word.lower=dents 0
word.lower=puis 0
word.lower=une 0
word.lower=surprise 0
word.lower=0 0
word.lower=0 0
word.lower=elle 0
word.lower=est 0
word.lower=super 0
word.lower=contente 0
word.lower=0 0
word.lower=0 0
word.lower=jeff 0
word.lower=, 0
word.lower=c 0
word.lower=' 0
word.lower=est 0
word.lowe

word.lower=you 1
word.lower=? 1
word.lower=0 0
word.lower=buttman 1
word.lower=? 1
word.lower=0 0
word.lower=what' 0
word.lower=s 0
word.lower=going 1
word.lower=on 1
word.lower=? 1
word.lower=0 0
word.lower=what 1
word.lower=the 1
word.lower=hell' 0
word.lower=s 0
word.lower=going 1
word.lower=on 1
word.lower=there 1
word.lower=? 1
word.lower=! 0
word.lower=0 0
word.lower=get 1
word.lower=in 1
word.lower=the 1
word.lower=bag 1
word.lower=0 0
word.lower=0 0
word.lower=buttman 1
word.lower=? 1
word.lower=dans 0
word.lower=la 0
word.lower=poche 0
word.lower=0 0
word.lower=comment 0
word.lower=ça 0
word.lower=marche 0
word.lower=.. 0
word.lower=0 0
word.lower=calice 0
word.lower=.. 0
word.lower=0 0
word.lower=0 0
word.lower=have 1
word.lower=a 1
word.lower=look 1
word.lower=at 1
word.lower=this 1
word.lower=0 0
word.lower=0 0
word.lower=are 1
word.lower=you 1
word.lower=talking 1
word.lower=to 1
word.lower=me 1
word.lower=? 1
word.lower=are 1
word.lower=you 1
word.lower=talking 1
word.low

word.lower=0 0
word.lower=touche 0
word.lower=pas 0
word.lower=à 0
word.lower=ma 0
word.lower=fille 0
word.lower=0 0
word.lower=papa 1
word.lower=! 1
word.lower=dans 0
word.lower=minutes 0
word.lower=, 0
word.lower=ça 0
word.lower=saute 0
word.lower=! 0
word.lower=0 0
word.lower=martin 0
word.lower=, 0
word.lower=move 0
word.lower=! 0
word.lower=0 0
word.lower=we' 0
word.lower=re 0
word.lower=here 1
word.lower=! 1
word.lower=we' 0
word.lower=re 0
word.lower=here 1
word.lower=! 1
word.lower=donnes 1
word.lower=-y 1
word.lower=0 0
word.lower=il 0
word.lower=y 0
word.lower=a 0
word.lower=une 0
word.lower=bombe 0
word.lower=sur 0
word.lower=gabrielle 0
word.lower=0 0
word.lower=0 0
word.lower=walk 1
word.lower=over 1
word.lower=there 1
word.lower=0 0
word.lower=walk 1
word.lower=over 1
word.lower=there 1
word.lower=! 1
word.lower=move 0
word.lower=! 0
word.lower=ok 0
word.lower=, 0
word.lower=à 0
word.lower=c 0
word.lower=' 0
word.lower=t' 0
word.lower=heure 0
word.lower=, 0
word.lower=tu 

In [202]:
y_preda

NameError: name 'y_preda' is not defined

In [130]:
%%time

trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 155 ms, sys: 3.11 ms, total: 158 ms
Wall time: 157 ms


In [131]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [132]:
%%time
trainer.train('bon_cop_bad_cop.crfsuite')

CPU times: user 264 ms, sys: 8.29 ms, total: 272 ms
Wall time: 277 ms


In [133]:
tagger = pycrfsuite.Tagger()
tagger.open('bon_cop_bad_cop.crfsuite')

<contextlib.closing at 0x12d6f49e8>

## Test CRF

In [136]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 18.9 ms, sys: 688 µs, total: 19.6 ms
Wall time: 19.2 ms


In [155]:
import itertools
y_test_flat = list(map(lambda x: int(x), itertools.chain.from_iterable(y_test)))
y_pred_flat = list(map(lambda x: int(x), itertools.chain.from_iterable(y_pred)))

- Sklearn no longer lets you pass sequences of sequences so I had to binarize the test and preds

## Metrics

In [156]:
print(classification_report(y_test_flat, y_pred_flat))

             precision    recall  f1-score   support

          0       0.94      0.98      0.96      1984
          1       0.96      0.88      0.92       960

avg / total       0.95      0.95      0.95      2944



### Accuracy

In [157]:
print(accuracy_score(y_test_flat, y_pred_flat))

0.9473505434782609


### F1

In [162]:
print(f1_score(y_test_flat, y_pred_flat))

0.9155313351498637


### Recall

In [163]:
print(recall_score(y_test_flat, y_pred_flat))

0.875


### Precision

In [165]:
print(precision_score(y_test_flat, y_pred_flat))

0.96


In [167]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
0      -> 0       1.492219
1      -> 1       1.089254
0      -> 1       -0.994067
1      -> 0       -1.316379

Top unlikely transitions:
0      -> 0       1.492219
1      -> 1       1.089254
0      -> 1       -0.994067
1      -> 0       -1.316379


In [168]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
4.898535 0      word[-2:]=ue
4.180521 1      -1:word.lower=re
3.587072 1      word[-3:]=ing
3.355618 1      word.lower=we
3.315748 1      word.lower=you
3.031477 0      word[-3:]=ois
2.642684 1      word.lower=he
2.557469 0      +1:word.lower=t
2.401476 1      -1:word.lower=true
2.304134 0      word.lower=0
2.304134 0      word[-3:]=0
2.304134 0      word[-2:]=0
2.304134 0      word.isdigit=True
2.298590 1      -1:word.lower=s
2.296488 1      +1:word.lower=can'
2.292298 1      word[-3:]=ere
2.276572 1      -1:word.lower=ve
2.262047 1      word[-3:]=get
2.234258 1      -1:word.lower=ll
2.226562 0      +1:word.lower='

Top negative:
-0.835487 0      +1:word.lower=don'
-0.852967 0      word[-2:]=ey
-0.860526 1      +1:word.lower=i
-0.871512 1      word[-3:]=ais
-0.897704 0      word[-2:]=an
-0.970256 0      word.lower=no
-1.034177 0      +1:word.lower=he'
-1.039045 1      word.lower=way
-1.056343 0      word.lower=it
-1.065781 0      +1:word.lower=i'
-1.076716 1      +1:word