In [17]:
import pysubs2
import spacy
import pandas as pd
import numpy as np
import os.path

from spacy.tokens import Doc # to create empty Doc
from spacy.vocab import Vocab # to create empty Vocab


In [3]:
nlp_en = spacy.load('en')
nlp_fr = spacy.load('fr')

In [6]:
def load_data():
    if os.path.exists('bcbc_fr_parsed') and\
        os.path.exists('bcbc_en_parsed.bin'):
        return Doc(Vocab()).from_disk('bcbc_fr_parsed.bin'), \
                    Doc(Vocab()).from_disk('bcbc_en_parsed.bin')
    else:  
        subs_french = pysubs2.load('bon_french/Bon.Cop.Bad.Cop.2006.720p.BluRay.H264.AAC-RARBG.srt',
                            encoding='iso-8859-1')
        subs_eng = pysubs2.load('bon_eng/Bon.Cop.Bad.Cop.2006.720p.BluRay.x264-.YTS.AG.srt')
        french_subs = " ".join([line.text for line in subs_french])
        french_parsed = nlp_fr(french_subs)

        eng_subs = " ".join([line.text for line in subs_eng])
        eng_parsed = nlp_en(eng_subs)

        french_parsed.to_disk('bcbc_fr_parsed.bin')
        eng_parsed.to_disk('bcbc_en_parsed.bin')

        return french_parsed, eng_parsed

In [7]:
french_parsed, eng_parsed = load_data()

In [15]:
[x.tag_ for x in french_parsed]

['PUNCT___',
 'PRON__Number=Sing|Person=3',
 'VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'ADP___',
 'DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Art',
 'ADJ__Number=Sing',
 'ADV___',
 'PRON__Number=Sing|Person=3',
 'VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'ADP___',
 'PROPN__Gender=Masc|Number=Sing',
 'PUNCT___',
 'PROPN__Gender=Masc|Number=Sing',
 'PUNCT___',
 'NOUN__Gender=Masc|Number=Sing',
 'PROPN___',
 'PUNCT___',
 'PROPN___',
 'PUNCT___',
 'ADV__PronType=Int',
 'PRON__Gender=Masc|Number=Sing|Person=3|PronType=Dem',
 'VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'PUNCT___',
 'PUNCT___',
 'NOUN__Number=Plur',
 'PUNCT___',
 'ADV__Polarity=Neg',
 'ADJ__Number=Sing',
 'PUNCT___',
 'ADV___',
 'ADV__Polarity=Neg',
 'ADJ__Number=Sing',
 'PUNCT___',
 'ADV__Polarity=Neg',
 'ADJ__Number=Sing',
 'PUNCT___',
 'ADV___',
 'PUNCT___',
 'PUNCT___',
 'PRON__PronType=Int',
 'PRON__Number=Sing|Person=3',
 'AUX__Mood=Ind|Number=Sing|P

In [25]:
bcbc_gs = pd.read_excel('BCBC_GS_annotator1.xlsx')

    eng frn
eng
frn 

In [92]:
bcbc_gs['Listened Language'].value_counts()

French            7650
English           5802
None                36
French/English      11
E                    1
Name: Listened Language, dtype: int64

In [89]:
def calculate_prob_matrix(tokens):
    """
    tokens: a pandas series containing all the tokens
    """
    
    prob = np.zeros((2, 2))
    
    for i in range(len(tokens) - 1 ):
        if tokens.iloc[i] == "English":
            if tokens.iloc[i + 1] == "English":
                prob[0, 0] += 1
            elif  tokens.iloc[i] == "French":
                prob[0, 1] += 1
        elif tokens.iloc[i] == "French":
            if tokens.iloc[i + 1] == "English":
                prob[1, 0] += 1
            elif  tokens.iloc[i] == "French":
                prob[1, 1] += 1            
        else:
            print(tokens.iloc[i])
            raise ValueError("There can only be two possiblities!")
        
        prob /= (len(tokens) - 1)
        
    return prob

In [90]:
prob = calculate_prob_matrix(bcbc_gs['Listened Language'])

French/English


ValueError: There can only be two possiblities!