In [11]:
import pysubs2
import spacy
import pandas as pd
import numpy as np
import os.path
from collections import Counter

from spacy.tokens import Doc # to create empty Doc
from spacy.vocab import Vocab # to create empty Vocab


In [2]:
# Load the spaCy pipelines registered under the names 'en' and 'fr'.
# load_data() calls these two objects to parse the joined subtitle text.
nlp_en = spacy.load('en')
nlp_fr = spacy.load('fr')

In [70]:
def load_data():
    """Load both subtitle files and their spaCy parses, caching the parses.

    The subtitle files are always read with pysubs2. The parsed Docs are
    read back from 'bcbc_fr_parsed.bin' / 'bcbc_en_parsed.bin' when both
    files exist; otherwise the subtitle lines are joined with single spaces,
    parsed with nlp_fr / nlp_en, and the resulting Docs are written to those
    two paths for the next run.

    Returns:
        A 4-tuple (subs_french, french_parsed, subs_eng, eng_parsed): the
        two loaded subtitle objects and the two parsed Docs. The same shape
        is returned whether or not the cache was used.
    """
    fr_cache = 'bcbc_fr_parsed.bin'
    en_cache = 'bcbc_en_parsed.bin'

    # The subtitle objects are part of the return value on every path, so
    # they are loaded up front rather than only on a cache miss.
    subs_french = pysubs2.load('bon_french/Bon.Cop.Bad.Cop.2006.720p.BluRay.H264.AAC-RARBG.srt',
                               encoding='iso-8859-1')
    subs_eng = pysubs2.load('bon_eng/Bon.Cop.Bad.Cop.2006.720p.BluRay.x264-.YTS.AG.srt')

    # Check the exact paths written by to_disk() below; the previous check
    # dropped the '.bin' suffix on the French file and so never matched.
    if os.path.exists(fr_cache) and os.path.exists(en_cache):
        french_parsed = Doc(Vocab()).from_disk(fr_cache)
        eng_parsed = Doc(Vocab()).from_disk(en_cache)
    else:
        french_text = " ".join(line.text for line in subs_french)
        french_parsed = nlp_fr(french_text)

        eng_text = " ".join(line.text for line in subs_eng)
        eng_parsed = nlp_en(eng_text)

        french_parsed.to_disk(fr_cache)
        eng_parsed.to_disk(en_cache)

    return subs_french, french_parsed, subs_eng, eng_parsed

In [67]:
# Expects load_data() to return four values. Note that french_subs / eng_subs
# receive the loaded pysubs2 subtitle objects (not the joined text), while
# french_parsed / eng_parsed receive the parsed spaCy Docs.
french_subs, french_parsed, eng_subs, eng_parsed = load_data()

In [71]:
# Collect the tag_ attribute of every token in the parsed French Doc.
# NOTE(review): the saved output under this cell shows subtitle lines rather
# than tags, so it looks stale -- re-run the cell to refresh it.
[x.tag_ for x in french_parsed]
# TODO: Get the tokens from each script and align them somehow.
#       Maybe they don't have to be aligned: just use indexing to get the
#       "corresponding" token from the other script.
#       Keep track of the current script and switch according to prob_matrix
#       below; choose one monolingual script for starters.

['- On passe à un autre appel\\Net on parle à Patrick.',
 'Patrick, bonsoir.\\N- Salut, Ron, comment ça va?',
 '- Ah, pas pire, pas pire,\\Npas pire, pas pire... Pas!',
 "- Que c'est tu penses de ça, toi,",
 'la rumeur que Toronto\\Nserait vendu à Houston?',
 '- Toronto à Houston?\\NOuais, non, non! Non, non, non, non!',
 "La vraie rumeur,\\Nc'est pas ça du tout, Patrick.",
 '- Bien, moi, mon beau-frère,\\Nil était considéré...',
 '- Beau-frère...',
 'Beau-frère, cousin, belle-soeur,',
 'le chauffeur de taxi,',
 'le barman, euh...',
 'Mononcle, fils à mononcle,\\Nblonde du fils du beau-père,',
 'ça a pas rapport, pas deux secondes!',
 '... Il y a une équipe canadienne,',
 'ça peut être Vancouver,\\Nça peut être Calgary,',
 'ça peut être Edmonton,\\NToronto ou Montréal',
 'qui peut déménager,\\Nça, je vous le donne.',
 '- Quand même, depuis le lock-out...',
 '- Le lock-out! hâllo! Lock-out!\\NIl est où, le lock-out?',
 'Quoi, le lock-out? Lock-out?',
 '- Buttman, avec\\Nses affaires che

In [6]:
# Read the annotation spreadsheet into a DataFrame; later cells filter on and
# count its 'Listened Language' column.
bcbc_gs = pd.read_excel('BCBC_GS_annotator1.xlsx')

    eng frn
eng
frn 

In [37]:
# Keep only rows whose 'Listened Language' is present and is neither the
# mixed 'French/English' label nor the literal string "None", then show how
# many rows remain per language.
bcbc_gs = bcbc_gs[
    (bcbc_gs['Listened Language'] != 'French/English')
    & (bcbc_gs['Listened Language'] != "None")
    & bcbc_gs['Listened Language'].notnull()
]
bcbc_gs['Listened Language'].value_counts()

French     7650
English    5803
Name: Listened Language, dtype: int64

In [41]:
def prob_matrix(tokens, langs, conditional=False):
    """Build a nested dict of relative frequencies for adjacent label pairs.

    tokens: an iterable of labels in order (e.g. a pandas Series or a list).
        It is materialised into a list, so the Series index is irrelevant
        and generators are accepted.
    langs: the labels that should appear as outer keys even when they never
        start a pair. Labels found in tokens but missing from langs are
        added as they are encountered.
    conditional: when False (the default), each value is the count of the
        pair (x, y) divided by the total number of adjacent pairs,
        len(tokens) - 1, so all values together sum to 1. When True, each
        value is the count of (x, y) divided by the number of pairs that
        start with x, so each non-empty inner dict sums to 1.

    Returns a dict mapping first label -> {second label: frequency}. With
    fewer than two tokens there are no pairs and every inner dict is empty.
    """
    seq = list(tokens)
    switches = {lang: {} for lang in langs}
    counts = Counter(zip(seq, seq[1:]))

    # Number of pairs starting with each label; only needed for row
    # normalisation.
    row_totals = Counter()
    if conditional:
        for (first, _second), c in counts.items():
            row_totals[first] += c

    # Only used when counts is non-empty, i.e. when len(seq) >= 2.
    pair_total = float(len(seq) - 1)

    for (x, y), c in counts.items():
        denom = float(row_totals[x]) if conditional else pair_total
        # setdefault avoids a KeyError for labels that were not in langs.
        switches.setdefault(x, {})[y] = c / denom

    return switches

In [61]:
# Pair frequencies over the 'Listened Language' column, using the set of
# values present in that column as the outer keys.
prob = prob_matrix(
    bcbc_gs['Listened Language'],
    set(bcbc_gs['Listened Language']),
)
prob

{'French': {'French': 0.5489146595301814, 'English': 0.01977401129943503},
 'English': {'French': 0.019699672911091287, 'English': 0.4116116562592923}}