In [47]:
from gensim.corpora.dictionary import Dictionary 
from gensim.utils import tokenize
from cltk.tokenize.word import WordTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

In [17]:
# Load the tab-separated English/French sentence pairs; the file has no header
# row, so the column names are supplied explicitly.
df = pd.read_csv('data/eng-fren 2.txt', names=['english','french'], sep='\t')

In [80]:
def tokenize_french(sentence):
    """Tokenize a French sentence with gensim's ``tokenize``.

    Tokens are lowercased; ``deacc=False`` keeps accented characters as-is.
    The result is returned unmaterialized, exactly as ``tokenize`` yields it.
    """
    options = {'lower': True, 'deacc': False}
    return tokenize(sentence, **options)


def tokenize_english(sentence):
    """Tokenize an English sentence with gensim's ``tokenize``, lowercased."""
    return tokenize(sentence, lower=True)


# Materialize the token streams into lists so each cell holds a reusable sequence.
df['fr_tokens'] = df['french'].apply(lambda text: list(tokenize_french(text)))
df['en_tokens'] = df['english'].apply(lambda text: list(tokenize_english(text)))

In [81]:
# Build one gensim Dictionary per language from the token lists; the
# `token2id` attribute of each is used below to turn tokens into integer ids.
en_id2word = Dictionary(df['en_tokens'])
fr_id2word = Dictionary(df['fr_tokens'])

In [82]:
df.head()

Unnamed: 0,english,french,en_tokens,fr_tokens
0,Go.,Va !,[go],[va]
1,Run!,Cours !,[run],[cours]
2,Run!,Courez !,[run],[courez]
3,Fire!,Au feu !,[fire],"[au, feu]"
4,Help!,À l'aide !,[help],"[à, l, aide]"
5,Jump.,Saute.,[jump],[saute]
6,Stop!,Ça suffit !,[stop],"[ça, suffit]"
7,Stop!,Stop !,[stop],[stop]
8,Stop!,Arrête-toi !,[stop],"[arrête, toi]"
9,Wait!,Attends !,[wait],[attends]


In [73]:
fr_id2word.token2id['à']

7

In [56]:
print(len(fr_id2word.keys()))
len(en_id2word.keys())

23879


15107

### Things I've learned so far
French is not English, and some NLP libraries break on it.


In [90]:
def en_doc2num(tokens):
    """Return the integer ids of ``tokens`` looked up in ``en_id2word.token2id``."""
    lookup = en_id2word.token2id
    return [lookup[token] for token in tokens]


def fr_doc2num(tokens):
    """Return the integer ids of ``tokens`` looked up in ``fr_id2word.token2id``."""
    lookup = fr_id2word.token2id
    return [lookup[token] for token in tokens]


# One list of ids per sentence, in the original token order.
df['en_bow'] = df['en_tokens'].apply(en_doc2num)
df['fr_bow'] = df['fr_tokens'].apply(fr_doc2num)

In [91]:
# Persist the frame as a tab-separated file.
# NOTE(review): no index=False is passed, so the row index is written as an
# extra column (the file reads back with 7 columns instead of 6). The
# list-valued token/id columns are presumably stored as their string repr —
# confirm before relying on them after a reload.
df.to_csv('data/processed.tsv', sep='\t')

In [92]:
df.head()

Unnamed: 0,english,french,en_tokens,fr_tokens,en_bow,fr_bow
0,Go.,Va !,[go],[va],[0],[0]
1,Run!,Cours !,[run],[cours],[1],[1]
2,Run!,Courez !,[run],[courez],[1],[2]
3,Fire!,Au feu !,[fire],"[au, feu]",[2],"[3, 4]"
4,Help!,À l'aide !,[help],"[à, l, aide]",[3],"[7, 6, 5]"


In [3]:
df_nums = pd.read_csv('data/processed.tsv', sep='\t')

In [10]:
# Distinct French sentences minus total rows: a negative result is the number
# of rows whose French sentence already appears in another row.
df_nums['french'].unique().shape[0] - df_nums['french'].shape[0]

-7658

In [12]:
# Count rows that repeat an earlier (english, french) pair.
# The comparison is restricted to the sentence columns: the reloaded frame has
# an extra column holding the saved row index (7 columns read vs. 6 written),
# which would make every whole row look unique. The count is printed because
# a cell only displays its last expression.
n_duplicate_pairs = int(df_nums.duplicated(subset=['english', 'french']).sum())
print(n_duplicate_pairs)
df_nums.shape

(154883, 7)

Oh right, I only used one data source for the 'processed' data. Let's fix that now.

In [14]:
# Load the second tab-separated corpus with the same two-column layout.
# NOTE(review): this path differs from the earlier 'data/eng-fren 2.txt' only
# by a space — confirm these are really two distinct files.
df2 = pd.read_csv('data/eng-fren2.txt', names=['english','french'], sep='\t')

In [23]:
# Combine both corpora on the sentence columns only. When the notebook is run
# top to bottom, `df` already carries the list-valued token/id columns added
# above; lists are unhashable, so drop_duplicates() on them raises TypeError,
# and concat would also fill those columns with NaN for every row of `df2`.
df_big = pd.concat([df[['english', 'french']], df2]).drop_duplicates()

In [21]:
df2.shape

(145437, 2)

OK, I'm going to be honest: that doesn't really seem like enough data coming out of the second frame to be worth it.

I'm going to try the sentences dataset instead.

In [31]:
# Load the sentences corpus (same english/french column layout) and stack it on
# top of the combined frame, dropping rows that are exact duplicates.
df_sentences = pd.read_csv('data/sentences.tsv', names=['english', 'french'], sep='\t')
all_data = pd.concat([df_sentences,df_big]).drop_duplicates()

In [33]:
all_data.to_csv('data/eng_fren_full.tsv', sep='\t')

In [67]:
def nlp_to_nums(df, en_maxlen=50, fr_maxlen=60, pad_value=-1):
    """Tokenize, index and pad the English/French sentence pairs in ``df``.

    The frame is modified in place (columns ``*_tokens``, ``*_bow`` and
    ``*_padded`` are added for both languages) and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'english'`` and ``'french'`` columns (assumed to hold
        sentence strings).
    en_maxlen, fr_maxlen : int, optional
        Length every English / French id sequence is padded to.
    pad_value : int, optional
        Filler id appended after the real token ids.

    Returns
    -------
    tuple
        ``(df, id2fren, id2eng)``: the augmented frame plus the French and
        English gensim dictionaries built from its tokens.
    """
    # Tokenize and normalize the data (lowercase; French accents are kept).
    df['french_tokens'] = df['french'].apply(
        lambda text: list(tokenize(text, lower=True, deacc=False))
    )
    df['english_tokens'] = df['english'].apply(
        lambda text: list(tokenize(text, lower=True))
    )
    print('Tokens Created')

    # Create the dictionaries.
    id2fren = Dictionary(df['french_tokens'])
    id2eng = Dictionary(df['english_tokens'])
    print('Dictionaries Built')

    # Transform the words into numbers.
    fr_ids = id2fren.token2id
    en_ids = id2eng.token2id
    df['english_bow'] = df['english_tokens'].apply(
        lambda tokens: [en_ids[token] for token in tokens]
    )
    df['french_bow'] = df['french_tokens'].apply(
        lambda tokens: [fr_ids[token] for token in tokens]
    )
    print('BOWs built')

    # Pad the sequences. This must operate on the `df` argument: the previous
    # version used the notebook-level name `processed_data`, which does not
    # exist until this function has returned.
    df['english_padded'] = pad_sequences(
        df['english_bow'],
        maxlen=en_maxlen,
        dtype='int32',
        padding='post',
        value=pad_value
    ).tolist()

    df['french_padded'] = pad_sequences(
        df['french_bow'],
        maxlen=fr_maxlen,
        dtype='int32',
        padding='post',
        value=pad_value
    ).tolist()

    return df, id2fren, id2eng

In [92]:
processed_data, id2fren, id2eng = nlp_to_nums(all_data)

Tokens Created
Dictionaries Built
BOWs built


In [69]:
# Number of token ids per sentence, then summary statistics of the numeric columns.
processed_data['eng_len'] = processed_data['english_bow'].apply(len)
processed_data['fren_len'] = processed_data['french_bow'].apply(len)
processed_data.describe()

Unnamed: 0,eng_len,fren_len
count,284495.0,284495.0
mean,8.826925,9.725626
std,3.744018,4.015354
min,1.0,1.0
25%,6.0,6.0
50%,8.0,9.0
75%,13.0,14.0
max,48.0,59.0


In [65]:
# Recompute the padded English ids outside nlp_to_nums: sequences are
# post-padded with -1 up to length 50. truncating='pre' means a sequence longer
# than 50 would lose its leading ids.
# NOTE(review): describe() reports a maximum English length of 48, so
# presumably nothing is actually truncated here.
processed_data['english_padded'] = ( pad_sequences(
    processed_data['english_bow'],
    maxlen=50,
    dtype='int32',
    padding='post',
    truncating='pre',
    value=-1
).tolist())

In [70]:
processed_data.head()

Unnamed: 0,english,french,french_tokens,english_tokens,english_bow,french_bow,eng_len,fren_len,english_padded,french_padded
0,"new jersey is sometimes quiet during autumn , ...",new jersey est parfois calme pendant l' automn...,"[new, jersey, est, parfois, calme, pendant, l,...","[new, jersey, is, sometimes, quiet, during, au...","[8, 7, 5, 11, 9, 3, 2, 0, 6, 5, 10, 4, 1]","[10, 7, 4, 11, 2, 12, 8, 0, 5, 6, 4, 9, 3, 1]",13,14,"[8, 7, 5, 11, 9, 3, 2, 0, 6, 5, 10, 4, 1, -1, ...","[10, 7, 4, 11, 2, 12, 8, 0, 5, 6, 4, 9, 3, 1, ..."
1,the united states is usually chilly during jul...,les états-unis est généralement froid en juill...,"[les, états, unis, est, généralement, froid, e...","[the, united, states, is, usually, chilly, dur...","[17, 18, 16, 5, 19, 12, 3, 14, 0, 6, 5, 19, 13...","[18, 21, 20, 4, 15, 13, 3, 17, 5, 6, 14, 16, 3...",15,14,"[17, 18, 16, 5, 19, 12, 3, 14, 0, 6, 5, 19, 13...","[18, 21, 20, 4, 15, 13, 3, 17, 5, 6, 14, 16, 3..."
2,"california is usually quiet during march , and...","california est généralement calme en mars , et...","[california, est, généralement, calme, en, mar...","[california, is, usually, quiet, during, march...","[20, 5, 19, 9, 3, 23, 0, 6, 5, 19, 21, 4, 22]","[22, 4, 15, 2, 3, 25, 5, 6, 4, 15, 23, 3, 24]",13,13,"[20, 5, 19, 9, 3, 23, 0, 6, 5, 19, 21, 4, 22, ...","[22, 4, 15, 2, 3, 25, 5, 6, 4, 15, 23, 3, 24, ..."
3,the united states is sometimes mild during jun...,"les états-unis est parfois légère en juin , et...","[les, états, unis, est, parfois, légère, en, j...","[the, united, states, is, sometimes, mild, dur...","[17, 18, 16, 5, 11, 25, 3, 22, 0, 6, 5, 24, 4,...","[18, 21, 20, 4, 11, 27, 3, 24, 5, 6, 26, 13, 3...",14,14,"[17, 18, 16, 5, 11, 25, 3, 22, 0, 6, 5, 24, 4,...","[18, 21, 20, 4, 11, 27, 3, 24, 5, 6, 26, 13, 3..."
4,"your least liked fruit is the grape , but my l...","votre moins aimé fruit est le raisin , mais mo...","[votre, moins, aimé, fruit, est, le, raisin, m...","[your, least, liked, fruit, is, the, grape, bu...","[34, 31, 32, 29, 5, 17, 30, 28, 33, 31, 32, 5,...","[38, 34, 29, 30, 4, 32, 37, 33, 35, 34, 29, 4,...",14,14,"[34, 31, 32, 29, 5, 17, 30, 28, 33, 31, 32, 5,...","[38, 34, 29, 30, 4, 32, 37, 33, 35, 34, 29, 4,..."


In [95]:
def encode_english(sentence, id2eng):
    """Encode one English sentence as a padded row of token ids.

    The sentence is lowercased and tokenized, each token is looked up in
    ``id2eng.token2id``, and the ids are post-padded (and post-truncated) with
    -1 to length 50. The lookup is a plain index, so a token missing from the
    mapping is not handled here.

    Returns the ``pad_sequences`` result for the single-sentence batch.
    """
    vocab = id2eng.token2id
    token_ids = []
    for word in tokenize(sentence, lower=True):
        token_ids.append(vocab[word])
    return pad_sequences(
        [token_ids],
        maxlen=50,
        dtype='int32',
        padding='post',
        truncating='post',
        value=-1
    )

In [102]:
encode_english("this dictionary seems to be fairly strong against random words being thrown in", id2eng)

array([[ 150, 3283, 1288,  109,  234, 2772,  595, 1860, 5781, 2528, 1864,
        5509,    4,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1]], dtype=int32)

In [111]:
def decode_french(nums, id2fren):
    """Turn a sequence of ids back into tokens, skipping the -1 padding value.

    Each remaining id is looked up with ``id2fren[num]``; order is preserved.
    """
    words = []
    for num in nums:
        if num == -1:
            continue  # padding, not a real token
        words.append(id2fren[num])
    return words

In [112]:
test = processed_data['french_padded'].iloc[0]

In [113]:
decode_french(test, id2fren)

['new',
 'jersey',
 'est',
 'parfois',
 'calme',
 'pendant',
 'l',
 'automne',
 'et',
 'il',
 'est',
 'neigeux',
 'en',
 'avril']

In [114]:
processed_data.to_csv('data/processed_full.tsv', sep='\t')