In [1]:
from gensim.corpora.dictionary import Dictionary 
from gensim.utils import tokenize
from cltk.tokenize.word import WordTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

In [2]:
df = pd.read_csv('data/eng-fren 2.txt', names=['english','french'], sep='\t')
#df['en_tokens'] = df['english'].apply(simple_tokenize)

In [3]:
def tokenize_french(sentence):
    return tokenize(sentence, lower=True, deacc=False)

def tokenize_english(sentence):
    return tokenize(sentence, lower=True )
df['fr_tokens'] = df['french'].apply(tokenize_french).apply(lambda x: list(x))
df['en_tokens'] = df['english'].apply(tokenize_english).apply(lambda x: list(x))

In [4]:
en_id2word = Dictionary(df['en_tokens'])
fr_id2word = Dictionary(df['fr_tokens'])

In [5]:
df.head()

Unnamed: 0,english,french,fr_tokens,en_tokens
0,Go.,Va !,[va],[go]
1,Run!,Cours !,[cours],[run]
2,Run!,Courez !,[courez],[run]
3,Fire!,Au feu !,"[au, feu]",[fire]
4,Help!,À l'aide !,"[à, l, aide]",[help]


In [6]:
fr_id2word.token2id['à']

7

In [7]:
print(len(fr_id2word.keys()))
len(en_id2word.keys())

23197


13469

### Things I've learned so far 
french is not english, some NLP libraries break


In [8]:
def en_doc2num(tokens):
    nums=[]
    for token in tokens:
        nums.append(en_id2word.token2id[token])
    return nums
def fr_doc2num(tokens):
    nums=[]
    for token in tokens:
        nums.append(fr_id2word.token2id[token])
    return nums
df['en_bow']=df['en_tokens'].apply(en_doc2num)
df['fr_bow'] = df['fr_tokens'].apply(fr_doc2num)

In [9]:
df.to_csv('data/processed.tsv', sep='\t')

In [10]:
df.head()

Unnamed: 0,english,french,fr_tokens,en_tokens,en_bow,fr_bow
0,Go.,Va !,[va],[go],[0],[0]
1,Run!,Cours !,[cours],[run],[1],[1]
2,Run!,Courez !,[courez],[run],[1],[2]
3,Fire!,Au feu !,"[au, feu]",[fire],[2],"[3, 4]"
4,Help!,À l'aide !,"[à, l, aide]",[help],[3],"[7, 6, 5]"


In [11]:
df_nums = pd.read_csv('data/processed.tsv', sep='\t')

In [12]:
df_nums['french'].unique().shape[0] - df_nums['french'].shape[0]

-7658

In [13]:
df_nums.shape[0]-df_nums.drop_duplicates().shape[0]
df_nums.shape

(154883, 7)

Oh right, I only used one data source for the 'processed' data. Lets fix that now. 

In [14]:
df2 = pd.read_csv('data/eng-fren2.txt', names=['english','french'], sep='\t')

In [15]:
df_big = pd.concat([df,df2]).drop_duplicates()

TypeError: unhashable type: 'list'

In [None]:
df2.shape

OK, I'm going to be honest that doesn't really seem like enough data coming out of the second frame to really be worth it. 

I'm going to try sentences

In [None]:
df_sentences = pd.read_csv('data/sentences.tsv', names=['english', 'french'], sep='\t')
all_data = pd.concat([df_sentences,df_big]).drop_duplicates()

In [None]:
all_data.to_csv('data/eng_fren_full.tsv', sep='\t')

In [None]:
def nlp_to_nums(df):
    #tokenize and normalize the data
    df['french_tokens'] = df['french'].apply(tokenize, lower=True, deacc=False).apply(lambda x: list(x))
    df['english_tokens'] = df['english'].apply(tokenize, lower=True).apply(lambda x: list(x))
    print('Tokens Created')
    #create the dictionaries
    id2fren = Dictionary(df['french_tokens'])
    id2eng = Dictionary(df['english_tokens'])
    print('Dictionaries Built')
    #transform the words into numbers
    def en_doc2num(tokens):
        nums=[]
        for token in tokens:
            nums.append(id2eng.token2id[token])
        return nums
    
    def fr_doc2num(tokens):
        nums=[]
        for token in tokens:
            nums.append(id2fren.token2id[token])
        return nums
    
    df['english_bow'] = df['english_tokens'].apply(en_doc2num)
    df['french_bow'] = df['french_tokens'].apply(fr_doc2num)
    print('BOWs built')
    
    #pad the sequences 
    processed_data['english_padded'] = pad_sequences(
        processed_data['english_bow'],
        maxlen=50,
        dtype='int32',
        padding='post',
        value=-1
    ).tolist()
    
    processed_data['french_padded'] = pad_sequences(
        processed_data['french_bow'],
        maxlen=60,
        dtype='int32',
        padding='post',
        value=-1
    ).tolist()
    
    return df, id2fren, id2eng

In [None]:
processed_data, id2fren, id2eng = nlp_to_nums(all_data)

In [None]:
processed_data['eng_len']=processed_data['english_bow'].apply(lambda x: len(x))
processed_data['fren_len']=processed_data['french_bow'].apply(lambda x: len(x))
processed_data.describe()

In [None]:
processed_data['english_padded'] = ( pad_sequences(
    processed_data['english_bow'],
    maxlen=50,
    dtype='int32',
    padding='post',
    truncating='pre',
    value=-1
).tolist())

In [None]:
processed_data.head()

In [None]:
def encode_english(sentence, id2eng): 
    
    words = tokenize(sentence, lower=True)
    nums = [id2eng.token2id[token] for token in words]
    padded = pad_sequences(
        [nums],
        maxlen=50,
        dtype='int32',
        padding='post',
        truncating='post',
        value=-1
    )
    return padded

In [None]:
encode_english("this dictionary seems to be fairly strong against random words being thrown in", id2eng)

In [None]:
def decode_french(nums, id2fren):
    return [id2fren[num] for num in nums if num != -1]

In [None]:
test = processed_data['french_padded'].iloc[0]

In [None]:
decode_french(test, id2fren)

In [None]:
processed_data.to_csv('data/processed_full.tsv', sep='\t')