In [None]:
import numpy as np 
import pandas as pd 
import torch
import os
from io import open
import unicodedata
import string
import re
import random
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

## Preprocessing the dataset
* Extract the text files and split them into train, validation, and test sets.
* Store the three sets in CSV format for later reference.

In [None]:
# Locations of the two sides of the Europarl v7 French-English corpus.
# The cells below pair line i of the English file with line i of the French file.
english_path = "/content/europarl-v7.fr-en.en"
# Must be the ".fr" file: pointing both paths at the ".en" file makes the
# 'fr' column an exact copy of the English sentences.
french_path = "/content/europarl-v7.fr-en.fr"

In [None]:
def load_document(filename):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the whole file.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error), so the descriptor never leaks.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

def doc_to_sentences(doc):
    """Split a document into its lines.

    Leading and trailing whitespace of the whole document is removed first,
    then the text is cut at every newline character. Interior blank lines are
    kept as empty strings.
    """
    trimmed = doc.strip()
    sentences = trimmed.split('\n')
    return sentences
 
def len_of_sentences(sentences):
    """Return (shortest, longest) sentence length measured in whitespace-separated words.

    Raises:
        ValueError: if `sentences` is empty (min/max of an empty list).
    """
    word_counts = list(map(lambda sentence: len(sentence.split()), sentences))
    shortest = min(word_counts)
    longest = max(word_counts)
    return shortest, longest
 
# --- English side: read the file, split it into lines, report size and word-count range ---
doc = load_document(english_path)
english_sentences = doc_to_sentences(doc)
min_len, max_len = len_of_sentences(english_sentences)
print(
    'English: total sentences=%d, minimum=%d, maximum=%d'
    % (len(english_sentences), min_len, max_len)
)

# --- French side: same steps on the second file ---
doc = load_document(french_path)
french_sentences = doc_to_sentences(doc)
min_len, max_len = len_of_sentences(french_sentences)
print(
    'French: total sentences=%d, minimum=%d, maximum=%d'
    % (len(french_sentences), min_len, max_len)
)

# Pair the two sentence lists row by row ('en' first, then 'fr').
# Construction raises ValueError if the lists differ in length.
data_frame = pd.DataFrame({'en': english_sentences, 'fr': french_sentences})
print(f'Shape of Dataframe: {data_frame.shape}')
data_frame.head()

English: total sentences=2007723, minimum=0, maximum=668
French: total sentences=2007723, minimum=0, maximum=668
Shape of Dataframe: (2007723, 2)


Unnamed: 0,en,fr
0,Resumption of the session,Resumption of the session
1,I declare resumed the session of the European ...,I declare resumed the session of the European ...
2,"Although, as you will have seen, the dreaded '...","Although, as you will have seen, the dreaded '..."
3,You have requested a debate on this subject in...,You have requested a debate on this subject in...
4,"In the meantime, I should like to observe a mi...","In the meantime, I should like to observe a mi..."


In [None]:
# Lower-case both language columns; every cell is expected to be a str.
for lang_col in ('en', 'fr'):
    data_frame[lang_col] = data_frame[lang_col].apply(lambda text: text.lower())

In [None]:
# Splitting the dataset into train, validation and test sets.
#
# Partitions are carved out by row label instead of concatenating frames and
# calling drop_duplicates(keep=False). That trick also deletes every sentence
# pair that occurs more than once in the corpus, so rows silently vanish from
# the training data. Exact duplicate pairs are collapsed once up front, which
# keeps one copy of each pair and guarantees no pair lands in two splits.
unique_pairs = data_frame.drop_duplicates()

# Dropping by label below is only correct when every row label is distinct.
assert unique_pairs.index.is_unique

# 50% of the unique pairs are held out as the test set.
test_df = unique_pairs.sample(frac=0.5, random_state=42)
remaining_df = unique_pairs.drop(index=test_df.index)

# 10% of what is left becomes the validation set; the rest is training data.
val_df = remaining_df.sample(frac=0.1, random_state=42)
train_df = remaining_df.drop(index=val_df.index)

# Same index convention as before: test/val are renumbered from 0,
# train keeps the row labels of data_frame.
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# The three partitions must account for every unique pair exactly once.
assert len(train_df) + len(val_df) + len(test_df) == len(unique_pairs)

print(f'shape of train_df: {train_df.shape}\n shape of val_data: {val_df.shape}\n\
shape of test_data: {test_df.shape}')


shape of train_df: (874968, 2)
 shape of val_data: (97219, 2)
shape of test_data: (1003862, 2)


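### Sample and filter the data to allow a feasible training regime

> Each split is truncated to its first 30,000 rows, and any pair in which either sentence has 40 or more space-separated tokens is dropped.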



In [None]:
%time
def filter_df(df, max_len, size=30000):
    """Keep short sentence pairs from the first `size` rows of `df`.

    A row is kept only when both its 'en' and its 'fr' text contain fewer than
    `max_len` tokens, where tokens are obtained by splitting on a single space.

    Args:
        df: DataFrame with string columns 'en' and 'fr'. It is not modified.
        max_len: Exclusive upper bound on the token count of each sentence.
        size: Number of leading rows to consider before filtering.

    Returns:
        A new DataFrame holding the surviving rows with their original row labels.
    """
    head = df.iloc[:size]

    # Vectorised token counts replace the row-by-row iterrows()/drop(inplace=True)
    # loop, which raised SettingWithCopyWarning and re-built the frame per dropped row.
    en_tokens = head['en'].str.split(' ').str.len()
    fr_tokens = head['fr'].str.split(' ').str.len()
    keep = (en_tokens < max_len) & (fr_tokens < max_len)

    # A positional boolean mask stays correct even if row labels repeat;
    # copy() hands the caller an independent frame.
    return head.loc[keep.to_numpy()].copy()

# datasets filtering
# Coerce every text cell to str first, because filter_df calls string methods on each cell.
for split_df in (train_df, test_df, val_df):
    for lang_col in ('en', 'fr'):
        split_df[lang_col] = split_df[lang_col].apply(lambda cell: str(cell))

# Reduce each split with filter_df using a 40-token limit.
print("Train data filtering...")
train_data = filter_df(train_df, 40)

print("Test data filtering...")
test_data = filter_df(test_df, 40)

print("Validation data filtering...")
val_data = filter_df(val_df, 40)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.34 µs
Train data filtering...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Test data filtering...
Validation data filtering...


In [None]:
# Write each filtered split to its own CSV file, without the index column.
csv_targets = (
    (train_data, '/content/train.csv'),
    (test_data, '/content/test.csv'),
    (val_data, '/content/val.csv'),
)
for split_frame, csv_path in csv_targets:
    split_frame.to_csv(csv_path, index=False)