# Clean Data
<br>
Clean the text dataset and split it into training, validation, and test sets.
<br>
<b>Dataset:</b> https://downloads.tatoeba.org/exports/

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated sentence export. The file has no header row and
# three columns; only the language code and the sentence text are needed,
# so the first column is never loaded instead of being dropped in place.
MIN_CHARS = 20   # shortest sentence kept (inclusive)
MAX_CHARS = 200  # longest sentence kept (inclusive)

sentences_raw = pd.read_csv(
    'data/sentences.csv',
    sep='\t',
    encoding='utf8',
    header=None,
    names=['id', 'lang', 'sent'],
    usecols=['lang', 'sent'],
)

# Keep sentences whose length is within [MIN_CHARS, MAX_CHARS] characters.
# Rows with a missing sentence have a NaN length and are filtered out too.
sent_length = sentences_raw['sent'].str.len()
data_original = sentences_raw[sent_length.between(MIN_CHARS, MAX_CHARS)]

print(len(data_original))  # 5788767 rows in the recorded run
print(data_original['lang'].nunique())  # 314 unique languages in the recorded run
data_original.head()

5788767
314


Unnamed: 0,lang,sent
4,cmn,今天是６月１８号，也是Muiriel的生日！
20,cmn,选择什么是“对”或“错”是一项艰难的任务，我们却必须要完成它。
66,cmn,我们看东西不是看其实质，而是以我们的主观意识看它们的。
70,cmn,生活就是當你忙著進行你的計劃時總有其他的事情發生。
75,deu,Lass uns etwas versuchen!


In [3]:
# Restrict the corpus to the six target languages: English, German, Spanish,
# French, Portuguese and Italian (codes as they appear in the `lang` column).
lang = ['eng', 'deu', 'spa', 'fra', 'por', 'ita']
is_target_lang = data_original['lang'].isin(lang)
data = data_original[is_target_lang]
print(len(data))  # 2759972 rows in the recorded run
data.head()

2759972


Unnamed: 0,lang,sent
75,deu,Lass uns etwas versuchen!
76,deu,Ich muss schlafen gehen.
78,deu,Heute ist der 18. Juni und das ist der Geburts...
79,deu,"Herzlichen Glückwunsch zum Geburtstag, Muiriel!"
80,deu,Muiriel ist jetzt 20.


In [4]:
# Number of sentences available for each selected language, largest first.
sentences_per_lang = data['lang'].value_counts()
sentences_per_lang

eng    966443
ita    535320
deu    406641
fra    331154
por    263212
spa    257202
Name: lang, dtype: int64

In [5]:
# Build a balanced sample: the same number of sentences from every language.
# The per-language samples are collected in a list and concatenated once.
# DataFrame.append is deprecated since pandas 1.4 and removed in 2.0, and it
# copied the whole accumulated frame on every loop iteration.
SAMPLES_PER_LANG = 50000
SAMPLE_SEED = 100  # fixed seed so the same rows are drawn on every run

data_trim = pd.concat(
    [
        data[data['lang'] == code].sample(SAMPLES_PER_LANG, random_state=SAMPLE_SEED)
        for code in lang
    ]
)
data_trim['lang'].value_counts()

fra    50000
deu    50000
eng    50000
por    50000
ita    50000
spa    50000
Name: lang, dtype: int64

In [6]:
# Divide the balanced sample into 70% training, 20% validation and 10% test.
from sklearn.model_selection import train_test_split

X = data_trim['sent']  # sentence text (model input)
y = data_trim['lang']  # language code (label)

# First cut: 70% for training, 30% held out for validation + test.
# The hold-out gets its own names so X and y keep referring to the full set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.30, random_state=101
)

# Second cut: one third of the hold-out (10% overall) becomes the test set,
# the remaining two thirds (20% overall) become the validation set.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=1/3, random_state=101
)


In [9]:
# Save datasets: pair each split's labels with its sentences (label column
# first) and write one CSV per split. The row index is written as well.
train = pd.concat([y_train, X_train], axis=1)
valid = pd.concat([y_valid, X_valid], axis=1)
test = pd.concat([y_test, X_test], axis=1)
print(len(train), len(valid), len(test))

for split_frame, csv_path in [
    (train, 'data/train.csv'),
    (valid, 'data/valid.csv'),
    (test, 'data/test.csv'),
]:
    split_frame.to_csv(csv_path)


210000 60000 30000
