# Language Classification Feature Engineering
Generate character trigram features for language classification model
<br>
<b>Dataset:</b> https://downloads.tatoeba.org/exports/

In [1]:
#imports
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer


## Dataset

In [2]:
#Read in full dataset
data = pd.read_csv('../data/sentences.csv',
                            sep='\t', 
                            encoding='utf8', 
                            index_col=0,
                            names=['lang','text'])

#Filter by text length
len_cond = [True if 20<=len(s)<=200 else False for s in data['text']]
data = data[len_cond]

#Filter by text language
lang = ['eng','deu','spa','fra','por','ita']
data = data[data['lang'].isin(lang)]

#Select 50000 rows for each language
data_trim = pd.DataFrame(columns=['lang','text'])

for l in lang:
    lang_trim = data[data['lang'] ==l].sample(50000,random_state = 100)
    data_trim = data_trim.append(lang_trim)

#Create a random train, valid, test split
data_shuffle = data_trim.sample(frac=1)

train = data_shuffle[0:210000]
valid = data_shuffle[210000:270000]
test = data_shuffle[270000:300000]

In [3]:
#Save train, valid, test split
train.to_csv('../data/train.csv')
valid.to_csv('../data/valid.csv')
test.to_csv('../data/test.csv')

## Feature engineering 

In [4]:
train = pd.read_csv("../data/train.csv",index_col =0)
valid = pd.read_csv("../data/valid.csv",index_col =0)
test = pd.read_csv("../data/test.csv",index_col =0)
print(len(train),len(valid),len(test))
train.head()

210000 60000 30000


Unnamed: 0,lang,text
341834,fra,"Pour aggraver les choses, il commença à pleuvoir."
1108027,fra,J'ai tellement faim.
1829725,por,Nunca mais falei com ela.
3583008,ita,Ogni settimana ce n'è una nuova.
406553,por,Com certeza ele vem.


In [5]:
def get_features(corpus,n_feat=200):
    """
    Returns a list of the N most common character trigrams from a list of sentences
    params
    ------------
        corpus: list of strings
        n_feat: integer
    """
    
    #fit the n-gram model
    vectorizer = CountVectorizer(analyzer='char',
                            ngram_range=(3, 3)
                            ,max_features=n_feat)
    
    X = vectorizer.fit_transform(corpus)
    
    #Get model feature names
    feature_names = vectorizer.get_feature_names()
    
    return feature_names
    

In [6]:
#obtain trigrams from each language
features = {}
features_set = set()

for l in lang:
    
    #get corpus filtered by language
    corpus = train[train.lang==l]['text']
    
    #get 200 most frequent trigrams
    trigrams = get_features(corpus)
    
    #add to dict and set
    features[l] = trigrams 
    features_set.update(trigrams)

    
#create vocabulary list using feature set
vocab = dict()
for i,f in enumerate(features_set):
    vocab[f]=i


In [11]:
len(features['eng'])

200

In [16]:
#train count vectoriser using vocabulary
vectorizer = CountVectorizer(analyzer='char',
                            vocabulary=vocab)

#create feature matrix for training set
corpus = train['text']   
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()

train_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
train_feat['lang'] = list(train['lang'])

In [17]:
print(len(train_feat.columns))
train_feat

664


Unnamed: 0,i p,tan,ro,eme,was,ome,ete,n s,o l,la,...,och,ada,o a,hat,pre,sto,ito,ll,ann,lang
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fra
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fra
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,por
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ita
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,por
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spa
209996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ita
209997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,deu
209998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,deu


In [86]:
#create feature matrix for validation set
corpus = valid['text']   
X = vectorizer.fit_transform(corpus)

valid_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
valid_feat['lang'] = list(valid['lang'])

#create feature matrix for test set
corpus = test['text']   
X = vectorizer.fit_transform(corpus)

test_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
test_feat['lang'] = list(test['lang'])

print(len(valid_feat.columns),len(test_feat.columns))
print(len(train_feat),len(valid_feat),len(test_feat))

666 666
210000 60000 30000


In [87]:
#Save train, valid, test split
train_feat.to_csv('../data/train_feat.csv')
valid_feat.to_csv('../data/valid_feat.csv')
test_feat.to_csv('../data/test_feat.csv')

## Modelling - Hyper-parameter tunning

In [88]:
train_feat = pd.read_csv("../data/train_feat.csv",index_col =0)
valid_feat = pd.read_csv("../data/valid_feat.csv",index_col =0)
test_feat = pd.read_csv("../data/test_feat.csv",index_col =0)
print(len(train_feat),len(valid_feat),len(test_feat))
train_feat.head()

210000 60000 30000


Unnamed: 0,m s,ux,nha,t m,ans,mi,g t,si,don,ve,...,esp,en,do,kno,son,s e,re.,uf,res,lang
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fra
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ita
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ita
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spa
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,eng
