# Language Classification Feature Engineering
Generate character trigram features for language classification model
<br>
<b>Dataset:</b> https://downloads.tatoeba.org/exports/

In [18]:
#imports
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer


## Dataset

In [12]:
#Read in full dataset
data = pd.read_csv('../data/sentences.csv',
                            sep='\t', 
                            encoding='utf8', 
                            index_col=0,
                            names=['lang','text'])

#Filter by text length
len_cond = [True if 20<=len(s)<=200 else False for s in data['text']]
data = data[len_cond]

#Filter by text language
lang = ['eng','deu','spa','fra','por','ita']
data = data[data['lang'].isin(lang)]

#Select 50000 rows for each language
data_trim = pd.DataFrame(columns=['lang','text'])

for l in lang:
    lang_trim = data[data['lang'] ==l].sample(50000,random_state = 100)
    data_trim = data_trim.append(lang_trim)

#Create a random train, valid, test split
data_shuffle = data_trim.sample(frac=1)

train = data_shuffle[0:210000]
valid = data_shuffle[210000:270000]
test = data_shuffle[270000:300000]

In [None]:
#Save train, valid, test split
train.to_csv('../data/train.csv')
valid.to_csv('../data/valid.csv')
test.to_csv('../data/test.csv')

## Feature engineering 

In [16]:
train = pd.read_csv("../data/train.csv",index_col =0)
valid = pd.read_csv("../data/valid.csv",index_col =0)
test = pd.read_csv("../data/test.csv",index_col =0)
print(len(train),len(valid),len(test))
train.head()

210000 60000 30000


Unnamed: 0,lang,text
4716404,fra,Tom y va trois fois par semaine.
2697085,ita,"Se non fosse stato per il tuo coraggio, sarest..."
3158361,ita,Io sono impegnato a preparare il prossimo esame.
1251601,spa,Ganar la lotería es una forma fácil de hacer d...
6107210,eng,Layla had been fantasizing about killing her p...


In [42]:
def get_features(corpus,n_feat=200):
    """
    Returns a list of the N most common character trigrams from a list of sentences
    
    """
    
    #fit the n-gram model
    vectorizer = CountVectorizer(analyzer='char',
                            ngram_range=(3, 3)
                            ,max_features=n_feat)
    
    X = vectorizer.fit_transform(corpus)
    
    #Get model feature names
    feature_names = vectorizer.get_feature_names()
    
    return feature_names
    

In [44]:
features = {}
for l in lang:
    
    
        #get corpus filtered by language
    corpus = train[train.lang==l]['text']
    
    
    features[l] = get_features(corpus)
    print(len(features[l]))

200
200
200
200
200
200


In [45]:
features

{'eng': [' a ',
  ' al',
  ' an',
  ' ar',
  ' as',
  ' be',
  ' bo',
  ' ca',
  ' co',
  ' di',
  ' do',
  ' fo',
  ' fr',
  ' go',
  ' ha',
  ' he',
  ' hi',
  ' ho',
  ' i ',
  ' in',
  ' is',
  ' it',
  ' kn',
  ' li',
  ' lo',
  ' ma',
  ' me',
  ' mo',
  ' ne',
  ' no',
  ' of',
  ' on',
  ' pr',
  ' re',
  ' sa',
  ' se',
  ' sh',
  ' so',
  ' st',
  ' th',
  ' to',
  ' wa',
  ' we',
  ' wh',
  ' wi',
  ' wo',
  ' yo',
  "'s ",
  "'t ",
  'aid',
  'all',
  'ami',
  'an ',
  'and',
  'ant',
  'are',
  'ary',
  'as ',
  'at ',
  'at.',
  'ave',
  'be ',
  'can',
  'd a',
  'd h',
  'd m',
  'd t',
  'did',
  "dn'",
  'do ',
  'don',
  'e a',
  'e b',
  'e c',
  'e d',
  'e h',
  'e i',
  'e m',
  'e o',
  'e p',
  'e s',
  'e t',
  'e w',
  'ear',
  'ed ',
  'en ',
  'ent',
  'er ',
  'er.',
  'ere',
  'es ',
  'et ',
  'eve',
  'for',
  'g t',
  'ght',
  'hat',
  'hav',
  'he ',
  'her',
  'hin',
  'his',
  'hou',
  'ht ',
  'i d',
  'i w',
  'id ',
  'ike',
  'ill',
  'in ',
  '

In [52]:
features_set = set()
for l in lang:
    features_set.update(features[l])
    
len(features_set)


665

In [54]:
vocab = dict()
for i,f in enumerate(features_set):
    vocab[f]=i
vocab


{'m s': 0,
 'ux ': 1,
 'nha': 2,
 't m': 3,
 'ans': 4,
 ' mi': 5,
 'g t': 6,
 'si ': 7,
 'don': 8,
 've ': 9,
 'não': 10,
 'do.': 11,
 'e j': 12,
 'tra': 13,
 'n a': 14,
 ' zu': 15,
 ' cu': 16,
 'ein': 17,
 ' gi': 18,
 ' lo': 19,
 'di ': 20,
 'ser': 21,
 'thi': 22,
 ' bo': 23,
 'y a': 24,
 'man': 25,
 'sso': 26,
 ' pu': 27,
 'com': 28,
 ' ac': 29,
 'nd ': 30,
 't a': 31,
 're ': 32,
 'ste': 33,
 'our': 34,
 ' nã': 35,
 'ill': 36,
 'con': 37,
 'ehr': 38,
 'lo ': 39,
 'pro': 40,
 ' so': 41,
 'ert': 42,
 'lei': 43,
 ' ta': 44,
 'i w': 45,
 'can': 46,
 'tom': 47,
 'eux': 48,
 'hab': 49,
 'en.': 50,
 'out': 51,
 'wha': 52,
 'ass': 53,
 'tar': 54,
 'voi': 55,
 'tes': 56,
 's l': 57,
 'eu ': 58,
 't w': 59,
 'ihr': 60,
 's m': 61,
 'su ': 62,
 'to.': 63,
 ' ih': 64,
 'ver': 65,
 'ai ': 66,
 ' ch': 67,
 'mon': 68,
 'hre': 69,
 'de ': 70,
 'más': 71,
 'azi': 72,
 'she': 73,
 'mme': 74,
 'a p': 75,
 'non': 76,
 'ang': 77,
 't t': 78,
 ' du': 79,
 'ero': 80,
 'ary': 81,
 'ers': 82,
 'ido': 83,
 '

In [57]:
vectorizer = CountVectorizer(analyzer='char',
                            ngram_range=(3, 3),
                            max_features=200,
                            vocabulary=vocab)

corpus = train['text']
    
X = vectorizer.fit_transform(corpus)


In [62]:
feature_names = vectorizer.get_feature_names()

train_ngram = pd.DataFrame(data=X.toarray(),columns=feature_names)
train_ngram['lang'] = train['lang']
