In [31]:
import pandas as pd
import os,re,nltk
from tqdm import tqdm
from sklearn import cross_validation, linear_model, metrics 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

#### First lets start with loading the test and traning data and pre-processing. 

In [32]:
#Clean and Process text to remove unnecessary characters and text. Split paragraphs into sentences. 
def pre_process(text):
    
    text = re.sub("(<.+>)" ,"",text) #remove the xml data 
    text = re.sub("\n+","\n",text)  
    text = re.sub("[(\d)+:,\(\)\-\"\'+\\/\.\?!]"," ", text) #remove the special characters
    text = re.sub(" +"," ",text) 
    text = re.sub("^ ","",text)  #Remove spaces at the start
    text = text.split("\n")
    text_sentence_list = []
    
    for paragraph in text[1:len(text)-1]:                   #Split paragraph into sentences
        for sentence in nltk.sent_tokenize(paragraph):
            if len(sentence.split(" ")) > 3 :           #Add sentences that are greater than size 3 
                text_sentence_list.append(sentence)         

    return text_sentence_list

In [33]:
path = "text/txt"
folders_list = os.listdir(path)
train_text,train_labels = [],[]

#For each language parse text files and process them 
for i in tqdm(range(len(folders_list))):
    text_files = os.listdir(path + "/" + folders_list[i])
    sentence_count = 0
    for text_file in text_files: 
        text = open(path + "/" + folders_list[i] + "/" + text_file, encoding= 'utf8').read()
        text = pre_process(text)            
        if sentence_count <= 500: #Limit number of sentences to 500 
            train_text.extend(text)     
            train_labels.extend([folders_list[i] for j in range(len(text))])
            sentence_count += len(text) 
        else:
            break 

100%|██████████| 21/21 [00:01<00:00, 10.58it/s]


In [50]:
#Prepare the training data 
train_data = pd.DataFrame({ 'text' : train_text,
                            'language' : train_labels })
train_data = train_data.sample(frac=1)
x_train,y_train = train_data.drop(["language"],axis=1),train_data["language"]
# x_train,x_test,y_train,y_test = cross_validation.train_test_split(train_data.drop(["language"],axis=1), 
#                                                                   train_data['language'],test_size = 0.2, 
#                                                                   random_state = 0 ) 

In [56]:
#load data from the test file and process it 
test  = pd.read_csv(open("storage/europarl.test",encoding = 'utf8'), sep="\t", names = ['language','text'])
test = test.sample(frac = 1)
x_test,y_test = test.drop(["language"],axis=1),test["language"]

#### Let's create a few classes that would help us with featuring engineering and creating a pipeline for the process. The text can contain key features like the word lengths, the character variation , sentence length and word counts which vary based on language the text belongs to. 

In [41]:
class TextSelector(BaseEstimator,TransformerMixin):
    """Takes in the dataframe, outputs the text column"""
    
    def __init__(self,key):
        self.key = key 
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    

In [42]:
class  TextLengthExtractor(BaseEstimator,TransformerMixin):
    """Takes the dataframe , extracts the text and outputs the length  of the text"""

    def __init__(self):
        pass 
    
    def transform(self, df, y = None):
        return np.transpose(np.matrix(df["text"].apply(lambda x: len(x)))) 
    
    def fit(self, df, y = None ):
        return self

In [43]:
class WordCountExtractor(BaseEstimator,TransformerMixin):
    """Takes the dataframe, extracts the text and outputs the word count for the text"""
    
    def __init__(self):
        pass
    
    def word_count(self,text):
        return len(text.split(" "))
    
    def transform(self, df, y = None):
        return np.transpose(np.matrix(df["text"].apply(self.word_count)))
    
    def fit(self, df, y = None):
        return self 
    

In [44]:
class MeanWordLengthExtractor(BaseEstimator,TransformerMixin):
    """Takes the dataframe, extracts the text and outputs the mean word length for the text"""
    
    def __init__(self):
        pass
    
    def mean_length(self,text):
        return np.mean([len(word) for word in text.split(" ")])
    
    def transform(self, df, y = None):
        return np.transpose(np.matrix(df["text"].apply(self.mean_length)))
    
    def fit(self, df, y = None):
        return self 
    

In [45]:
class UniqueCharacterCountExtractor(BaseEstimator,TransformerMixin):
    """Takes the dataframe, extracts the text and outputs the number of unique characters in the text"""
    
    def __init__(self):
        pass
    
    def transform(self, df, y = None):
        return np.transpose(np.matrix(df["text"].apply(lambda x: len(set(x)))))
    
    def fit(self, df, y = None):
        return self 
    

#### Now lets use the created classes to build a pipeline. Let's try and implement RandomForest that works well with multiclass classification.

In [46]:
pipe = Pipeline([
        ('features', FeatureUnion([ 
           ('ngram_tf_idf',Pipeline([
              ('selector', TextSelector(key = "text")),    
              ('vectorizer', CountVectorizer( analyzer= "char")),
              ('transformer',TfidfTransformer())
              ])), 
             ('text_length', TextLengthExtractor()),
             ("word_count", WordCountExtractor()),
             ('mean_word_length', MeanWordLengthExtractor()),
             ('unique_character_count', UniqueCharacterCountExtractor())
            ])),   
        ('clf', RandomForestClassifier())
       ])

#### Let's look at a few of the hyperparameters that we can tweak.  

In [47]:
pipe.get_params().keys() 

dict_keys(['clf__random_state', 'memory', 'clf__min_impurity_decrease', 'features__ngram_tf_idf__steps', 'clf__verbose', 'features__ngram_tf_idf__transformer__sublinear_tf', 'features__unique_character_count', 'features__ngram_tf_idf__vectorizer__strip_accents', 'features__ngram_tf_idf__vectorizer__stop_words', 'clf__min_weight_fraction_leaf', 'features__ngram_tf_idf__selector', 'clf__max_depth', 'features__ngram_tf_idf__transformer__smooth_idf', 'clf__max_features', 'features__ngram_tf_idf__vectorizer__encoding', 'features__ngram_tf_idf__vectorizer__min_df', 'features__text_length', 'clf__warm_start', 'clf', 'clf__max_leaf_nodes', 'features__ngram_tf_idf__vectorizer__lowercase', 'features__ngram_tf_idf__vectorizer__dtype', 'features__ngram_tf_idf__transformer__use_idf', 'features__ngram_tf_idf__vectorizer__ngram_range', 'clf__min_samples_leaf', 'features__ngram_tf_idf__transformer__norm', 'features__ngram_tf_idf__vectorizer__max_df', 'features__ngram_tf_idf__vectorizer', 'clf__oob_sco

#### We can vary the ngram range and  ignore high frequency characters. 

In [52]:
hyperparameters = {
                   'features__ngram_tf_idf__vectorizer__ngram_range' :  [(1,2),(1,3),(1,4)],
                   'features__ngram_tf_idf__vectorizer__max_df' : [0.9,0.95]
                  }
classifier = GridSearchCV(pipe, hyperparameters,cv=5)
classifier.fit(x_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('ngram_tf_idf', Pipeline(memory=None,
     steps=[('selector', TextSelector(key='text')), ('vectorizer', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', i...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'features__ngram_tf_idf__vectorizer__ngram_range': [(1, 2), (1, 3), (1, 4)], 'features__ngram_tf_idf__vectorizer__max_df': [0.9, 0.95]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
classifier.best_params_

{'features__ngram_tf_idf__vectorizer__max_df': 0.9,
 'features__ngram_tf_idf__vectorizer__ngram_range': (1, 4)}

In [54]:
classifier.refit

True

In [57]:
pred = classifier.predict(x_test)

In [82]:
print ("The accuracy is "  + str(np.mean(pred == y_test)))

The accuracy is 0.9365277511042827


#### Let's try another pipeline with logistic regression classifier to see if we can see an improvement or not. 

In [62]:
pipe1 = Pipeline([
        ('features', FeatureUnion([ 
           ('ngram_tf_idf',Pipeline([
              ('selector', TextSelector(key = "text")),    
              ('vectorizer', CountVectorizer(ngram_range=(1,4), analyzer= "char")),
              ('transformer',TfidfTransformer())
              ])), 
             ('text_length', TextLengthExtractor()),
             ("word_count", WordCountExtractor()),
             ('mean_word_length', MeanWordLengthExtractor()),
             ('unique_character_count', UniqueCharacterCountExtractor())
            ])),   
        ('clf', linear_model.LogisticRegression())
       ])

In [63]:
hyperparameters = {
                   'features__ngram_tf_idf__vectorizer__ngram_range' :  [(1,2),(1,3),(1,4)],
                   'features__ngram_tf_idf__vectorizer__max_df' : [0.9,0.95]
                  }
classifier = GridSearchCV(pipe1, hyperparameters,cv=5)
classifier.fit(x_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('ngram_tf_idf', Pipeline(memory=None,
     steps=[('selector', TextSelector(key='text')), ('vectorizer', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'features__ngram_tf_idf__vectorizer__ngram_range': [(1, 2), (1, 3), (1, 4)], 'features__ngram_tf_idf__vectorizer__max_df': [0.9, 0.95]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [64]:
classifier.best_params_

{'features__ngram_tf_idf__vectorizer__max_df': 0.9,
 'features__ngram_tf_idf__vectorizer__ngram_range': (1, 4)}

In [65]:
classifier.refit

True

In [66]:
pred2 = classifier.predict(x_test)

In [83]:
print ("The accuracy is "  + str(np.mean(pred2 == y_test)))

The accuracy is 0.991453812175917


In [76]:
test["predicted"] = pred2 

In [79]:
len(test["language"])

20828

#### Let's take a look at the misclassified examples. 

In [98]:
misclassified_text =  test[test["language"] != test["predicted"]] 
print(misclassified_text)

      language                                               text predicted
18484       sk                     Stredomorská strava (rozprava)        fi
1916        cs  Zároveň je nanejvýš důležité, aby Barma zaháji...        sk
2050        da  Betragter vi nordpolsområdet som en potentiel ...        sv
6070        es                            Ciertamente, tenía uno.        fi
6111        es  De Jericó a Ramallah yo tardaba normalmente 20...        pt
18745       sk                        Zatiaľ ju však ešte nemáte.        sl
1786        cs  V konečném důsledku budeme muset přijmout komp...        et
4329        el                           Είχαμε διαπραγματεύσεις.        fi
1273        cs  Jmenovali se William Meyer, Bernard Starie, Re...        sk
14324       nl  Het consumentenrecht moet consumenten daadwerk...        fi
2858        da  Their will is the law, not only at home, but a...        en
7547        et                             Osaistungjärgu avamine        fi
6729        

In [89]:
print(misclassified_text["language"].value_counts())

sk    44
cs    24
es    18
de    11
pt    11
da     9
it     7
pl     7
fr     7
hu     6
et     5
nl     5
ro     4
sv     4
lt     4
en     4
fi     3
sl     2
lv     2
el     1
Name: language, dtype: int64


#### The slovak language has highest misclassifications followed by czech and spanish. Let's see what are they being classified as. 

In [91]:
misclassified_text[misclassified_text["language"] == "sk"]["predicted"].value_counts() 

sl    26
fi     7
cs     4
it     3
et     3
hu     1
Name: predicted, dtype: int64


#### Slovak is misclassified with slovenian which may indicate similarity in the languages. Similarly, we can see the results for other languages below.

In [95]:
print(misclassified_text[misclassified_text["language"] == "cs"]["predicted"].value_counts() )

sl    7
sk    7
fi    5
et    4
en    1
Name: predicted, dtype: int64


In [96]:
print(misclassified_text[misclassified_text["language"] == "es"]["predicted"].value_counts() )

fi    7
it    4
pt    3
et    2
ro    1
sk    1
Name: predicted, dtype: int64


### Clearly the logistic regression beats the random forest in this task. We can use more complex algorithms like RNNs or an esemble of multiple models but this would come at the cost of training time.  