In [74]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import fileinput
import joblib
import glob2
import os

In [45]:
# Combining raw data into one file per language (en, es, pt)

for directory in os.listdir('./raw_data/'):
    file_list = glob2.glob(f'./raw_data/{directory}/*.txt')
    
    with open(f'./all_{directory}_text.txt', 'w') as fo:
        all_text = fileinput.input(file_list)
        fo.writelines(all_text)

In [46]:
data_train = {}
data_test = {}
train = []
test = []
train_target = []
test_target = []
labels = ['en', 'es', 'pt']
print(labels)

['en', 'es', 'pt']


In [47]:
for language in labels:
    all_content = ''
    
    with open(f'./all_{language}_data_text.txt', 'r') as fi:
        all_content = fi.read()
        splited_content = all_content.split('\n')
        temp_train = splited_content[:round(len(splited_content) * .7)]
        temp_test = splited_content[round(len(splited_content) * .7):]
        
        train += [sentence for sentence in temp_train]
        test += [sentence for sentence in temp_test]
        
        train_target += [language for _ in range(len(temp_train))]
        test_target += [language for _ in range(len(temp_test))]
        
data_train['data'] = train
data_train['target'] = train_target
data_test['data'] = test
data_test['target'] = test_target

In [54]:
print(len(data_train['data']))
print(len(data_train['target']))
print(len(data_test['data']))
print(len(data_test['target']))

627498
627498
268928
268928


In [55]:
mnnb_clf = Pipeline([
    ('_tfidf', TfidfVectorizer()),
    ('_clf', MultinomialNB()),
])

In [57]:
gs_parameters = {
    '_tfidf__ngram_range':[(1, 2), (1, 3), (1, 4), (2, 3), (2, 4)],
    '_tfidf__norm': ['l1', 'l2'],
    '_clf__alpha': (1e-2, 1e-3, 1e-4),
}
gs_clf = GridSearchCV(mnnb_clf, n_jobs=-1, param_grid=gs_parameters)

In [58]:
gs_clf.fit(data_train['data'], data_train['target'])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('_tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...ue,
        vocabulary=None)), ('_clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'_tfidf__ngram_range': [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4)], '_tfidf__norm': ['l1', 'l2'], '_clf__alpha': (0.01, 0.001, 0.0001, 1e-05)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [59]:
predicted = gs_clf.predict(data_test['data'])

In [61]:
print(classification_report(data_test['target'], predicted))

             precision    recall  f1-score   support

         en       0.94      0.99      0.97    188117
         es       0.96      0.88      0.92     58124
         pt       0.96      0.72      0.82     22687

avg / total       0.95      0.95      0.94    268928



In [64]:
gs_clf.best_estimator_.steps

[('_tfidf',
  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('_clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))]

In [73]:
for s, p, t in zip(data_test['data'], predicted, data_test['target']):
    if p != t:
        print(f'{s}\nPredicted: {p}\nTarget: {t}\n')

George Washington's Mount Vernon
Predicted: pt
Target: en

N2O5 ⇌ NO2 + NO3 → NO2 + O2 + NO
Predicted: es
Target: en

N2O5 + NO ⇌ 3 NO2
Predicted: es
Target: en

            Na
Predicted: pt
Target: en

            Na
Predicted: pt
Target: en

            NO
Predicted: es
Target: en

Na3NO4 + H2O + CO2 → NaNO3 + NaOH + NaHCO3
Predicted: es
Target: en

Microelectronics
Predicted: es
Target: en

Field-programmable gate array (FPGA)
Predicted: es
Target: en

Kings (1922–1952)
Predicted: es
Target: en

Izamal
Predicted: es
Target: en

=== La Quemada ===
Predicted: es
Target: en

=== Chupícuaro ===
Predicted: es
Target: en

The Great Mughals
Predicted: es
Target: en

A. Taghvaee, in Web Journal on Cultural Patrimony (Fabio Maniscalco ed.), vol. 1, January–June 2006
Predicted: es
Target: en

Decreasing levels of Lake Victoria Worry East African Countries
Predicted: es
Target: en

New Scientist article on Uganda's violation of the agreed curve for hydroelectric water flow.
Predicted: es
Targe


              HA
Predicted: es
Target: en

Honor (誉,yo)
Predicted: es
Target: en

Vitali, Francesco (2008) Piccolo Dizionario dell'Apocalisse, TAU Editrice, Todi
Predicted: es
Target: en

=== ICTV classification ===
Predicted: es
Target: en

Order (-virales)
Predicted: es
Target: en

Family (-viridae)
Predicted: pt
Target: en

Subfamily (-virinae)
Predicted: pt
Target: en

III: dsRNA viruses (e.g. Reoviruses)
Predicted: pt
Target: en

TD-Gammon
Predicted: es
Target: en

E. A. Thompson, A History of Attila and the Huns (1948).
Predicted: es
Target: en

=== Marlowe's Doctor Faustus ===
Predicted: es
Target: en

=== Mann's Doctor Faustus ===
Predicted: es
Target: en

Fausto (1866) by Estanislao del Campo
Predicted: es
Target: en

=== Chespirito's Faust ===
Predicted: es
Target: en

=== Stalin era ===
Predicted: es
Target: en

Ambler, John; Shaw, Denis J.B.; Symons, Leslie (1985). Soviet and East European Transport Problems. Taylor & Francis. ISBN 978-0-7099-0557-8. 
Predicted: es
Target:


== Colosseum II ==
Predicted: en
Target: es

Those Who Are About to Die Salute You – (1969)
Predicted: en
Target: es

Valentyne Suite – (1969)
Predicted: en
Target: es

Daughter of Time – (1970)
Predicted: en
Target: es

Colosseum Live – (1971)
Predicted: en
Target: es

=== Como Colosseum II ===
Predicted: en
Target: es

Strange New Flesh - (1976)
Predicted: en
Target: es

Electric Savage - (1977)
Predicted: en
Target: es

War Dance - (1977)
Predicted: en
Target: es

Lives: The Reunion Concerts 1994 – (1995)
Predicted: en
Target: es

Bread & Circuses – (1997)
Predicted: en
Target: es

Tomorrow's Blues – (2003)
Predicted: en
Target: es

Live Cologne 1994 – (2003)
Predicted: en
Target: es

Live 05 – (2007)
Predicted: en
Target: es

Time On Our Side – (2014)
Predicted: en
Target: es

The Collectors Colosseum – (1971)
Predicted: en
Target: es

Anthology – (2000) (2-CD)
Predicted: en
Target: es

Hanson, Martyn: Playing The Band - The Musical Life of Jon Hiseman, 2010, Londres, Temple Music


==== Antenas ====
Predicted: pt
Target: es

=== Tórax ===
Predicted: pt
Target: es

Coxa, segmento basal
Predicted: pt
Target: es

=== Abdomen ===
Predicted: en
Target: es

Archaeognatha (=Microcoryphia + Monura †)
Predicted: pt
Target: es

Zygentoma (=Thysanura s.str.)
Predicted: en
Target: es

Ephemeroptera (efímeras)
Predicted: pt
Target: es

Diaphanopteroidea †
Predicted: en
Target: es

Palaeodictyoptera †
Predicted: pt
Target: es

Megasecoptera †
Predicted: en
Target: es

Archodonata †
Predicted: en
Target: es

Blattodea (cucarachas)
Predicted: pt
Target: es

Isoptera (termitas)
Predicted: pt
Target: es

Mantodea (mantis)
Predicted: pt
Target: es

Dermaptera (tijeretas)
Predicted: pt
Target: es

Embioptera
Predicted: pt
Target: es

Zoraptera
Predicted: pt
Target: es

Grylloblattodea
Predicted: pt
Target: es

Mantophasmatodea
Predicted: pt
Target: es

Thysanoptera
Predicted: pt
Target: es

Phthiraptera (piojos)
Predicted: pt
Target: es

Hemiptera (chinches)
Predicted: pt
Target: e

          
Predicted: en
Target: es

            
Predicted: en
Target: es

              F
Predicted: en
Target: es

              →
Predicted: en
Target: es

            
Predicted: en
Target: es

          
Predicted: en
Target: es

        
Predicted: en
Target: es

        =
Predicted: en
Target: es

        q
Predicted: en
Target: es

        (
Predicted: en
Target: es

        
Predicted: en
Target: es

          
Predicted: en
Target: es

            
Predicted: en
Target: es

              v
Predicted: en
Target: es

              →
Predicted: en
Target: es

            
Predicted: en
Target: es

          
Predicted: en
Target: es

        
Predicted: en
Target: es

        ×
Predicted: en
Target: es

        
Predicted: en
Target: es

          
Predicted: en
Target: es

            
Predicted: en
Target: es

              B
Predicted: en
Target: es

              →
Predicted: en
Target: es

            
Predicted: en
Target: es

          
Predicted: en
Target: es

        


                2
Predicted: en
Target: pt

              
Predicted: en
Target: pt

            
Predicted: en
Target: pt

          
Predicted: en
Target: pt

        
Predicted: en
Target: pt

        =
Predicted: en
Target: pt

        
Predicted: en
Target: pt

          lim
Predicted: en
Target: pt

          
Predicted: en
Target: pt

            n
Predicted: en
Target: pt

            →
Predicted: en
Target: pt

            ∞
Predicted: en
Target: pt

          
Predicted: en
Target: pt

        
Predicted: en
Target: pt

        
Predicted: en
Target: pt

          (
Predicted: en
Target: pt

          
Predicted: en
Target: pt

            
Predicted: en
Target: pt

              
Predicted: en
Target: pt

                1
Predicted: en
Target: pt

                
Predicted: en
Target: pt

                  1
Predicted: en
Target: pt

                  
Predicted: en
Target: pt

                    2
Predicted: en
Target: pt

                  
Predicted: en
Target: pt

  


        
Predicted: en
Target: pt

      
Predicted: en
Target: pt

    
Predicted: en
Target: pt

    {\displaystyle U=U_{(S,V,N)}}
Predicted: en
Target: pt

  
Predicted: en
Target: pt

    
Predicted: en
Target: pt

      
Predicted: en
Target: pt

        V
Predicted: en
Target: pt

      
Predicted: en
Target: pt

    
Predicted: en
Target: pt

    {\displaystyle V}
Predicted: en
Target: pt

  
Predicted: en
Target: pt

    
Predicted: en
Target: pt

      
Predicted: en
Target: pt

        P
Predicted: en
Target: pt

      
Predicted: en
Target: pt

    
Predicted: en
Target: pt

    {\displaystyle P}
Predicted: en
Target: pt

  
Predicted: en
Target: pt

    
Predicted: en
Target: pt

      
Predicted: en
Target: pt

        N
Predicted: en
Target: pt

      
Predicted: en
Target: pt

    
Predicted: en
Target: pt

    {\displaystyle N}
Predicted: en
Target: pt

  
Predicted: en
Target: pt

    
Predicted: en
Target: pt

      
Predicted: en
Target: pt

        μ
Predicted: en



            
Predicted: en
Target: pt

              g
Predicted: en
Target: pt

              (
Predicted: en
Target: pt

              
Predicted: en
Target: pt

                T
Predicted: en
Target: pt

                
Predicted: en
Target: pt

                  C
Predicted: en
Target: pt

                
Predicted: en
Target: pt

              
Predicted: en
Target: pt

              )
Predicted: en
Target: pt

            
Predicted: en
Target: pt

          
Predicted: en
Target: pt

        
Predicted: en
Target: pt

      
Predicted: en
Target: pt

    
Predicted: en
Target: pt

  
Predicted: en
Target: pt

    
Predicted: en
Target: pt

      
Predicted: en
Target: pt

        f
Predicted: en
Target: pt

        (
Predicted: en
Target: pt

        
Predicted: en
Target: pt

          T
Predicted: en
Target: pt

          
Predicted: en
Target: pt

            H
Predicted: en
Target: pt

          
Predicted: en
Target: pt

        
Predicted: en
Target: pt

        ,
Pred