In [23]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from sklearn.metrics import classification_report

In [7]:
# Read the tab-separated language corpus; the file's first column becomes the index.
dataset = pd.read_csv('../raw_data/language_sets.tsv', sep='\t', index_col=0)

# Partition rows by the value of the `set` column. Each partition is copied so
# it is an independent frame rather than a view onto `dataset`.
train, val, test = (
    dataset.loc[dataset['set'] == split_name].copy()
    for split_name in ('train', 'val', 'test')
)

In [11]:
# Character-bigram vectorizers (n-grams taken inside word boundaries).
# `one_hot_vec` records presence/absence only; `count_vec` keeps raw counts.
one_hot_vec = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 2),
    binary=True,
)
count_vec = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 2),
)

# One naive Bayes estimator per feature model, all with default settings.
bernoulli = BernoulliNB()
multi = MultinomialNB()
gaus = GaussianNB()

In [22]:
# Learn the bigram vocabularies on the training texts only and build the
# training feature matrices (binary indicators and raw counts).
one_hot_X = one_hot_vec.fit_transform(train.content)
count_X = count_vec.fit_transform(train.content)

# Fit Bernoulli NB on the binary presence/absence features.
# `fit` returns the estimator itself, so no reassignment is needed.
bernoulli.fit(one_hot_X, train.language)

# Fit multinomial NB on the raw bigram counts.
multi.fit(count_X, train.language)

# Fit Gaussian NB on a dense copy of the count matrix.
# `.toarray()` is the documented dense conversion for SciPy sparse containers;
# the `.A` shorthand used previously is deprecated in recent SciPy releases.
gaus_X = count_X.toarray()
gaus.fit(gaus_X, train.language)

## Multinomial Report

In [25]:
# Encode the validation texts with the count vectorizer, predict with the
# multinomial model, and print per-language precision/recall/F1.
multi_validation_X = count_vec.transform(val['content'])
multi_preds = multi.predict(multi_validation_X)
print(classification_report(val['language'], multi_preds))

             precision    recall  f1-score   support

         ar       1.00      1.00      1.00        17
         de       0.89      1.00      0.94        17
         en       1.00      1.00      1.00        17
         es       1.00      1.00      1.00        17
         fr       1.00      1.00      1.00        17
         it       1.00      1.00      1.00        17
         ja       1.00      1.00      1.00        17
         nl       1.00      0.88      0.94        17
         pl       1.00      1.00      1.00        17
         pt       1.00      1.00      1.00        17
         ru       1.00      1.00      1.00        17

avg / total       0.99      0.99      0.99       187



## Gaussian Report

In [41]:
# Encode the validation texts with the count vectorizer and densify the result
# for the Gaussian model. `.toarray()` is the documented dense conversion for
# SciPy sparse containers; the `.A` shorthand used previously is deprecated in
# recent SciPy releases.
gaus_validation_X = count_vec.transform(val.content).toarray()

gaus_preds = gaus.predict(gaus_validation_X)

# Per-language precision/recall/F1 on the validation split.
print(classification_report(y_true=val.language, y_pred=gaus_preds))

             precision    recall  f1-score   support

         ar       1.00      1.00      1.00        17
         de       0.94      1.00      0.97        17
         en       0.89      0.94      0.91        17
         es       1.00      1.00      1.00        17
         fr       1.00      1.00      1.00        17
         it       0.94      1.00      0.97        17
         ja       1.00      1.00      1.00        17
         nl       1.00      0.82      0.90        17
         pl       1.00      1.00      1.00        17
         pt       1.00      1.00      1.00        17
         ru       1.00      1.00      1.00        17

avg / total       0.98      0.98      0.98       187



## Bernoulli Report

In [49]:
# Encode the validation texts as binary bigram indicators, predict with the
# Bernoulli model, and print per-language precision/recall/F1.
bern_validation_X = one_hot_vec.transform(val['content'])
bern_preds = bernoulli.predict(bern_validation_X)
print(classification_report(val['language'], bern_preds))

             precision    recall  f1-score   support

         ar       0.94      1.00      0.97        17
         de       1.00      1.00      1.00        17
         en       1.00      0.94      0.97        17
         es       1.00      0.94      0.97        17
         fr       1.00      1.00      1.00        17
         it       1.00      1.00      1.00        17
         ja       1.00      1.00      1.00        17
         nl       0.94      1.00      0.97        17
         pl       1.00      1.00      1.00        17
         pt       1.00      1.00      1.00        17
         ru       1.00      1.00      1.00        17

avg / total       0.99      0.99      0.99       187

