# Naive Bayes

## Imports.

In [1]:
from sklearn.metrics import classification_report, confusion_matrix

# Make common scripts visible
import sys
sys.path.append('../common/')

from loader import load_preprocessed_data
from classification import run_bernoulli_naive_bayes, run_multinomial_naive_bayes, run_multinomial_naive_bayes_tfidf
from lookup_tables import topic_code_to_topic_dict

## Load the train and test data.

Use the already-lemmatized data.

In [2]:
# Load the preprocessed (lemmatized) dataset from disk.
x, y = load_preprocessed_data('data/rcv1_lemmatized.csv')

# Leading 80% of the rows become the training set, the trailing 20% the test set.
# NOTE(review): the split is purely positional (nothing is shuffled in this cell) —
# confirm the rows in the CSV are not ordered by topic.
total_examples = len(y)
split_point = int(total_examples * 0.8)
train_x, test_x = x[:split_point], x[split_point:]
train_y, test_y = y[:split_point], y[split_point:]

print('Number of training examples: {}'.format(len(train_x)))

Number of training examples: 72488


## Assess Bernoulli Naive Bayes baseline classification performance.

Run Bernoulli Naive Bayes and report per-class precision, recall, and F1, along with the confusion matrix.

In [3]:
# Run the Bernoulli Naive Bayes helper with ngram_range (1, 1).
# NOTE(review): test_y is handed to the helper as well — confirm it is not used for fitting.
predict_y = run_bernoulli_naive_bayes(
    train_x, train_y, test_x, test_y, ngram_range=(1, 1)
)

# Per-topic precision/recall/F1, labelled with the topic names from the lookup table.
# NOTE(review): relies on the dict's value order matching the sorted label order — confirm.
topic_names = topic_code_to_topic_dict.values()
report = classification_report(test_y, predict_y, digits=6, target_names=topic_names)
print(report)

# Confusion matrix: rows are true topics, columns are predicted topics.
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.884829  0.958938  0.920394      6137
  ECONOMIC PERFORMANCE   0.932183  0.940576  0.936361      1666
             ELECTIONS   0.849628  0.856004  0.852804      2132
                HEALTH   0.952785  0.804703  0.872506       978
              RELIGION   0.888430  0.505882  0.644678       425
                SPORTS   0.990569  0.959906  0.974996      6784

             micro avg   0.926553  0.926553  0.926553     18122
             macro avg   0.916404  0.837668  0.866956     18122
          weighted avg   0.928377  0.926553  0.925300     18122

[[5885   34  161   21   14   22]
 [  22 1567   66    1    1    9]
 [ 222   64 1825    2    3   16]
 [ 148   15   19  787    3    6]
 [ 176    0   21    4  215    9]
 [ 198    1   56   11    6 6512]]


In [4]:
# Run the Bernoulli Naive Bayes helper again, this time with ngram_range (1, 2).
# NOTE(review): test_y is handed to the helper as well — confirm it is not used for fitting.
predict_y = run_bernoulli_naive_bayes(
    train_x, train_y, test_x, test_y, ngram_range=(1, 2)
)

# Per-topic precision/recall/F1, labelled with the topic names from the lookup table.
# NOTE(review): relies on the dict's value order matching the sorted label order — confirm.
topic_names = topic_code_to_topic_dict.values()
report = classification_report(test_y, predict_y, digits=6, target_names=topic_names)
print(report)

# Confusion matrix: rows are true topics, columns are predicted topics.
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.650313  0.982728  0.782688      6137
  ECONOMIC PERFORMANCE   0.982729  0.341537  0.506904      1666
             ELECTIONS   0.963583  0.459193  0.621982      2132
                HEALTH   1.000000  0.009202  0.018237       978
              RELIGION   0.000000  0.000000  0.000000       425
                SPORTS   0.923247  0.985849  0.953522      6784

             micro avg   0.787772  0.787772  0.787772     18122
             macro avg   0.753312  0.463085  0.480555     18122
          weighted avg   0.823522  0.787772  0.742768     18122

[[6031    1    5    0    0  100]
 [ 824  569   18    0    0  255]
 [1054    9  979    0    0   90]
 [ 887    0    3    9    0   79]
 [ 382    0   11    0    0   32]
 [  96    0    0    0    0 6688]]


  'precision', 'predicted', average, warn_for)


## Assess Multinomial Naive Bayes performance using term counts.

In [5]:
# Run the term-count Multinomial Naive Bayes helper with ngram_range (1, 1).
predict_y = run_multinomial_naive_bayes(
    train_x, train_y, test_x, ngram_range=(1, 1)
)

# Per-topic precision/recall/F1, labelled with the topic names from the lookup table.
# NOTE(review): relies on the dict's value order matching the sorted label order — confirm.
topic_names = topic_code_to_topic_dict.values()
report = classification_report(test_y, predict_y, digits=6, target_names=topic_names)
print(report)

# Confusion matrix: rows are true topics, columns are predicted topics.
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.953675  0.949324  0.951494      6137
  ECONOMIC PERFORMANCE   0.938596  0.963385  0.950829      1666
             ELECTIONS   0.893645  0.949812  0.920873      2132
                HEALTH   0.909645  0.916155  0.912888       978
              RELIGION   0.872093  0.882353  0.877193       425
                SPORTS   0.997735  0.973909  0.985678      6784

             micro avg   0.956517  0.956517  0.956517     18122
             macro avg   0.927565  0.939157  0.933159     18122
          weighted avg   0.957431  0.956517  0.956801     18122

[[5826   40  151   65   40   15]
 [  16 1605   41    2    2    0]
 [  43   56 2025    5    3    0]
 [  50    8   19  896    5    0]
 [  33    0   14    3  375    0]
 [ 141    1   16   14    5 6607]]


In [6]:
# Run the term-count Multinomial Naive Bayes helper with ngram_range (1, 2).
predict_y = run_multinomial_naive_bayes(
    train_x, train_y, test_x, ngram_range=(1, 2)
)

# Per-topic precision/recall/F1, labelled with the topic names from the lookup table.
# NOTE(review): relies on the dict's value order matching the sorted label order — confirm.
topic_names = topic_code_to_topic_dict.values()
report = classification_report(test_y, predict_y, digits=6, target_names=topic_names)
print(report)

# Confusion matrix: rows are true topics, columns are predicted topics.
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.893195  0.983868  0.936342      6137
  ECONOMIC PERFORMANCE   0.969809  0.944778  0.957130      1666
             ELECTIONS   0.920657  0.946998  0.933642      2132
                HEALTH   0.992669  0.692229  0.815663       978
              RELIGION   0.994624  0.435294  0.605565       425
                SPORTS   0.996706  0.981132  0.988858      6784

             micro avg   0.946308  0.946308  0.946308     18122
             macro avg   0.961276  0.830717  0.872866     18122
          weighted avg   0.949966  0.946308  0.943324     18122

[[6038   11   62    3    1   22]
 [  38 1574   54    0    0    0]
 [  84   28 2019    1    0    0]
 [ 271    9   21  677    0    0]
 [ 207    0   33    0  185    0]
 [ 122    1    4    1    0 6656]]


## Assess Multinomial Naive Bayes performance using TF-IDF.

In [7]:
# Run the TF-IDF Multinomial Naive Bayes helper with ngram_range (1, 1).
predict_y = run_multinomial_naive_bayes_tfidf(
    train_x, train_y, test_x, ngram_range=(1, 1)
)

# Per-topic precision/recall/F1, labelled with the topic names from the lookup table.
# NOTE(review): relies on the dict's value order matching the sorted label order — confirm.
topic_names = topic_code_to_topic_dict.values()
report = classification_report(test_y, predict_y, digits=6, target_names=topic_names)
print(report)

# Confusion matrix: rows are true topics, columns are predicted topics.
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.774169  0.994297  0.870533      6137
  ECONOMIC PERFORMANCE   0.984491  0.876351  0.927279      1666
             ELECTIONS   0.961279  0.803471  0.875319      2132
                HEALTH   1.000000  0.332311  0.498849       978
              RELIGION   1.000000  0.037647  0.072562       425
                SPORTS   0.997136  0.975088  0.985989      6784

             micro avg   0.895652  0.895652  0.895652     18122
             macro avg   0.952846  0.669861  0.705088     18122
          weighted avg   0.916469  0.895652  0.880761     18122

[[6102    1   20    0    0   14]
 [ 169 1460   36    0    0    1]
 [ 399   20 1713    0    0    0]
 [ 647    2    3  325    0    1]
 [ 397    0    9    0   16    3]
 [ 168    0    1    0    0 6615]]


In [8]:
# Run the TF-IDF Multinomial Naive Bayes helper with ngram_range (1, 2).
predict_y = run_multinomial_naive_bayes_tfidf(
    train_x, train_y, test_x, ngram_range=(1, 2)
)

# Per-topic precision/recall/F1, labelled with the topic names from the lookup table.
# NOTE(review): relies on the dict's value order matching the sorted label order — confirm.
topic_names = topic_code_to_topic_dict.values()
report = classification_report(test_y, predict_y, digits=6, target_names=topic_names)
print(report)

# Confusion matrix: rows are true topics, columns are predicted topics.
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.711277  0.995926  0.829871      6137
  ECONOMIC PERFORMANCE   0.992652  0.810924  0.892633      1666
             ELECTIONS   0.986695  0.660882  0.791573      2132
                HEALTH   1.000000  0.049080  0.093567       978
              RELIGION   1.000000  0.002353  0.004695       425
                SPORTS   0.995367  0.981722  0.988497      6784

             micro avg   0.859784  0.859784  0.859784     18122
             macro avg   0.947665  0.583481  0.600139     18122
          weighted avg   0.898249  0.859784  0.831429     18122

[[6112    0    2    0    0   23]
 [ 301 1351   13    0    0    1]
 [ 713    8 1409    0    0    2]
 [ 925    2    2   48    0    1]
 [ 418    0    2    0    1    4]
 [ 124    0    0    0    0 6660]]
