# Naive Bayes

## Import dependencies.

In [1]:
from sklearn.metrics import classification_report, confusion_matrix

# Make common scripts visible
import sys
sys.path.append('../common/')

from loader import load_preprocessed_data
from classification import run_bernoulli_naive_bayes, run_multinomial_naive_bayes, run_multinomial_naive_bayes_tfidf
from lookup_tables import topic_code_to_topic_dict

## Load the train and test data.

Use the already-lemmatized data.

In [2]:
# Read the preprocessed (lemmatized) dataset; the loader hands back two parallel
# sequences that are sliced with the same index below.
x, y = load_preprocessed_data('data/rcv1_lemmatized_reduced.csv')

# Positional hold-out: the first 80% of rows train the model, the remaining
# 20% are kept for evaluation. No shuffling is applied here.
total_examples = len(y)
split_point = int(total_examples * 0.8)
train_x, test_x = x[:split_point], x[split_point:]
train_y, test_y = y[:split_point], y[split_point:]

print('Number of training examples: {}'.format(len(train_x)))

Number of training examples: 2091


## Assess Bernoulli Naive Bayes baseline classification performance.

Run Bernoulli Naive Bayes and report per-class precision, recall, and F1 score, along with the confusion matrix.

In [3]:
# Bernoulli Naive Bayes with ngram_range=(1, 1) (presumably unigram features only).
# NOTE(review): this runner also receives test_y, whereas the multinomial runners
# called later in this notebook do not — confirm against the helper's signature.
predict_y = run_bernoulli_naive_bayes(
    train_x, train_y, test_x, test_y, ngram_range=(1, 1)
)

# Print the per-class metrics table first, then the raw confusion matrix.
# NOTE(review): target_names assumes the dict's value order lines up with the
# label order used by classification_report — TODO confirm.
print(
    classification_report(
        test_y,
        predict_y,
        digits=6,
        target_names=topic_code_to_topic_dict.values(),
    )
)
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.708812  0.968586  0.818584       191
  ECONOMIC PERFORMANCE   0.954545  0.724138  0.823529        58
             ELECTIONS   0.775000  0.563636  0.652632        55
                HEALTH   1.000000  0.250000  0.400000        36
              RELIGION   1.000000  0.272727  0.428571        11
                SPORTS   1.000000  0.965116  0.982249       172

             micro avg   0.833652  0.833652  0.833652       523
             macro avg   0.906393  0.624034  0.684261       523
          weighted avg   0.864956  0.833652  0.818490       523

[[185   0   6   0   0   0]
 [ 15  42   1   0   0   0]
 [ 22   2  31   0   0   0]
 [ 25   0   2   9   0   0]
 [  8   0   0   0   3   0]
 [  6   0   0   0   0 166]]


In [4]:
# Bernoulli Naive Bayes again, now with ngram_range=(1, 2)
# (presumably unigrams plus bigrams).
# NOTE(review): test_y is passed to this runner but not to the multinomial
# runners used elsewhere in this notebook — confirm against the helper.
predict_y = run_bernoulli_naive_bayes(
    train_x, train_y, test_x, test_y, ngram_range=(1, 2)
)

# Per-class metrics table followed by the confusion matrix.
# NOTE(review): target_names relies on the dict's value order matching the
# label order of the report — TODO confirm.
print(
    classification_report(
        test_y,
        predict_y,
        digits=6,
        target_names=topic_code_to_topic_dict.values(),
    )
)
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.514825  1.000000  0.679715       191
  ECONOMIC PERFORMANCE   1.000000  0.034483  0.066667        58
             ELECTIONS   1.000000  0.109091  0.196721        55
                HEALTH   0.000000  0.000000  0.000000        36
              RELIGION   0.000000  0.000000  0.000000        11
                SPORTS   1.000000  0.837209  0.911392       172

             micro avg   0.655832  0.655832  0.655832       523
             macro avg   0.585804  0.330130  0.309083       523
          weighted avg   0.732947  0.655832  0.576045       523

[[191   0   0   0   0   0]
 [ 56   2   0   0   0   0]
 [ 49   0   6   0   0   0]
 [ 36   0   0   0   0   0]
 [ 11   0   0   0   0   0]
 [ 28   0   0   0   0 144]]


  'precision', 'predicted', average, warn_for)


## Assess Multinomial Naive Bayes performance using term counts

In [5]:
# Multinomial Naive Bayes with ngram_range=(1, 1) (presumably unigram features only).
predict_y = run_multinomial_naive_bayes(
    train_x, train_y, test_x, ngram_range=(1, 1)
)

# Per-class metrics table followed by the confusion matrix.
# NOTE(review): target_names relies on the dict's value order matching the
# label order of the report — TODO confirm.
print(
    classification_report(
        test_y,
        predict_y,
        digits=6,
        target_names=topic_code_to_topic_dict.values(),
    )
)
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.943299  0.958115  0.950649       191
  ECONOMIC PERFORMANCE   0.933333  0.965517  0.949153        58
             ELECTIONS   0.777778  0.890909  0.830508        55
                HEALTH   1.000000  0.777778  0.875000        36
              RELIGION   1.000000  0.818182  0.900000        11
                SPORTS   1.000000  0.982558  0.991202       172

             micro avg   0.944551  0.944551  0.944551       523
             macro avg   0.942402  0.898843  0.916085       523
          weighted avg   0.948530  0.944551  0.944913       523

[[183   0   8   0   0   0]
 [  1  56   1   0   0   0]
 [  2   4  49   0   0   0]
 [  3   0   5  28   0   0]
 [  2   0   0   0   9   0]
 [  3   0   0   0   0 169]]


In [6]:
# Multinomial Naive Bayes with ngram_range=(1, 2)
# (presumably unigrams plus bigrams).
predict_y = run_multinomial_naive_bayes(
    train_x, train_y, test_x, ngram_range=(1, 2)
)

# Per-class metrics table followed by the confusion matrix.
# NOTE(review): target_names relies on the dict's value order matching the
# label order of the report — TODO confirm.
print(
    classification_report(
        test_y,
        predict_y,
        digits=6,
        target_names=topic_code_to_topic_dict.values(),
    )
)
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.874419  0.984293  0.926108       191
  ECONOMIC PERFORMANCE   0.963636  0.913793  0.938053        58
             ELECTIONS   0.894737  0.927273  0.910714        55
                HEALTH   1.000000  0.583333  0.736842        36
              RELIGION   1.000000  0.727273  0.842105        11
                SPORTS   1.000000  0.970930  0.985251       172

             micro avg   0.933078  0.933078  0.933078       523
             macro avg   0.955465  0.851149  0.889846       523
          weighted avg   0.939035  0.933078  0.930470       523

[[188   0   3   0   0   0]
 [  4  53   1   0   0   0]
 [  2   2  51   0   0   0]
 [ 13   0   2  21   0   0]
 [  3   0   0   0   8   0]
 [  5   0   0   0   0 167]]


## Assess Multinomial Naive Bayes performance using TF-IDF

In [7]:
# TF-IDF variant of the multinomial runner with ngram_range=(1, 1)
# (presumably unigram features only).
predict_y = run_multinomial_naive_bayes_tfidf(
    train_x, train_y, test_x, ngram_range=(1, 1)
)

# Per-class metrics table followed by the confusion matrix.
# NOTE(review): target_names relies on the dict's value order matching the
# label order of the report — TODO confirm.
print(
    classification_report(
        test_y,
        predict_y,
        digits=6,
        target_names=topic_code_to_topic_dict.values(),
    )
)
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.692029  1.000000  0.817987       191
  ECONOMIC PERFORMANCE   0.979592  0.827586  0.897196        58
             ELECTIONS   0.967742  0.545455  0.697674        55
                HEALTH   0.000000  0.000000  0.000000        36
              RELIGION   0.000000  0.000000  0.000000        11
                SPORTS   1.000000  0.970930  0.985251       172

             micro avg   0.833652  0.833652  0.833652       523
             macro avg   0.606560  0.557328  0.566351       523
          weighted avg   0.792007  0.833652  0.795618       523

[[191   0   0   0   0   0]
 [  9  48   1   0   0   0]
 [ 24   1  30   0   0   0]
 [ 36   0   0   0   0   0]
 [ 11   0   0   0   0   0]
 [  5   0   0   0   0 167]]


  'precision', 'predicted', average, warn_for)


In [8]:
# TF-IDF variant of the multinomial runner with ngram_range=(1, 2)
# (presumably unigrams plus bigrams).
predict_y = run_multinomial_naive_bayes_tfidf(
    train_x, train_y, test_x, ngram_range=(1, 2)
)

# Per-class metrics table followed by the confusion matrix.
# NOTE(review): target_names relies on the dict's value order matching the
# label order of the report — TODO confirm.
print(
    classification_report(
        test_y,
        predict_y,
        digits=6,
        target_names=topic_code_to_topic_dict.values(),
    )
)
print(confusion_matrix(test_y, predict_y))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.636667  1.000000  0.778004       191
  ECONOMIC PERFORMANCE   1.000000  0.620690  0.765957        58
             ELECTIONS   0.958333  0.418182  0.582278        55
                HEALTH   0.000000  0.000000  0.000000        36
              RELIGION   0.000000  0.000000  0.000000        11
                SPORTS   1.000000  0.947674  0.973134       172

             micro avg   0.789675  0.789675  0.789675       523
             macro avg   0.599167  0.497758  0.516562       523
          weighted avg   0.773062  0.789675  0.750342       523

[[191   0   0   0   0   0]
 [ 21  36   1   0   0   0]
 [ 32   0  23   0   0   0]
 [ 36   0   0   0   0   0]
 [ 11   0   0   0   0   0]
 [  9   0   0   0   0 163]]


  'precision', 'predicted', average, warn_for)
