# Intro To ML Project

### Comparing the Performance of Various Classification Algorithms for Classifying Text into Categories
___

The purpose of this project is to compare the performance of various classification algorithms for classifying text into categories. The classification algorithms include:

- Multinomial Naive Bayes
- Logistic Regression
- Decision Tree
- Perceptron
- Support Vector Machines

In [0]:
# Imports
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from pprint import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import Perceptron
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import learning_curve
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [32]:
# Load the twenty_train dataset containing the newsgroup classes
categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',]
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories = categories, remove=('headers', 'footers', 'quotes'))

pprint(list(twenty_train.target_names))

['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']


#Extract Features from Text

In [33]:
# Preprocess, filter and remove stopword with Count Vectorizer
# And count occurencies of each word

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2389, 21540)

In [34]:
# Use frequencies instead of occurencies as longer documents have higher occurencies than shorter documents
tf_transformer = TfidfTransformer()
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(2389, 21540)

Let's define some useful functions

In [0]:

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [0]:
def plot_auc_curve(y_test, predicted):
  
  # Compute ROC curve and ROC AUC for each class
  n_classes = 4
  fpr = dict()
  tpr = dict()
  roc_auc = dict()
  all_y_test_i = np.array([])
  all_y_predict_proba = np.array([])
  for i in range(n_classes):
      y_test_i = map(lambda x: 1 if x == i else 0, y_test)
      all_y_test_i = np.concatenate([all_y_test_i, y_test_i])
      all_y_predict_proba = np.concatenate([all_y_predict_proba, y_predict_proba[:, i]])
      fpr[i], tpr[i], _ = roc_curve(y_test_i, y_predict_proba[:, i])
      roc_auc[i] = auc(fpr[i], tpr[i])

  # Compute micro-average ROC curve and ROC area
  fpr["average"], tpr["average"], _ = roc_curve(all_y_test_i, all_y_predict_proba)
  roc_auc["average"] = auc(fpr["average"], tpr["average"])


  # Plot average ROC Curve
  plt.figure()
  plt.plot(fpr["average"], tpr["average"],
           label='Average ROC curve (area = {0:0.2f})'
                 ''.format(roc_auc["average"]),
           color='deeppink', linestyle=':', linewidth=4)

  # Plot each individual ROC curve
  for i in range(n_classes):
      plt.plot(fpr[i], tpr[i], lw=2,
               label='ROC curve of class {0} (area = {1:0.2f})'
               ''.format(i, roc_auc[i]))

  plt.plot([0, 1], [0, 1], 'k--', lw=2)
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('Some extension of Receiver operating characteristic to multi-class')
  plt.legend(loc="lower right")
  plt.show()

# Training Classifiers

We start with a Naive Bayes Classifier

In [0]:
naive_bayes_clf = MultinomialNB().fit(X_train_tf, twenty_train.target)

#plot_learning_curve(naive_bayes_clf, "Naive Bayes Learning Curve", X_train_tf, twenty_train.target)

In [38]:
# Let's see the performance of the Naive Bayes Classifier on our subset
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
test = twenty_test.data
X_test_counts = count_vect.transform(test)
X_test_tf = tf_transformer.transform(X_test_counts)
predicted = naive_bayes_clf.predict(X_test_tf)

print("Test accuracy:" + str(np.mean(predicted == twenty_test.target))) 
print("Confusion Matrix:\n")
print(confusion_matrix(twenty_test.target, predicted))  
print("Classification Report:\n")
print(classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))  


Test accuracy:0.03717472118959108
Confusion Matrix:

[[147  59  64  49   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [232  87  33  37   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [281  61  20  32   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [302  53  11  26   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [242  70  21  52   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [257  77  18  43   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [186 103  20  81   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [382   6   2   6   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 24 368   4   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  3   1 358  35   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   2   2 395   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [287  49  2

  'precision', 'predicted', average, warn_for)


Now, We try a Logistic Regression Model with Thresholding for classifying text

In [0]:
log_regression_clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train_tf, twenty_train.target)

In [40]:
# Let's see the performance of the Logistic Regression Classifier on our subset
test = twenty_test.data
X_test_counts = count_vect.transform(test)
X_test_tf = tf_transformer.transform(X_test_counts)
predicted = log_regression_clf.predict(X_test_tf)

print("Test accuracy:" + str(np.mean(predicted == twenty_test.target))) 
print("Confusion Matrix:\n")
print(confusion_matrix(twenty_test.target, predicted))  
print("Classification Report:\n")
print(classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names)) 

Test accuracy:0.03983005841741901
Confusion Matrix:

[[117 138  53  11   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [195 156  33   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [199 173  19   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [266 107  11   8   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [222 128  26   9   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [216 164  14   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [155 192  28  15   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [372  17   7   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 17 377   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  5   5 363  24   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  1   7   8 383   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [234 116  3

  'precision', 'predicted', average, warn_for)


Now Let's try using a Perceptron for Classification

In [41]:
perceptron_clf = Perceptron(tol=1e-6, random_state=0)
perceptron_clf.fit(X_train_tf, twenty_train.target)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=None, n_iter=None, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=1e-06, verbose=0, warm_start=False)

In [42]:
# Let's see the performance of the Perceptron Classifier on our test
test = twenty_test.data
X_test_counts = count_vect.transform(test)
X_test_tf = tf_transformer.transform(X_test_counts)
predicted = perceptron_clf.predict(X_test_tf)

print("Test accuracy:" + str(np.mean(predicted == twenty_test.target))) 
print("Confusion Matrix:\n")
print(confusion_matrix(twenty_test.target, predicted))  
print("Classification Report:\n")
print(classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names)) 

Test accuracy:0.03903345724907063
Confusion Matrix:

[[128  81  83  27   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [203 100  71  15   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [216 113  57   8   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [280  65  38   9   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [254  65  47  19   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [246 106  36   7   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [202 122  41  25   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [365  17  12   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 35 346  15   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 10  10 359  18   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 12   4  14 369   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [250  62  5

  'precision', 'predicted', average, warn_for)


Finally, we'll use a decision tree to classify news group

In [43]:
decision_tree_clf = AdaBoostClassifier(DecisionTreeClassifier(),
                          n_estimators=300, random_state=0)
decision_tree_clf.fit(X_train_tf, twenty_train.target)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=300, random_state=0)

In [44]:
# Let's see the performance of the Decision Tree Classifier on our test
test = twenty_test.data
X_test_counts = count_vect.transform(test)
X_test_tf = tf_transformer.transform(X_test_counts)
predicted = decision_tree_clf.predict(X_test_tf)

print("Test accuracy:" + str(np.mean(predicted == twenty_test.target))) 
print("Confusion Matrix:\n")
print(confusion_matrix(twenty_test.target, predicted))  
print("Classification Report:\n")
print(classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names)) 

print(twenty_test.target)

Test accuracy:0.04899097185342539
Confusion Matrix:

[[163  89  53  14   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [180 140  46  23   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [188 147  45  14   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [200 131  40  21   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [197 124  41  23   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [184 152  30  29   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [103 159  61  67   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [284  81  24   7   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 71 308  17   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 72  54 216  55   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 62  36 101 200   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [165 160  5

  'precision', 'predicted', average, warn_for)


In [28]:
n_classes = 20
plot_auc_curve(twenty_test.target, predicted)

ValueError: ignored