In [657]:
import pandas as pd
import numpy as np
import math 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
data_train = open("data_train.txt")
label_train = open("labels_train_original.txt")
data_test = open("data_valid.txt")
label_test = open("labels_valid_original.txt")

# 1. Text Preprocessing 
In order to perform machine learning on text documents, we first need to turn the text content into numerical feature vectors.

## 1.1 replace text labels in training set to numerical labels

In [639]:
replace = {
    'News':0,
    'Classifieds':2,
    'Opinion':1,
    'Features':3
}
Y_train = [replace[y.rstrip('\n')] for y in label_train]

## 1.2 Tokenizing the artical in training set
CountVectorizer() builds a dictionary of features and transforms documents to feature vectors.
The index value of a word in the vocabulary is linked to its frequency in the whole training corpus.

In [658]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data_train)
X_train_counts.shape

(2000, 38749)

In [664]:
count_vect.vocabulary_

{'the': 34753,
 'sign': 31683,
 'in': 17827,
 'front': 14711,
 'of': 24546,
 'steepled': 33096,
 'church': 7538,
 'read': 28297,
 'sunday': 33765,
 'sermon': 31130,
 'does': 11269,
 'god': 15485,
 'prefer': 27041,
 'particular': 25538,
 'sports': 32753,
 'teams': 34484,
 'man': 21565,
 'passing': 25583,
 'by': 6331,
 'gives': 15353,
 'befuddled': 4704,
 'look': 21024,
 'scene': 30567,
 'is': 18771,
 'fact': 13357,
 'cartoon': 6777,
 'current': 9609,
 'new': 23894,
 'yorker': 38555,
 'but': 6300,
 'question': 27881,
 'good': 15543,
 'one': 24668,
 'since': 31776,
 'it': 18817,
 'has': 16425,
 'more': 23161,
 'or': 24779,
 'less': 20533,
 'been': 4682,
 'news': 23915,
 'guy': 16071,
 'checking': 7280,
 'out': 24952,
 'might': 22600,
 'very': 36949,
 'well': 37675,
 'have': 16483,
 'aware': 4041,
 'strange': 33335,
 'hesitates': 16800,
 'to': 35101,
 'say': 30506,
 'celestial': 6992,
 'goings': 15504,
 'on': 24663,
 'york': 38554,
 'and': 2963,
 'chicago': 7366,
 'recent': 28428,
 'weeks'

## 1.3 Term Frequency times Inverse Document Frequency
The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less informative than features that occur in a small fraction of the training corpus.

In [641]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2000, 38749)

In [680]:
for line in X_train_tfidf:
    print(line)

  (0, 34753)	0.407993905251
  (0, 31683)	0.0732724785575
  (0, 17827)	0.184234954295
  (0, 14711)	0.0185728253255
  (0, 24546)	0.0669966840794
  (0, 33096)	0.0460469793258
  (0, 7538)	0.0238821268327
  (0, 28297)	0.0229293704348
  (0, 33765)	0.0408476413925
  (0, 31130)	0.0436860990701
  (0, 11269)	0.0157779654078
  (0, 15485)	0.0858116620325
  (0, 27041)	0.031578244706
  (0, 25538)	0.0277901028689
  (0, 32753)	0.0778767575344
  (0, 34484)	0.023040281353
  (0, 21565)	0.0188387458353
  (0, 25583)	0.0259589191781
  (0, 6331)	0.0332996145124
  (0, 15353)	0.0260520835201
  (0, 4704)	0.0460469793258
  (0, 21024)	0.0195007958791
  (0, 30567)	0.0265416698428
  (0, 18771)	0.0375225600159
  (0, 13357)	0.0406352593182
  :	:
  (0, 4902)	0.0436860990701
  (0, 16407)	0.033253315905
  (0, 25253)	0.0436860990701
  (0, 20762)	0.0335861291803
  (0, 4700)	0.013782156943
  (0, 2447)	0.0177500064851
  (0, 5599)	0.0351481357171
  (0, 34307)	0.0221108113409
  (0, 4746)	0.015634973708
  (0, 28281)	0.01379358

  (0, 34753)	0.184435617457
  (0, 17827)	0.0577891073972
  (0, 6331)	0.0365578970803
  (0, 16483)	0.0412433478157
  (0, 35101)	0.0280359555128
  (0, 14325)	0.0968954807793
  (0, 30243)	0.0463141900748
  (0, 34747)	0.0328361115782
  (0, 14708)	0.0796870009304
  (0, 5039)	0.093396290069
  (0, 2207)	0.0887646584603
  (0, 2923)	0.0394666564283
  (0, 38245)	0.0946225857048
  (0, 36323)	0.071041222311
  (0, 33020)	0.0702957026371
  (0, 22643)	0.097097191928
  (0, 19280)	0.0921573862008
  (0, 24547)	0.0682490495474
  (0, 31152)	0.0822547537217
  (0, 2859)	0.0664172015426
  (0, 10069)	0.135660727556
  (0, 8592)	0.100934170875
  (0, 32647)	0.106038179539
  (0, 36352)	0.117937291407
  (0, 9651)	0.0977338102089
  :	:
  (0, 14360)	0.101425907442
  (0, 592)	0.0835786477412
  (0, 14339)	0.113196621196
  (0, 37815)	0.086040399451
  (0, 27589)	0.146028273087
  (0, 31028)	0.217069495398
  (0, 3330)	0.124979441727
  (0, 15905)	0.106337241585
  (0, 17275)	0.081220340214
  (0, 35653)	0.126922364397
  (0, 

  (0, 34753)	0.017884374659
  (0, 17827)	0.0196129533846
  (0, 38521)	0.0518012392104
  (0, 2447)	0.105817502773
  (0, 1368)	0.178269182502
  (0, 25900)	0.0513035422234
  (0, 9885)	0.0451253644064
  (0, 49)	0.0946412447866
  (0, 23413)	0.0981822170706
  (0, 13649)	0.0577677356776
  (0, 27214)	0.074584555591
  (0, 38493)	0.0362423027451
  (0, 23109)	0.111348234303
  (0, 845)	0.0538060800845
  (0, 28212)	0.160106200137
  (0, 14806)	0.0832174717536
  (0, 34559)	0.0763092635627
  (0, 32128)	0.0794219034864
  (0, 5054)	0.184166715529
  (0, 1458)	0.23102535315
  (0, 940)	0.12522536434
  (0, 38581)	0.346538029725
  (0, 18115)	0.115512676575
  (0, 24255)	0.0794219034864
  (0, 5479)	0.189732527392
  :	:
  (0, 30293)	0.106157857017
  (0, 4401)	0.104768636334
  (0, 10922)	0.101164848947
  (0, 947)	0.13021837241
  (0, 1261)	0.12522536434
  (0, 955)	0.12522536434
  (0, 1342)	0.13021837241
  (0, 1048)	0.12522536434
  (0, 1741)	0.118188114713
  (0, 984)	0.12522536434
  (0, 999)	0.13021837241
  (0, 14

  (0, 34753)	0.197463322042
  (0, 17827)	0.0556839636163
  (0, 24546)	0.0801793056312
  (0, 28297)	0.0215608442611
  (0, 11269)	0.0148362666948
  (0, 21565)	0.0177143662179
  (0, 6331)	0.0313121463364
  (0, 30567)	0.0249575456743
  (0, 18771)	0.0564528764594
  (0, 23894)	0.0093058804467
  (0, 6300)	0.0514486756178
  (0, 15543)	0.0146723731404
  (0, 24668)	0.0196995856487
  (0, 18817)	0.0631999616574
  (0, 16425)	0.0183397585412
  (0, 23161)	0.0099232855497
  (0, 24779)	0.0101370374691
  (0, 23915)	0.0164620113148
  (0, 16071)	0.0434360714069
  (0, 24952)	0.0224359098313
  (0, 35101)	0.0720390951185
  (0, 30506)	0.0152934491291
  (0, 24663)	0.0791691431981
  (0, 2963)	0.143500910802
  (0, 37966)	0.0205712262593
  :	:
  (0, 35942)	0.0432986920753
  (0, 5257)	0.0410787195882
  (0, 14193)	0.0432986920753
  (0, 34495)	0.0432986920753
  (0, 10877)	0.0395036240421
  (0, 14210)	0.0432986920753
  (0, 11657)	0.0432986920753
  (0, 2038)	0.0432986920753
  (0, 35385)	0.0432986920753
  (0, 36563)	0.

## 1.4 Do the same things to the test set

In [642]:
Y_test = [replace[y.rstrip('\n')] for y in label_test]

In [643]:
X_test = list()
for line in data_test:
    line = str(line.rstrip())
    X_test.append(line)

In [644]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# 2. Classifications

## 2.1 Naive Bayes

In [675]:
from sklearn.naive_bayes import MultinomialNB
clf_NB = MultinomialNB(alpha=0.6).fit(X_train_tfidf, Y_train)
clf_NB

MultinomialNB(alpha=0.6, class_prior=None, fit_prior=True)

In [676]:
NB_train_predicted = clf_NB.predict(X_train_tfidf)

In [677]:
NB_train_CM = confusion_matrix(Y_train, NB_train_predicted)
print('Confusion Matrix of Training Set:\n',NB_train_CM)
print('Accuracy Rate of Training Set:\n',
      (NB_train_CM[0][0]+NB_train_CM[1][1]+NB_train_CM[2][2]+NB_train_CM[3][3])/2000)

Confusion Matrix of Training Set:
 [[466  13   9  16]
 [  9 465   4   7]
 [ 19   7 460  37]
 [ 22  26   1 439]]
Accuracy Rate of Training Set:
 0.915


In [678]:
NB_test_predicted = clf_NB.predict(X_test_tfidf)

In [679]:
NB_test_CM = confusion_matrix(Y_test, NB_test_predicted)
print('Confusion Matrix of Test Set:\n',NB_test_CM)
print('Accuracy Rate of Test Set:\n',(NB_test_CM[0][0]+NB_test_CM[1][1]+NB_test_CM[2][2]+NB_test_CM[3][3])/2000)

Confusion Matrix of Test Set:
 [[314  40  65  93]
 [ 25 414  16  52]
 [ 58  22 297  93]
 [ 52  82  80 297]]
Accuracy Rate of Test Set:
 0.661


## 2.2 Logistic Regression 

In [258]:
from sklearn.linear_model import LogisticRegression
clf_LR = LogisticRegression(random_state=0,C=75).fit(X_train_tfidf, Y_train)
clf_LR

LogisticRegression(C=75, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [259]:
LR_train_predicted = clf_LR.predict(X_train_tfidf)

In [260]:
LR_train_CM = confusion_matrix(Y_train, LR_train_predicted)
print('Confusion Matrix of Training Set:\n',LR_train_CM)
print('Accuracy Rate of Training Set:\n',
      (LR_train_CM[0][0]+LR_train_CM[1][1]+LR_train_CM[2][2]+LR_train_CM[3][3])/2000)

Confusion Matrix of Training Set:
 [[504   0   0   0]
 [  0 485   0   0]
 [  0   0 523   0]
 [  0   0   0 488]]
Accuracy Rate of Training Set:
 1.0


In [261]:
LR_test_predicted = clf_LR.predict(X_test_tfidf)

In [262]:
LR_test_CM = confusion_matrix(Y_test, LR_test_predicted)
print('Confusion Matrix of Test Set:\n',LR_test_CM)
print('Accuracy Rate of Test Set:\n',(LR_test_CM[0][0]+LR_test_CM[1][1]+LR_test_CM[2][2]+LR_test_CM[3][3])/2000)

Confusion Matrix of Test Set:
 [[328  41  67  76]
 [ 16 438  20  33]
 [ 41  14 350  65]
 [ 60  77  71 303]]
Accuracy Rate of Test Set:
 0.7095


In [178]:
clf_LR.predict_proba(X_train_tfidf)

array([[ 0.68663022,  0.05472722,  0.07699753,  0.18164503],
       [ 0.19221658,  0.02844145,  0.42615145,  0.35319053],
       [ 0.10916873,  0.57675964,  0.14141191,  0.17265972],
       ..., 
       [ 0.69857593,  0.06051578,  0.11201818,  0.12889011],
       [ 0.29064977,  0.0860766 ,  0.39240516,  0.23086847],
       [ 0.69404896,  0.06398381,  0.09592902,  0.14603821]])

## 2.3 SVM ( SVC with Linear Kernel)

In [497]:
from sklearn import svm
clf_svm1 = svm.SVC(decision_function_shape='ovo',kernel='linear',C=2).fit(X_train_tfidf, Y_train)
clf_svm1

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [498]:
svm1_train_predicted = clf_svm1.predict(X_train_tfidf)

In [499]:
svm1_train_CM = confusion_matrix(Y_train, svm1_train_predicted)
print('Confusion Matrix of Training Set:\n',svm1_train_CM)
print('Accuracy Rate of Training Set:\n',
      (svm1_train_CM[0][0]+svm1_train_CM[1][1]+svm1_train_CM[2][2]+svm1_train_CM[3][3])/2000)

Confusion Matrix of Training Set:
 [[503   0   1   0]
 [  0 485   0   0]
 [  0   0 523   0]
 [  3   0   0 485]]
Accuracy Rate of Training Set:
 0.998


In [500]:
svm1_test_predicted = clf_svm1.predict(X_test_tfidf)

In [501]:
svm1_test_CM = confusion_matrix(Y_test, svm1_test_predicted)
print('Confusion Matrix of Test Set:\n',svm1_test_CM)
print('Accuracy Rate of Test Set:\n',(svm1_test_CM[0][0]+svm1_test_CM[1][1]+svm1_test_CM[2][2]+svm1_test_CM[3][3])/2000)

Confusion Matrix of Test Set:
 [[324  37  67  84]
 [ 15 435  20  37]
 [ 44  13 343  70]
 [ 57  69  70 315]]
Accuracy Rate of Test Set:
 0.7085


## 2.4 SVM ( Linear SVC)

### Similar to SVC with parameter kernel=’linear’, but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

In [532]:
clf_svm2 = svm.LinearSVC().fit(X_train_tfidf, Y_train)
clf_svm2

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [533]:
svm2_train_predicted = clf_svm2.predict(X_train_tfidf)

In [534]:
svm2_train_CM = confusion_matrix(Y_train, svm2_train_predicted)
print('Confusion Matrix of Training Set:\n',svm2_train_CM)
print('Accuracy Rate of Training Set:\n',
      (svm2_train_CM[0][0]+svm2_train_CM[1][1]+svm2_train_CM[2][2]+svm2_train_CM[3][3])/2000)

Confusion Matrix of Training Set:
 [[504   0   0   0]
 [  0 485   0   0]
 [  0   0 523   0]
 [  1   0   0 487]]
Accuracy Rate of Training Set:
 0.9995


In [535]:
svm2_test_predicted = clf_svm2.predict(X_test_tfidf)

In [536]:
svm2_test_CM = confusion_matrix(Y_test, svm2_test_predicted)
print('Confusion Matrix of Test Set:\n',svm2_test_CM)
print('Accuracy Rate of Test Set:\n',(svm2_test_CM[0][0]+svm2_test_CM[1][1]+svm2_test_CM[2][2]+svm2_test_CM[3][3])/2000)

Confusion Matrix of Test Set:
 [[331  43  65  73]
 [ 15 443  19  30]
 [ 42  17 341  70]
 [ 55  81  71 304]]
Accuracy Rate of Test Set:
 0.7095


## 2.5 KNN

In [633]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=15,weights='distance').fit(X_train_tfidf, Y_train)

In [634]:
neigh_predicted = neigh.predict(X_train_tfidf)

In [635]:
neigh_train_CM = confusion_matrix(Y_train, neigh_predicted)
print('Confusion Matrix of Training Set:\n',neigh_train_CM)
print('Accuracy Rate of Training Set:\n',
      (neigh_train_CM[0][0]+neigh_train_CM[1][1]+neigh_train_CM[2][2]+neigh_train_CM[3][3])/2000)

Confusion Matrix of Training Set:
 [[504   0   0   0]
 [  0 485   0   0]
 [  0   0 523   0]
 [  0   0   0 488]]
Accuracy Rate of Training Set:
 1.0


In [636]:
neigh_test_predicted = neigh.predict(X_test_tfidf)

In [637]:
neigh_test_CM = confusion_matrix(Y_test, neigh_test_predicted)
print('Confusion Matrix of Test Set:\n',neigh_test_CM)
print('Accuracy Rate of Test Set:\n',
      (neigh_test_CM[0][0]+neigh_test_CM[1][1]+neigh_test_CM[2][2]+neigh_test_CM[3][3])/2000)

Confusion Matrix of Test Set:
 [[299  31 135  47]
 [ 38 274 127  68]
 [ 43  19 365  43]
 [ 78  36 177 220]]
Accuracy Rate of Test Set:
 0.579
