# HW 07

## Task 01: Import Data/EDA

In [1]:
# read in the training data

# the data set includes four columns: PhraseId, SentenceId, Phrase, Sentiment
# In this data set a sentence is further split into phrases 
# in order to build a sentiment classification model
# that can not only predict sentiment of sentences but also shorter phrases

# A data example:
# PhraseId SentenceId Phrase Sentiment
# 1 1 A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .1

# the Phrase column includes the training examples
# the Sentiment column includes the training labels
# "0" for very negative
# "1" for negative
# "2" for neutral
# "3" for positive
# "4" for very positive

import numpy as np
import pandas as p
train=p.read_csv("D:\Darrell\Desktop\Syracuse\Summer_19\IST_736_Text_Mining\Week06\kaggle-sentiment\kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values

In [2]:
# check the sklearn documentation for train_test_split
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# "test_size" : float, int, None, optional
# If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. 
# If int, represents the absolute number of test samples. 
# If None, the value is set to the complement of the train size. 
# By default, the value is set to 0.25. The default will change in version 0.21. It will remain 0.25 only if train_size is unspecified, otherwise it will complement the specified train_size.    

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train[0]) # printing out 1st example in X_train
print(y_train[0])
print(X_test[0])
print(y_test[0])

(93636,) (93636,) (62424,) (62424,)
almost in a class with that of Wilde
3
escape movie
2


In [3]:
# Check how many training examples in each category
# this is important to see whether the data set is balanced or skewed

unique, counts = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)))

[[    0     1     2     3     4]
 [ 4141 16449 47718 19859  5469]]


In [4]:
# Print out the category distribution in the test data set. 
#Is the test data set's category distribution similar to the training data set's?

unique, counts = np.unique(y_test, return_counts=True)
print(np.asarray((unique, counts)))

[[    0     1     2     3     4]
 [ 2931 10824 31864 13068  3737]]


## Task 01: Build a unigram MNB and SVM model

In [5]:
# sklearn contains two vectorizers

# CountVectorizer can give you Boolean or TF vectors
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

# TfidfVectorizer can give you TF or TFIDF vectors
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# Read the sklearn documentation to understand all vectorization options

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# several commonly used vectorizer setting

#  unigram boolean vectorizer, set minimum document frequency to 5
# generic encoding is utf8, for this exercise to capture all words/tokens latin-1 was chosen
unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')

#  unigram term frequency vectorizer, set minimum document frequency to 5
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')

#  unigram and bigram term frequency vectorizer, set minimum document frequency to 5
gram12_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')

#  unigram tfidf vectorizer, set minimum document frequency to 5
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')


In [6]:
# The vectorizer can do "fit" and "transform"
# fit is a process to collect unique tokens into the vocabulary
# transform is a process to convert each document to vector based on the vocabulary
# These two processes can be done together using fit_transform(), or used individually: fit() or transform()

# fit vocabulary in training documents and transform the training documents into vectors
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)

# check the content of a document vector
print(X_train_vec.shape)
print(X_train_vec[0].toarray())

# check the size of the constructed vocabulary
print(len(unigram_count_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_count_vectorizer.vocabulary_.items())[:10])

(93636, 11967)
[[0 0 0 ... 0 0 0]]
11967
[('class', 1858), ('wilde', 11742), ('derring', 2802), ('chilling', 1764), ('affecting', 313), ('meanspirited', 6557), ('personal', 7662), ('low', 6296), ('involved', 5602), ('worth', 11868)]


In [7]:
# use the vocabulary constructed from the training data to vectorize the test data. 
# Therefore, use "transform" only, not "fit_transform", 
# otherwise "fit" would generate a new vocabulary from the test data
# any vocabulary NOT in the training data will be ignored in the testing data

X_test_vec = unigram_count_vectorizer.transform(X_test)

# print out #examples and #features in the test set
print(X_test_vec.shape)

(62424, 11967)


In [8]:
# Your code starts here
unigram_bool_vectorizer1 = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')

X_train_vec1 = unigram_bool_vectorizer1.fit_transform(X_train)
X_test_vec1 = unigram_bool_vectorizer1.transform(X_test)

# check the content of a document vector
print(X_train_vec1.shape)
print(X_test_vec1.shape)

(93636, 11967)
(62424, 11967)


In [9]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf= MultinomialNB()

# use the training data to train the MNB model
nb_clf.fit(X_train_vec,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# Most negative/positive words in the MNB model
feature_ranks1 = sorted(zip(nb_clf.feature_log_prob_[4] / nb_clf.feature_log_prob_[0], unigram_count_vectorizer.get_feature_names()))
very_negative_features1 = feature_ranks1[-10:]
print('Top 10 most negative words in MNB:\n',very_negative_features1)

very_positive_features1 = feature_ranks1[:10]
print('Top 10 most positive words in MNB:\n', very_positive_features1)


Top 10 most negative words in MNB:
 [(1.5210602737939296, 'waste'), (1.523562078883295, 'minutes'), (1.533975977581715, 'poorly'), (1.5402375494766518, 'awful'), (1.5463767818167982, 'contrived'), (1.5463767818167982, 'unfunny'), (1.5698200666534547, 'worse'), (1.5969581857419417, 'stupid'), (1.7858251634181175, 'worst'), (1.857793692114055, 'bad')]
Top 10 most positive words in MNB:
 [(0.5958791031947216, 'moving'), (0.604612353536404, 'beautiful'), (0.6156827739590933, 'beautifully'), (0.6217403853248004, 'powerful'), (0.6249198570600717, 'solid'), (0.6298970899440408, 'touching'), (0.6333656656276786, 'gorgeous'), (0.6369651614685079, 'excellent'), (0.6420878254266654, 'best'), (0.6528979981733687, 'wonderful')]


In [11]:
# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
y_pred = nb_clf.fit(X_train_vec, y_train).predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4])
print(cm)
# first row/column is very negative
# last row/column is very positive
#row is predicted (x-axis)
#column is actual (y-axis)

[[  742  1276   797   105    11]
 [  614  4126  5397   655    32]
 [  248  2385 25756  3239   236]
 [   19   456  5570  6253   770]
 [    1    53   729  1977   977]]


In [12]:
# print classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
target_names = ['0','1','2','3','4']
print(classification_report(y_test, y_pred, target_names=target_names))

[0.45689655 0.49734812 0.67337708 0.51132554 0.482231  ]
[0.25315592 0.38118995 0.80831032 0.47849709 0.26143966]
              precision    recall  f1-score   support

           0       0.46      0.25      0.33      2931
           1       0.50      0.38      0.43     10824
           2       0.67      0.81      0.73     31864
           3       0.51      0.48      0.49     13068
           4       0.48      0.26      0.34      3737

   micro avg       0.61      0.61      0.61     62424
   macro avg       0.52      0.44      0.47     62424
weighted avg       0.59      0.61      0.59     62424



In [13]:
# import the LinearSVC module
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
svm_clf = LinearSVC(C=1)

# use the training data to train the model
svm_clf.fit(X_train_vec,y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [14]:
## Linear SVC also ranks all features based on their contribution to distinguish the two concepts in each binary classifier
## For category "0" (very negative), get all features and their weights and sort them in increasing order
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_count_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
very_negative_10 = feature_ranks[-10:]
print("Top Very negative words in SVM (SVC) model")
for i in range(0, len(very_negative_10)):
    print(very_negative_10[i])
print()

# Output most positive words
feature_ranks2 = sorted(zip(svm_clf.coef_[4], unigram_count_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
very_positive_10 = feature_ranks2[-10:]
print("Top Very positive words in SVM (SVC) model")
for i in range(0, len(very_positive_10)):
    print(very_positive_10[i])
print()

Top Very negative words in SVM (SVC) model
(1.6216100498637946, 'cesspool')
(1.6484881169807253, 'disappointment')
(1.6592495317420688, 'pompous')
(1.6683696811106015, 'stinks')
(1.692774017797078, 'distasteful')
(1.6955904814661282, 'unwatchable')
(1.7526397947043106, 'unbearable')
(1.7873567368832495, 'stinker')
(1.8228705762137276, 'disgusting')
(1.823305541733355, 'worthless')

Top Very positive words in SVM (SVC) model
(1.5635285560162435, 'stunning')
(1.6005795112206929, 'astonish')
(1.6108129117317336, 'refreshes')
(1.6148904549660266, 'flawless')
(1.6474646629644183, 'phenomenal')
(1.6506424842957124, 'masterful')
(1.6776155730733564, 'masterfully')
(1.8781421347349103, 'glorious')
(1.980188264630256, 'miraculous')
(2.0143252025665195, 'perfection')



In [15]:
# print confusion matrix and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4])
print(cm)
print()

from sklearn.metrics import classification_report
target_names = ['0','1','2','3','4']
print(classification_report(y_test, y_pred, target_names=target_names))

[[  918  1221   697    82    13]
 [  701  4080  5504   514    25]
 [  195  2106 27081  2310   172]
 [   34   396  6048  5533  1057]
 [    3    51   590  1772  1321]]

              precision    recall  f1-score   support

           0       0.50      0.31      0.38      2931
           1       0.52      0.38      0.44     10824
           2       0.68      0.85      0.75     31864
           3       0.54      0.42      0.48     13068
           4       0.51      0.35      0.42      3737

   micro avg       0.62      0.62      0.62     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.60      0.62      0.60     62424



In [16]:
# test the classifier on the test data set, print accuracy score
a = nb_clf.score(X_test_vec,y_test)
b = svm_clf.score(X_test_vec,y_test)
print('MNB accuracy: ', a,'\nSVM (SVC) accuracy:', b)

MNB accuracy:  0.606401384083045 
SVM (SVC) accuracy: 0.6236864026656415


## Task 02: MNB and SVM model w/both unigram and bigram

In [17]:
#gram12_count_vectorizer

X_train_vec2 = gram12_count_vectorizer.fit_transform(X_train)

# check the content of a document vector
print(X_train_vec2.shape)
print(X_train_vec2[0].toarray())

# check the size of the constructed vocabulary
print(len(gram12_count_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(gram12_count_vectorizer.vocabulary_.items())[:10])

(93636, 34579)
[[0 0 0 ... 0 0 0]]
34579
[('class', 5020), ('wilde', 33787), ('derring', 7552), ('chilling', 4755), ('affecting', 825), ('meanspirited', 19199), ('personal', 22506), ('low', 18281), ('involved', 15905), ('worth', 34261)]


In [18]:
X_test_vec2 = gram12_count_vectorizer.transform(X_test)

# print out #examples and #features in the test set
print(X_test_vec2.shape)

(62424, 34579)


In [19]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf2= MultinomialNB()

# use the training data to train the MNB model
nb_clf2.fit(X_train_vec2,y_train)

# Most negative/positive words in the MNB model
feature_ranks2 = sorted(zip(nb_clf2.feature_log_prob_[4] / nb_clf2.feature_log_prob_[0], gram12_count_vectorizer.get_feature_names()))
very_negative_features2 = feature_ranks2[-10:]
print('Top 10 most negative words in MNB with bigrams:\n',very_negative_features2)

very_positive_features2 = feature_ranks2[:10]
print('Top 10 most positive words in MNB with bigrams:\n', very_positive_features2)


Top 10 most negative words in MNB with bigrams:
 [(1.4625998317179831, 'minutes'), (1.4688024469565653, 'waste'), (1.480147801869689, 'poorly'), (1.4856418547778594, 'awful'), (1.491024633260782, 'contrived'), (1.491024633260782, 'unfunny'), (1.5115435903246601, 'worse'), (1.5352259970408633, 'stupid'), (1.6979772129256334, 'worst'), (1.7460508461133102, 'bad')]
Top 10 most positive words in MNB with bigrams:
 [(0.6169544589100915, 'moving'), (0.625149851129605, 'beautiful'), (0.6355384732175152, 'beautifully'), (0.6412230120905631, 'powerful'), (0.6442066684501019, 'solid'), (0.6488773665357048, 'touching'), (0.6521323216850844, 'gorgeous'), (0.6555101339604645, 'excellent'), (0.6655311672722624, 'best'), (0.6704617087873739, 'wonderful')]


In [20]:
# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
y_pred2 = nb_clf2.fit(X_train_vec2, y_train).predict(X_test_vec2)
cm2=confusion_matrix(y_test, y_pred2, labels=[0,1,2,3,4])
print(cm2)
# first row/column is very negative
# last row/column is very positive
#row is predicted (x-axis)
#column is actual (y-axis)

[[  867  1253   725    69    17]
 [  786  4440  4943   609    46]
 [  459  2961 24437  3600   407]
 [   41   513  5082  6375  1057]
 [    6    46   602  1911  1172]]


In [21]:
# print classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred2, average=None))
print(recall_score(y_test, y_pred2, average=None))

from sklearn.metrics import classification_report
target_names = ['0','1','2','3','4']
print(classification_report(y_test, y_pred2, target_names=target_names))

[0.4015748  0.48192771 0.68280757 0.5074021  0.4342349 ]
[0.29580348 0.41019956 0.76691564 0.48783287 0.31362055]
              precision    recall  f1-score   support

           0       0.40      0.30      0.34      2931
           1       0.48      0.41      0.44     10824
           2       0.68      0.77      0.72     31864
           3       0.51      0.49      0.50     13068
           4       0.43      0.31      0.36      3737

   micro avg       0.60      0.60      0.60     62424
   macro avg       0.50      0.45      0.47     62424
weighted avg       0.58      0.60      0.59     62424



In [22]:
# import the LinearSVC module
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
svm_clf2 = LinearSVC(C=1)

# use the training data to train the model
svm_clf2.fit(X_train_vec2,y_train)

## Linear SVC also ranks all features based on their contribution to distinguish the two concepts in each binary classifier
## For category "0" (very negative), get all features and their weights and sort them in increasing order
feature_ranks2 = sorted(zip(svm_clf2.coef_[0], gram12_count_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
very_negative_10 = feature_ranks2[-10:]
print("Top Very negative words/word pairs in SVM (SVC) model")
for i in range(0, len(very_negative_10)):
    print(very_negative_10[i])
print()

# Output most positive words
feature_ranks3 = sorted(zip(svm_clf2.coef_[4], gram12_count_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
very_positive_10 = feature_ranks3[-10:]
print("Top Very positive words/word pairs in SVM (SVC) model")
for i in range(0, len(very_positive_10)):
    print(very_positive_10[i])
print()

Top Very negative words/word pairs in SVM (SVC) model
(1.73943762863937, 'charm laughs')
(1.7467994776878255, 'unappealing')
(1.7584451723161383, 'unwatchable')
(1.7990447935149583, 'unbearable')
(1.8031737089973476, 'waste')
(1.8061699271532758, 'utterly incompetent')
(1.8574085125019189, 'disgusting')
(1.918245634598836, 'distasteful')
(1.9598713384460655, 'pompous')
(1.9628015375368904, 'garbage')

Top Very positive words/word pairs in SVM (SVC) model
(1.651780329327719, 'masterful')
(1.6642264040041055, 'glorious')
(1.6946065602147025, 'flawless')
(1.7364395750683885, 'masterfully')
(1.738277853018173, 'gem')
(1.744519740734703, 'miraculous')
(1.8078519838505431, 'cut rest')
(1.8597827705487435, 'amazing')
(2.022840068620278, 'masterpiece')
(2.1269100417311484, 'perfection')





In [23]:
# print confusion matrix and classification report

from sklearn.metrics import confusion_matrix
y_pred3 = svm_clf2.predict(X_test_vec2)
cm3=confusion_matrix(y_test, y_pred3, labels=[0,1,2,3,4])
print(cm3)
print()

from sklearn.metrics import classification_report
target_names = ['0','1','2','3','4']
print(classification_report(y_test, y_pred3, target_names=target_names))

[[ 1039  1276   542    63    11]
 [  864  4555  4911   457    37]
 [  252  2470 26246  2700   196]
 [   28   358  5383  6034  1265]
 [    5    27   452  1794  1459]]

              precision    recall  f1-score   support

           0       0.47      0.35      0.41      2931
           1       0.52      0.42      0.47     10824
           2       0.70      0.82      0.76     31864
           3       0.55      0.46      0.50     13068
           4       0.49      0.39      0.44      3737

   micro avg       0.63      0.63      0.63     62424
   macro avg       0.55      0.49      0.51     62424
weighted avg       0.61      0.63      0.62     62424



In [24]:
# test the classifier on the test data set, print accuracy score
c = nb_clf2.score(X_test_vec2,y_test)
d = svm_clf2.score(X_test_vec2,y_test)
print('MNB accuracy: ', c,'\nSVM (SVC) accuracy:', d)

MNB accuracy:  0.5973824170190952 
SVM (SVC) accuracy: 0.6300941945405614


## Task 03: Build best SVM model w/full dataset

In [30]:
# Use all data (X,y) and run CV 
# Remove stop words and use tf_idf for bigrams vectorizing

#  unigram and bigram tfidf vectorizer, set minimum document frequency to 5
gram12_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', ngram_range=(1,2), use_idf=True, min_df=5, stop_words='english')

X_train_vec_final = gram12_tfidf_vectorizer.fit_transform(X)


  (0, 46380)	0.16737955507623156
  (0, 22645)	0.26459608479079383
  (0, 19177)	0.2360666593708908
  (0, 10041)	0.2385632896625903
  (0, 27250)	0.2510782596535469
  (0, 27411)	0.22213842464757658
  (0, 26537)	0.25249152438590455
  (0, 38291)	0.18439933178853068
  (0, 10764)	0.25023304756922315
  (0, 10735)	0.2096421847250389
  (0, 49268)	0.12213331756475067
  (0, 19178)	0.26094331928998066
  (0, 10042)	0.26459608479079383
  (0, 27297)	0.25249152438590455
  (0, 26538)	0.26094331928998066
  (0, 38292)	0.254988154677604
  (0, 10765)	0.25249152438590455
  (0, 10741)	0.2428835942727147
  (1, 46380)	0.24219100048341147
  (1, 22645)	0.38285912798782373
  (1, 19177)	0.3415782793052216
  (1, 10041)	0.345190795707888
  (1, 27250)	0.18164970888299772
  (1, 27411)	0.32142472410506034
  (1, 19178)	0.3775737337784156
  :	:
  (156053, 3600)	0.32120156636202024
  (156053, 11848)	0.34680396980792555
  (156053, 15463)	0.34680396980792555
  (156053, 12588)	0.3590745138955437
  (156053, 3601)	0.35555875721

In [42]:
# import the LinearSVC module
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
svm_clf_final = LinearSVC(C=5)

# use the training data to train the model
svm_clf_final.fit(X_train_vec_final,y)

## Linear SVC also ranks all features based on their contribution to distinguish the two concepts in each binary classifier
## For category "0" (very negative), get all features and their weights and sort them in increasing order
feature_ranks_final = sorted(zip(svm_clf_final.coef_[0], gram12_tfidf_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
very_negative_10_final = feature_ranks_final[-10:]
print("Top Very negative words/word pairs in SVM (SVC) model")
for i in range(0, len(very_negative_10_final)):
    print(very_negative_10_final[i])
print()

# Output most positive words
feature_ranks_final2 = sorted(zip(svm_clf_final.coef_[4], gram12_tfidf_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
very_positive_10_final = feature_ranks3[-10:]
print("Top Very positive words/word pairs in SVM (SVC) model")
for i in range(0, len(very_positive_10_final)):
    print(very_positive_10_final[i])
print()

Top Very negative words/word pairs in SVM (SVC) model
(3.2082422129261308, 'pathetic')
(3.2215280338111936, 'basketball teams')
(3.228181220179103, 'utterly incompetent')
(3.271839748666288, 'paper bag')
(3.3126557201266467, 'unbearable')
(3.322777143463517, 'movie contrived')
(3.3531562605706466, 'Skip')
(3.3948479865391876, 'movie titled')
(3.5915721464868517, 'disappointment')
(3.8838824149766777, 'admit walked')

Top Very positive words/word pairs in SVM (SVC) model
(1.651780329327719, 'masterful')
(1.6642264040041055, 'glorious')
(1.6946065602147025, 'flawless')
(1.7364395750683885, 'masterfully')
(1.738277853018173, 'gem')
(1.744519740734703, 'miraculous')
(1.8078519838505431, 'cut rest')
(1.8597827705487435, 'amazing')
(2.022840068620278, 'masterpiece')
(2.1269100417311484, 'perfection')



In [43]:
# print confusion matrix and classification report

from sklearn.metrics import confusion_matrix
y_pred_final = svm_clf_final.predict(X_train_vec_final)
cm_final=confusion_matrix(y, y_pred_final, labels=[0,1,2,3,4])
print(cm_final)

from sklearn.metrics import classification_report
target_names = ['0','1','2','3','4']
print(classification_report(y, y_pred_final, target_names=target_names))

[[ 4439  2104   475    48     6]
 [ 1117 19830  5810   496    20]
 [  317  3789 71367  3920   189]
 [   41   480  6674 24296  1436]
 [    8    32   429  2863  5874]]
              precision    recall  f1-score   support

           0       0.75      0.63      0.68      7072
           1       0.76      0.73      0.74     27273
           2       0.84      0.90      0.87     79582
           3       0.77      0.74      0.75     32927
           4       0.78      0.64      0.70      9206

   micro avg       0.81      0.81      0.81    156060
   macro avg       0.78      0.73      0.75    156060
weighted avg       0.80      0.81      0.80    156060



In [44]:
# test the classifier on the test data set, print accuracy score
y_pred_final = svm_clf_final.predict(X_train_vec_final)

e = svm_clf_final.score(X_train_vec_final,y)
print('SVM using tf_idf accuracy: ', e)

SVM using tf_idf accuracy:  0.8061386646161732


In [45]:
# cross validation
cslist = [] # empty list
for x in range(1, 10):
    i = svm_clf_final.score(X_train_vec_final,y)
    cslist.append(i)
    print("Iteration accuracy:" , i)
avg=sum(cslist)/len(cslist)
print(avg)

Iteration accuracy: 0.8061386646161732
Iteration accuracy: 0.8061386646161732
Iteration accuracy: 0.8061386646161732
Iteration accuracy: 0.8061386646161732
Iteration accuracy: 0.8061386646161732
Iteration accuracy: 0.8061386646161732
Iteration accuracy: 0.8061386646161732
Iteration accuracy: 0.8061386646161732
Iteration accuracy: 0.8061386646161732
0.8061386646161733
