In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
import numpy as np



In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

In [3]:
from sklearn.model_selection import KFold

In [4]:
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

# Plotting config
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
from sklearn.pipeline import Pipeline

In [6]:
from sklearn.metrics import classification_report

# Данные

In [7]:
reviews = pd.read_csv('stemmed.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
reviews.head()

Unnamed: 0,author,author_id,date,movie_id,movie_name,numOfReviews,rating,region,review_id,review_name,review_text,useful,useful_of,category,stemmed
0,carflo,,26 November 2003,tt0111161,The Shawshank Redemption,4857,10.0,Texas,349418,Tied for the best movie I have ever seen,why do i want to write the 234th comment on t...,2275.0,2611.0,1,whi do i want to write the 234th comment on t...
1,Wesley S. Walker,,27 August 2002,tt0111161,The Shawshank Redemption,4857,10.0,"Paducah, KY",349147,Shawshank Redeems Hollywood,"can hollywood, usually creating things for e...",1467.0,1712.0,1,"can hollywood, usual creat thing for enterta..."
2,Justin M (kaspen12),,10 February 2006,tt0111161,The Shawshank Redemption,4857,10.0,"Vancouver, Canada",1288098,A classic piece of unforgettable film-making.,"in its oscar year, shawshank redemption (writ...",896.0,1015.0,1,"in it oscar year, shawshank redempt (written ..."
3,Si Cole,,3 August 2001,tt0111161,The Shawshank Redemption,4857,8.0,,348829,The best story ever told on film,i believe that this film is the best story e...,891.0,1205.0,1,i believ that thi film is the best stori eve...
4,Thomas McFadden (tmac4),,25 July 2001,tt0111161,The Shawshank Redemption,4857,10.0,"Houston, Texas",348821,Powerful,one of my all time favorites. shawshank rede...,706.0,859.0,1,one of my all time favorites. shawshank rede...


In [9]:
reviews.shape

(402356, 15)

In [10]:
S = np.argsort(reviews.rating.value_counts().index)
ratingCounts = reviews.rating.value_counts().values[S]
proba = map(lambda x: 1.0/x, ratingCounts)
print proba
row_proba = map(lambda x: proba[int(x)-1], reviews.rating)
row_proba /= sum(row_proba)
idx = (np.random.choice(reviews.index, size=100000, replace=False, p=row_proba))
reviewsNormed = reviews.loc[idx,:]

[3.0507337014552e-05, 7.265329845975008e-05, 6.415191172696946e-05, 6.89464975179261e-05, 5.0461724781753044e-05, 4.39734400422145e-05, 3.294458720432233e-05, 2.1918769041930604e-05, 1.607665348381081e-05, 6.89731280693042e-06]


In [11]:
BIG_TRAIN_SLICE = slice(None, 90000)
BIG_TEST_SLICE = slice(90000, 100000)
SMALL_TRAIN_SLICE = slice(None, 9000)
SMALL_TEST_SLICE = slice(9000, 10000)

In [12]:
X_train = reviewsNormed.stemmed[SMALL_TRAIN_SLICE]
y_train = reviewsNormed.rating[SMALL_TRAIN_SLICE]

X_test = reviewsNormed.stemmed[SMALL_TEST_SLICE]
y_test = reviewsNormed.rating[SMALL_TEST_SLICE]

In [27]:
min_df_range = np.linspace(0.001, 0.01, 3).round(3)
max_df_range = np.linspace(0.3, 0.6, 3).round(2)
ngram_range = [(1, 1), (1, 2), (1, 3)]
stop_words_range = ['english', None]
max_df_range

array([ 0.3 ,  0.45,  0.6 ])

# LinReg

In [14]:
class MyLinReg(LinearRegression):
    def __init__(self, fit_intercept=True, normalize=False, copy_X=True, n_jobs=1):
        super(MyLinReg, self).__init__(fit_intercept, normalize, copy_X, n_jobs)
    def predict(self, X):
        y_pred = super(MyLinReg, self).predict(X).round()
        return map(lambda x: 10 if x > 10 else 1 if x < 1 else x, y_pred)

In [15]:
linreg_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.001, max_df=0.3)),
                          ('tfidf', TfidfTransformer(use_idf=True)),
                          ('clf', MyLinReg()),
                    ])

In [17]:
linreg_text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.3, max_features=None, min_df=0.001,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...      use_idf=True)), ('clf', MyLinReg(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [18]:
y_pred = linreg_text_clf.predict(X_test)

In [19]:
mean_absolute_error(y_pred, y_test)

1.478

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
            reviewsNormed.stemmed, reviewsNormed.rating, test_size=0.5, random_state=123)

In [28]:
X_train.shape


(50000,)

In [None]:
%%time
linreg_text_clf = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MyLinReg()),
])

parameters = {'vect__ngram_range': [(1, 2)],
           'vect__min_df' : min_df_range,
           'vect__max_df' : max_df_range,
           'vect__stop_words' : ['english'],
           'tfidf__use_idf': (True, False),
           'clf__normalize' : (True, False)
}

gs_clf = GridSearchCV(linreg_text_clf, parameters)
gs_clf = gs_clf.fit(X_train, y_train)
y_pred = gs_clf.predict(X_test)



In [None]:
%time
gs_clf = gs_clf.fit(X_train, y_train)
y_pred = gs_clf.predict(X_test)

In [None]:
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", accuracy
print "MAE:", mae
print "MSE:", mse
print confusion_matrix(y_pred=y_pred, y_true=y_test)

In [None]:

best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))


# KNeighbors

In [12]:
# try:
#     knn_text_clf = Pipeline([('vect', CountVectorizer()),
#                          ('tfidf', TfidfTransformer()),
#                          ('clf', KNeighborsClassifier()),
#     ])

#     parameters = {'vect__ngram_range': ngram_range,
#                    'vect__min_df' : min_df_range,
#                    'vect__max_df' : max_df_range,
#                    'vect__stop_words' : stop_words_range,
#                     'tfidf__use_idf': (True, False),
#                    'clf__n_neighbors': [10, 20, 30, 40]
#      }

#     gs_clf = GridSearchCV(knn_text_clf, parameters, n_jobs=-1)

#     %%time
#     gs_clf = gs_clf.fit(X_train, y_train)

#     y_pred = gs_clf.predict(X_test)
# except:
#     print 'Whoooops :('

In [13]:
# try:
#     best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
#     for param_name in sorted(parameters.keys()):
#         print("%s: %r" % (param_name, best_parameters[param_name]))
# except:
#     print 'Whoops'

In [14]:
 best_nb_parameters = {'vect__ngram_range': (1, 2),
               'vect__min_df' : 0.003,
               'vect__max_df' : 0.6,
               'vect__stop_words' : None,
               'tfidf__use_idf': True,
               'clf__n_neighbors' : 50 
 }

In [22]:
best_nb_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.003, max_df=0.6)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', KNeighborsClassifier(n_neighbors=50)),
])

In [23]:
%%time
best_nb_text_clf.fit(X_train, y_train)

CPU times: user 1min 2s, sys: 1.37 s, total: 1min 3s
Wall time: 1min 16s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.6, max_features=None, min_df=0.003,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...wski',
           metric_params=None, n_jobs=1, n_neighbors=50, p=2,
           weights='uniform'))])

In [24]:
try:
    y_pred = best_nb_text_clf.predict(X_test)
except:
    print 'Whooops'

Whooops


In [None]:
try:
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

    print "accuracy:", accuracy
    print "MAE:", mae
    print "MSE:", mse
    print confusion_matrix(y_pred=y_pred, y_true=y_test)
except:
    print 'Whoops'

# Naive Bayes

In [None]:
# %%time
# try:
#     nb_text_clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', MultinomialNB()),
#     ])
    
#     parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
#                'vect__min_df' : min_df_range,
#                'vect__max_df' : max_df_range,
#                'vect__stop_words' : stop_words_range,
#                'tfidf__use_idf': (True, False),
#                'clf__alpha' : [0.2, 0.4, 0.6, 0.8, 1.0]
#     }
    
#     gs_clf = GridSearchCV(nb_text_clf, parameters)
#     gs_clf = gs_clf.fit(X_train, y_train)
#     y_pred = gs_clf.predict(X_test)
# except:
#     'Whoooops :('


In [None]:
# try:
#     best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
#     for param_name in sorted(parameters.keys()):
#         print("%s: %r" % (param_name, best_parameters[param_name]))
# except:
#     print 'Whoops'

In [11]:
 best_bayes_parameters = {'vect__ngram_range': (1, 2),
               'vect__min_df' : 0.001,
               'vect__max_df' : 0.3,
               'vect__stop_words' : None,
               'tfidf__use_idf': True,
               'clf__alpha' : 0.2
 }

In [14]:
best_bayes_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.001, max_df=0.3)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', MultinomialNB(alpha=0.2)),
])

In [15]:
%%time
best_bayes_text_clf.fit(X_train, y_train)

CPU times: user 1min 3s, sys: 2.06 s, total: 1min 5s
Wall time: 1min 8s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.3, max_features=None, min_df=0.001,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))])

In [16]:
y_pred = best_bayes_text_clf.predict(X_test)

In [17]:
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", accuracy
print "MAE:", mae
print "MSE:", mse
print confusion_matrix(y_pred=y_pred, y_true=y_test)


accuracy: 0.5165
MAE: 1.1116
MSE: 4.043
[[812  30  27  29  48  38  19  24  10  34]
 [146 356  19  22  27  29  25  15   7  18]
 [116  12 396  28  45  59  45  27  10   9]
 [ 74  16  21 322  44  78  77  40  11  14]
 [ 66  13  13  36 396 109  82  66  17  27]
 [ 48  11  18  20  42 426 153 146  44  50]
 [ 32   9  19  10  48  86 441 237 104  72]
 [ 31   4  10   7  27  62 125 527 232 210]
 [ 34   7   6  10  24  29  66 215 519 379]
 [ 29   1   6   1  17  15  29 123 265 970]]


In [20]:
print classification_report(y_test, y_pred)

             precision    recall  f1-score   support

        1.0       0.59      0.76      0.66      1071
        2.0       0.78      0.54      0.63       664
        3.0       0.74      0.53      0.62       747
        4.0       0.66      0.46      0.54       697
        5.0       0.55      0.48      0.51       825
        6.0       0.46      0.44      0.45       958
        7.0       0.42      0.42      0.42      1058
        8.0       0.37      0.43      0.40      1235
        9.0       0.43      0.40      0.41      1289
       10.0       0.54      0.67      0.60      1456

avg / total       0.53      0.52      0.52     10000



# Ненормированные данные

In [31]:
X_train = reviews.stemmed[SMALL_TRAIN_SLICE]
y_train = reviews.rating[SMALL_TRAIN_SLICE]

X_test = reviews.stemmed[SMALL_TEST_SLICE]
y_test = reviews.rating[SMALL_TEST_SLICE]

In [32]:
%%time
best_bayes_text_clf.fit(X_train, y_train)

CPU times: user 11.8 s, sys: 2.58 s, total: 14.4 s
Wall time: 19min 21s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.3, max_features=None, min_df=0.001,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))])

In [33]:
y_pred = best_bayes_text_clf.predict(X_test)

In [34]:
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", accuracy
print "MAE:", mae
print "MSE:", mse
print confusion_matrix(y_pred=y_pred, y_true=y_test)

accuracy: 0.794
MAE: 0.495
MSE: 2.025
[[  0   0   0   0   0   0   1   0   9]
 [  0   0   0   0   0   0   0   0   7]
 [  0   0   0   0   0   0   0   0   3]
 [  1   0   0   0   0   0   1   0   3]
 [  0   0   0   0   0   0   1   0  10]
 [  0   0   0   0   0   0   3   0  22]
 [  0   0   0   0   0   0   4   0  46]
 [  0   0   0   0   0   0   7   2  76]
 [  0   0   0   0   0   0  16   0 788]]


In [35]:
print classification_report(y_test, y_pred)

             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00        10
        3.0       0.00      0.00      0.00         7
        4.0       0.00      0.00      0.00         3
        5.0       0.00      0.00      0.00         5
        6.0       0.00      0.00      0.00        11
        7.0       0.00      0.00      0.00        25
        8.0       0.12      0.08      0.10        50
        9.0       1.00      0.02      0.05        85
       10.0       0.82      0.98      0.89       804

avg / total       0.75      0.79      0.73      1000



  'precision', 'predicted', average, warn_for)


## Кросс-валидация

In [25]:
gkf = KFold(n_splits=4, random_state=123, shuffle=False)

In [33]:
%%time
for train_index, test_index in gkf.split(reviewsNormed):
    X_train, y_train = reviewsNormed.iloc[train_index].stemmed, reviewsNormed.iloc[train_index].rating
    X_test, y_test = reviewsNormed.iloc[test_index].stemmed, reviewsNormed.iloc[test_index].rating
    
    best_bayes_text_clf.fit(X_train, y_train)
    y_pred = best_bayes_text_clf.predict(X_test)
    
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

    print "accuracy:", accuracy
    print "MAE:", mae
    print "MSE:", mse
    print confusion_matrix(y_pred=y_pred, y_true=y_test)


accuracy: 0.50436
MAE: 1.19072
MSE: 4.41
[[2066   45   89   58   56   80   72   65   32   89]
 [ 529 1264   69   79  104  116   83   82   38   48]
 [ 425   42 1120   63  125  206  132   95   42   35]
 [ 287   49   83 1099  140  301  240  146   56   66]
 [ 213   26   55   58 1190  271  335  200   85   74]
 [ 140   23   37   37  118 1065  453  368  102  117]
 [ 116   12   22   24   86  166 1010  613  283  248]
 [  83   16   16   12   67   69  298 1049  464  445]
 [  52    7    5   10   26   32  133  474 1034  776]
 [  65    5    6    4   22   27   51  247  430 1712]]
accuracy: 0.50716
MAE: 1.16696
MSE: 4.29216
[[1971   49   79   51   80   71   60   57   46   75]
 [ 452 1179   99   45  102  113  115   50   36   53]
 [ 408   39 1191   76  116  142  139   88   51   40]
 [ 281   49   68  983  129  230  222  139   37   56]
 [ 197   32   62   48 1087  262  330  177   62   67]
 [ 132   26   48   58  112 1075  454  315  136  113]
 [  92   11   26   20   98  213 1113  567  269  235]
 [  79   10  

# Bayes

In [26]:
%%time
try:
    bayes_text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
    ])
    
    parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
               'vect__min_df' : min_df_range,
               'vect__max_df' : max_df_range,
               'vect__stop_words' : stop_words_range,
               'tfidf__use_idf': (True, False),
               'clf__alpha' : [0.1, 0.2, 0.4],
                'clf__fit_prior' : [True, False]
    }
    
    gs_clf = GridSearchCV(bayes_text_clf, parameters)
    gs_clf = gs_clf.fit(X_train, y_train)
    y_pred = gs_clf.predict(X_test)
except:
    'Whoooops :('

CPU times: user 10h 22min 21s, sys: 3min 55s, total: 10h 26min 16s
Wall time: 10h 26min 21s


In [27]:
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", accuracy
print "MAE:", mae
print "MSE:", mse
print confusion_matrix(y_pred=y_pred, y_true=y_test)
print classification_report(y_test, y_pred)

accuracy: 0.365
MAE: 1.407
MSE: 4.601
[[52 10 16  5  8  8  5  2  1  2]
 [15 29  6  8  6  5  2  0  3  3]
 [15  6 33  5  7  6  8  2  0  2]
 [11  6 10 32 13 14 13  2  1  0]
 [ 4  5  5 11 41 16 13  4  0  2]
 [ 3  5  8  5  6 26 29 14  3  2]
 [ 1  2  5  7  8 16 35 26  5  9]
 [ 3  0  1  1  6  4 24 34 12 16]
 [ 1  1  3  3  2  4 14 17 27 26]
 [ 0  0  1  1  2  1 12 18 22 56]]
             precision    recall  f1-score   support

        1.0       0.50      0.48      0.49       109
        2.0       0.45      0.38      0.41        77
        3.0       0.38      0.39      0.38        84
        4.0       0.41      0.31      0.36       102
        5.0       0.41      0.41      0.41       101
        6.0       0.26      0.26      0.26       101
        7.0       0.23      0.31      0.26       114
        8.0       0.29      0.34      0.31       101
        9.0       0.36      0.28      0.31        98
       10.0       0.47      0.50      0.48       113

avg / total       0.37      0.36      0.37    

In [28]:
try:
    best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
except:
    print 'Whoops'

clf__alpha: 0.2
clf__fit_prior: False
tfidf__use_idf: True
vect__max_df: 0.29999999999999999
vect__min_df: 0.001
vect__ngram_range: (1, 2)
vect__stop_words: None


# RandomForest


In [None]:
# %%time
# try:
#     forest_text_clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', RandomForestClassifier()),
#     ])
    
#     parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
#                'vect__min_df' : min_df_range,
#                'vect__max_df' : max_df_range,
#                'vect__stop_words' : stop_words_range,
#                'tfidf__use_idf': (True, False),
#                'clf__n_estimators' : [10, 20, 30]
#     }
    
#     gs_clf = GridSearchCV(forest_text_clf, parameters)
#     gs_clf = gs_clf.fit(X_train, y_train)
#     y_pred = gs_clf.predict(X_test)
# except:
#     'Whoooops :('

In [None]:
# try:
#     best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
#     for param_name in sorted(parameters.keys()):
#         print("%s: %r" % (param_name, best_parameters[param_name]))
# except:
#     print 'Whoops'

In [None]:
 best_forest_parameters = {'vect__ngram_range': (1, 2),
               'vect__min_df' : 0.003,
               'vect__max_df' : 0.3,
               'vect__stop_words' :'english',
               'tfidf__use_idf': False,
               'clf__n_estimators' : 30
 }

In [25]:
best_forest_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.003, max_df=0.3, stop_words='english')),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', RandomForestClassifier(n_estimators=30)),
])

In [26]:
%%time
try:
    best_forest_text_clf.fit(X_train, y_train)


    y_pred = best_forest_text_clf.predict(X_test)
except:
    print 'Whoooops'

CPU times: user 4min 52s, sys: 1.47 s, total: 4min 53s
Wall time: 5min 7s


In [27]:
try:
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

    print "accuracy:", accuracy
    print "MAE:", mae
    print "MSE:", mse
    print confusion_matrix(y_pred=y_pred, y_true=y_test)
except:
    print 'Whoops'

accuracy: 0.5415
MAE: 1.2248
MSE: 5.0344
[[833  18  21  21  27  23  32  21  18  48]
 [142 392  10  15  19  17  19  16  13  20]
 [120  21 388  27  28  35  29  30  25  42]
 [ 85  18  17 374  32  27  54  34  26  38]
 [ 96  11  16  18 450  48  71  58  51  68]
 [ 65  12  22  27  37 456 111 100  64  90]
 [ 47  15  11  23  26  68 447 161 121 140]
 [ 43   8  10  15  19  55 122 535 157 240]
 [ 34   8   5   8  14  26 106 135 585 354]
 [ 39   3  10  11   6  17  63 120 192 955]]


# SVM

In [None]:
svm_text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier()),
])

In [None]:
 parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'vect__min_df' : min_df_range,
               'vect__max_df' : max_df_range,
               'vect__stop_words' : stop_words_range,
               'tfidf__use_idf': (True, False),
               'clf__loss' : ['hinge', 'log'], 
               'clf__penalty' : ['l2', 'l1'],
               'clf__random_state' : [123]
 }

In [None]:
# gs_clf = GridSearchCV(svm_text_clf, parameters)

In [None]:
# %%time

# gs_clf = gs_clf.fit(X_train, y_train)

# y_pred = gs_clf.predict(X_test)

# best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
# for param_name in sorted(parameters.keys()):
#     print("%s: %r" % (param_name, best_parameters[param_name]))

# best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
# for param_name in sorted(parameters.keys()):
#     print("%s: %r" % (param_name, best_parameters[param_name]))


In [28]:
 best_svm_parameters = {'vect__ngram_range': (1, 2),
               'vect__min_df' : 0.01,
               'vect__max_df' : 0.3,
               'vect__stop_words' : None,
               'tfidf__use_idf': True,
               'clf__loss' : 'log', 
               'clf__penalty' : 'l2',
               'clf__random_state' : [123]
}

In [29]:
  
best_svm_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.01, max_df=0.3)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SGDClassifier(loss='log', penalty='l2', random_state=123)),
])



In [30]:
%%time
try:
    best_svm_text_clf.fit(X_train, y_train)

    y_pred = best_svm_text_clf.predict(X_test)
except:
    print 'Whoops'

CPU times: user 1min 13s, sys: 2.42 s, total: 1min 15s
Wall time: 3min 57s


In [31]:
confusion_matrix(y_pred=y_pred, y_true=y_test)

array([[769,  48,  39,  30,  35,  31,  27,  19,  12,  52],
       [225, 232,  44,  25,  34,  38,  26,  15,   7,  17],
       [206,  54, 223,  49,  59,  42,  35,  30,  19,  28],
       [138,  27,  42, 173,  80,  84,  62,  39,  20,  40],
       [116,  38,  39,  38, 268, 124, 122,  69,  30,  43],
       [ 76,  27,  34,  30,  82, 322, 195, 122,  42,  54],
       [ 57,  11,  10,  19,  45, 127, 366, 203, 124,  97],
       [ 46,   6,   9,  14,  30,  64, 168, 414, 198, 255],
       [ 30,   4,  10,   2,  18,  27,  85, 231, 358, 510],
       [ 27,   4,   8,   3,   7,  13,  28, 109, 225, 992]])

In [32]:
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", accuracy
print "MAE:", mae
print "MSE:", mse
print confusion_matrix(y_pred=y_pred, y_true=y_test)

accuracy: 0.4117
MAE: 1.3635
MSE: 5.0123
[[769  48  39  30  35  31  27  19  12  52]
 [225 232  44  25  34  38  26  15   7  17]
 [206  54 223  49  59  42  35  30  19  28]
 [138  27  42 173  80  84  62  39  20  40]
 [116  38  39  38 268 124 122  69  30  43]
 [ 76  27  34  30  82 322 195 122  42  54]
 [ 57  11  10  19  45 127 366 203 124  97]
 [ 46   6   9  14  30  64 168 414 198 255]
 [ 30   4  10   2  18  27  85 231 358 510]
 [ 27   4   8   3   7  13  28 109 225 992]]
