In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
import numpy as np



In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Plotting setup shared by the rest of the notebook.
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

# Plotting config
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message printed under this cell). Bare names used in
# later cells (e.g. `sum`) may therefore resolve to the numpy versions --
# confirm nothing depends on that before removing this line.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
from sklearn.pipeline import Pipeline

# Данные

In [6]:
# Load the review dataset; later cells read its `rating` and `review_text`
# columns. (Presumably the texts are already stemmed, judging by the file
# name -- TODO confirm.)
reviews = pd.read_csv('stemmed.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Draw a class-balanced subsample of the reviews: every row is weighted by the
# inverse frequency of its rating, so each rating value carries the same total
# sampling probability.
rating_counts = reviews.rating.value_counts().sort_index()
ratingCounts = rating_counts.values

# Inverse class frequencies ordered by rating value (printed for inspection).
proba = (1.0 / rating_counts).tolist()
print(proba)

# Look the weight up by rating *label* rather than by `int(rating) - 1`, so the
# code does not assume that ratings are exactly 1..K without gaps, and use
# numpy arithmetic explicitly instead of relying on `%pylab` shadowing `sum`.
row_proba = reviews.rating.map(1.0 / rating_counts).values.astype(float)
row_proba /= row_proba.sum()

# Sample 100000 distinct row labels according to the balanced weights.
idx = np.random.choice(reviews.index, size=100000, replace=False, p=row_proba)
reviewsNormed = reviews.loc[idx, :]

[3.0507337014552e-05, 7.265329845975008e-05, 6.415191172696946e-05, 6.89464975179261e-05, 5.0461724781753044e-05, 4.39734400422145e-05, 3.294458720432233e-05, 2.1918769041930604e-05, 1.607665348381081e-05, 6.89731280693042e-06]


In [8]:
# Sanity check: the balanced subsample should contain the 100000 sampled rows.
reviewsNormed.shape

(100000, 15)

## Учим на нормированной выборке, тестируем на обычной

In [7]:
from sklearn.model_selection import KFold

In [8]:
def norm_idx(y_train, size=100000, random_state=None):
    """Sample row labels of ``y_train`` so that every class is equally likely.

    Each row is weighted by the inverse frequency of its class, which gives
    all classes the same total sampling probability; rows are then drawn
    without replacement.

    Weights are looked up by class label, so the labels do not have to be the
    contiguous integers 1..K, and all arithmetic is done with numpy so the
    function does not depend on ``%pylab`` replacing the builtin ``sum``.

    Parameters
    ----------
    y_train : pandas.Series
        Class label of every candidate row; its index supplies the labels
        that are returned.
    size : int, optional
        Number of rows to draw. Defaults to 100000, the value that used to be
        hard-coded.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) draws from numpy's global random state, as before.
        An int seeds a fresh ``RandomState``; an existing ``RandomState`` is
        used as is.

    Returns
    -------
    numpy.ndarray
        ``size`` distinct labels taken from ``y_train.index``.

    Raises
    ------
    ValueError
        Raised by ``choice`` when ``size`` exceeds the number of rows, or when
        the weights are invalid (for example because of missing labels).
    """
    class_counts = y_train.value_counts()

    # Per-row weight = 1 / (number of rows sharing that row's class).
    row_weights = 1.0 / y_train.map(class_counts).values.astype(float)
    row_weights /= row_weights.sum()

    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    return rng.choice(y_train.index.values, size=size, replace=False, p=row_weights)

### LinReg

In [33]:
%%time

# 4-fold cross-validation of the linear-regression pipeline: train on a
# class-balanced subsample of each training fold, evaluate on the untouched
# test fold, and collect accuracy / MAE / MSE per fold.
linreg_accuracy_list = []
linreg_mae_list = []
linreg_mse_list = []

# Seed both the fold shuffling and numpy's global RNG (used by the
# subsampling) so the cell is reproducible and comparable with the other models.
cv_seed = 0
np.random.seed(cv_seed)
kf = KFold(n_splits=4, shuffle=True, random_state=cv_seed)
for train, test in kf.split(reviews):
    # KFold yields positional indices, so select the fold rows with .iloc.
    X_train = reviews.review_text.iloc[train]
    y_train = reviews.rating.iloc[train]

    X_test = reviews.review_text.iloc[test]
    y_test = reviews.rating.iloc[test]

    # norm_idx returns index labels of the balanced subsample, hence .loc.
    idx = norm_idx(y_train)
    X_train = X_train.loc[idx]
    y_train = y_train.loc[idx]

    # Uni/bigram counts -> normalised term frequencies (no IDF) -> regression.
    linreg_text_clf_best_mae = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.4, min_df=0.003)),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LinearRegression()),
                    ])
    linreg_text_clf_best_mae.fit(X_train, y_train)
    print('fit done')

    # The regressor outputs real numbers: round to the nearest rating and
    # clamp into the valid 1..10 range so they can be scored as classes.
    y_pred = np.clip(linreg_text_clf_best_mae.predict(X_test).round(), 1, 10)
    print('predict done')

    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

    linreg_accuracy_list.append(accuracy)
    linreg_mae_list.append(mae)
    linreg_mse_list.append(mse)

    print('accuracy: %s' % accuracy)
    print('MAE: %s' % mae)
    print('MSE: %s' % mse)
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_test, y_pred))

    print('='*20)
       

fit done
predict done
accuracy: 0.276203163368
MAE: 1.35378619929
MSE: 3.39691218722
[[ 1752  2079  1828  1203   699   380   152    43     9     6]
 [  362   821  1090   602   314   166    64    19     4     0]
 [  153   415  1179  1131   547   289   107    29     6     2]
 [  130   173   563  1279   827   384   163    65    13     7]
 [   60   160   360  1160  1650   952   466   139    40    11]
 [   30    92   204   538  1346  1734   984   476   135    65]
 [    2    26    83   337   995  1721  2160  1432   589   250]
 [    2    11    55   176   629  1504  2902  3055  1957  1090]
 [    6     6    26   134   472  1233  2709  4346  3868  2890]
 [    6    12    50   229   788  2273  5100  8183  9340 10285]]
             precision    recall  f1-score   support

        1.0       0.70      0.21      0.33      8151
        2.0       0.22      0.24      0.23      3442
        3.0       0.22      0.31      0.25      3858
        4.0       0.19      0.35      0.25      3604
        5.0       

In [34]:
# Average each metric over the cross-validation folds
# (linear regression, class-balanced training sample).
print('mean accuracy %s' % np.mean(linreg_accuracy_list))
print('mean MAE %s' % np.mean(linreg_mae_list))
print('mean MSE %s' % np.mean(linreg_mse_list))

mean accuracy 0.280552545507
mean MAE 1.3439019177
mean MSE 3.36363568581


### Bayes

In [35]:
%%time

# 4-fold cross-validation of the multinomial naive Bayes pipeline: train on a
# class-balanced subsample of each training fold, evaluate on the untouched
# test fold, and collect accuracy / MAE / MSE per fold.
bayes_accuracy_list = []
bayes_mae_list = []
bayes_mse_list = []

# Seed both the fold shuffling and numpy's global RNG (used by the
# subsampling) so the cell is reproducible and comparable with the other models.
cv_seed = 0
np.random.seed(cv_seed)
kf = KFold(n_splits=4, shuffle=True, random_state=cv_seed)
for train, test in kf.split(reviews):
    # KFold yields positional indices, so select the fold rows with .iloc.
    X_train = reviews.review_text.iloc[train]
    y_train = reviews.rating.iloc[train]

    X_test = reviews.review_text.iloc[test]
    y_test = reviews.rating.iloc[test]

    # norm_idx returns index labels of the balanced subsample, hence .loc.
    idx = norm_idx(y_train)
    X_train = X_train.loc[idx]
    y_train = y_train.loc[idx]

    # Uni/bigram counts -> TF-IDF -> multinomial naive Bayes.
    bayes_text_clf_best_mae = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.3, min_df=0.001)),
                          ('tfidf', TfidfTransformer(use_idf=True)),
                          ('clf', MultinomialNB(alpha=0.2))
                    ])
    bayes_text_clf_best_mae.fit(X_train, y_train)
    print('fit done')

    y_pred = bayes_text_clf_best_mae.predict(X_test)
    print('predict done')
    print(y_pred)

    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

    bayes_accuracy_list.append(accuracy)
    bayes_mae_list.append(mae)
    bayes_mse_list.append(mse)

    print('accuracy: %s' % accuracy)
    print('MAE: %s' % mae)
    print('MSE: %s' % mse)
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_test, y_pred))

    print('='*20)
       

fit done
predict done
[ 10.  10.  10. ...,   8.   1.   1.]
accuracy: 0.555468291762
MAE: 0.982393701101
MSE: 3.73434470966
[[ 6542   166   229   169   231   211   206   187    99   229]
 [  774  1766   122    96   133   171   142   101    44    73]
 [  698    59  1924   135   203   299   227   185    53    75]
 [  462    52    84  1705   234   392   347   255    76    84]
 [  454    68   101   102  2376   536   660   476   124   172]
 [  301    42    62    95   277  2491  1066   809   217   249]
 [  291    32    65    72   279   527  3147  1914   725   578]
 [  358    55    96    77   292   366  1174  5033  2004  1946]
 [  339    44    56    49   204   260   687  2744  6106  4864]
 [  865    59    79    95   297   258   616  3179  6055 24784]]
             precision    recall  f1-score   support

        1.0       0.59      0.79      0.68      8269
        2.0       0.75      0.52      0.61      3422
        3.0       0.68      0.50      0.58      3858
        4.0       0.66      0.46 

In [36]:
# Average each metric over the cross-validation folds
# (multinomial naive Bayes, class-balanced training sample).
print('mean accuracy %s' % np.mean(bayes_accuracy_list))
print('mean MAE %s' % np.mean(bayes_mae_list))
print('mean MSE %s' % np.mean(bayes_mse_list))

mean accuracy 0.55507809005
mean MAE 0.976856316297
mean MSE 3.68161528597


### LogReg

In [37]:
%%time

# 4-fold cross-validation of the logistic-regression pipeline: train on a
# class-balanced subsample of each training fold, evaluate on the untouched
# test fold, and collect accuracy / MAE / MSE per fold.
logreg_accuracy_list = []
logreg_mae_list = []
logreg_mse_list = []

# Seed both the fold shuffling and numpy's global RNG (used by the
# subsampling) so the cell is reproducible and comparable with the other models.
cv_seed = 0
np.random.seed(cv_seed)
kf = KFold(n_splits=4, shuffle=True, random_state=cv_seed)
for train, test in kf.split(reviews):
    # KFold yields positional indices, so select the fold rows with .iloc.
    X_train = reviews.review_text.iloc[train]
    y_train = reviews.rating.iloc[train]

    X_test = reviews.review_text.iloc[test]
    y_test = reviews.rating.iloc[test]

    # norm_idx returns index labels of the balanced subsample, hence .loc.
    idx = norm_idx(y_train)
    X_train = X_train.loc[idx]
    y_train = y_train.loc[idx]

    # Uni/bigram counts -> normalised term frequencies (no IDF) ->
    # class-weighted, L2-penalised logistic regression.
    logreg_text_clf = Pipeline([('vect', CountVectorizer(max_df=0.3, min_df=0.001, ngram_range=(1, 2))),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LogisticRegression(class_weight='balanced', penalty='l2')),
                    ])
    logreg_text_clf.fit(X_train, y_train)
    print('fit done')

    y_pred = logreg_text_clf.predict(X_test)
    print('predict done')

    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

    logreg_accuracy_list.append(accuracy)
    logreg_mae_list.append(mae)
    logreg_mse_list.append(mse)

    print('accuracy: %s' % accuracy)
    print('MAE: %s' % mae)
    print('MSE: %s' % mse)
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_test, y_pred))

    print('='*20)
       

fit done
predict done
accuracy: 0.555667120659
MAE: 0.944646034855
MSE: 3.37477258945
[[ 5986   549   519   356   265   185   102    61    60   162]
 [  551  1937   265   222   154   107    56    34    18    38]
 [  482   260  2200   314   259   200    95    48    32    48]
 [  239   193   258  1886   279   313   195    86    26    59]
 [  293   207   280   376  2454   580   398   158    88    94]
 [  177   141   221   341   400  2762   846   441   183   163]
 [  166   138   160   264   392   907  3391  1298   591   423]
 [  177   134   135   153   332   639  1771  4686  1825  1539]
 [  221   133   126   145   201   428  1199  2579  6102  4431]
 [  653   236   230   203   292   429  1177  3030  5461 24490]]
             precision    recall  f1-score   support

        1.0       0.67      0.73      0.70      8245
        2.0       0.49      0.57      0.53      3382
        3.0       0.50      0.56      0.53      3938
        4.0       0.44      0.53      0.48      3534
        5.0      

In [38]:
# Average each metric over the cross-validation folds
# (logistic regression, class-balanced training sample).
print('mean accuracy %s' % np.mean(logreg_accuracy_list))
print('mean MAE %s' % np.mean(logreg_mae_list))
print('mean MSE %s' % np.mean(logreg_mse_list))

mean accuracy 0.558309059639
mean MAE 0.93764228693
mean MSE 3.3368907137


## Учим на обычной выборке, тестируем на обычной

### LinReg

In [39]:
%%time

# 4-fold cross-validation of the linear-regression pipeline: train on a
# uniformly random (not class-balanced) subsample of each training fold,
# evaluate on the untouched test fold, and collect accuracy / MAE / MSE.
linreg2_accuracy_list = []
linreg2_mae_list = []
linreg2_mse_list = []

# Seed both the fold shuffling and numpy's global RNG (used by the
# subsampling) so the cell is reproducible and comparable with the other models.
cv_seed = 0
np.random.seed(cv_seed)
kf = KFold(n_splits=4, shuffle=True, random_state=cv_seed)
for train, test in kf.split(reviews):
    # KFold yields positional indices, so select the fold rows with .iloc.
    X_train = reviews.review_text.iloc[train]
    y_train = reviews.rating.iloc[train]

    X_test = reviews.review_text.iloc[test]
    y_test = reviews.rating.iloc[test]

    # Uniform subsample of 100000 training rows; `idx` holds index labels,
    # hence .loc below.
    idx = np.random.choice(y_train.index, size=100000, replace=False)
    X_train = X_train.loc[idx]
    y_train = y_train.loc[idx]

    # Uni/bigram counts -> normalised term frequencies (no IDF) -> regression.
    linreg2_text_clf_best_mae = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.4, min_df=0.003)),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LinearRegression()),
                    ])
    linreg2_text_clf_best_mae.fit(X_train, y_train)
    print('fit done')

    # The regressor outputs real numbers: round to the nearest rating and
    # clamp into the valid 1..10 range so they can be scored as classes.
    y_pred = np.clip(linreg2_text_clf_best_mae.predict(X_test).round(), 1, 10)
    print('predict done')

    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

    linreg2_accuracy_list.append(accuracy)
    linreg2_mae_list.append(mae)
    linreg2_mse_list.append(mse)

    print('accuracy: %s' % accuracy)
    print('MAE: %s' % mae)
    print('MSE: %s' % mse)
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_test, y_pred))

    print('='*20)
       

fit done
predict done
accuracy: 0.321864219746
MAE: 1.19596576166
MSE: 2.87674596626
[[ 1531  1490  1703  1385   944   591   298   117    32    16]
 [  374   531   758   815   498   286   131    58    13     6]
 [  229   362   777  1018   730   412   247    67    26     5]
 [  114   215   560   801   840   563   341   131    35     7]
 [   67   113   371   868  1222  1155   647   283    85    27]
 [   24    66   167   488  1194  1688  1139   694   248    98]
 [    6    16    81   255   622  1488  2111  1636   979   405]
 [    3     4    33   101   348  1006  2289  3308  2617  1723]
 [    0     3     9    47   189   615  1888  3882  4883  3959]
 [    5     6    11    44   220   765  2624  6207 10976 15524]]
             precision    recall  f1-score   support

        1.0       0.65      0.19      0.29      8107
        2.0       0.19      0.15      0.17      3470
        3.0       0.17      0.20      0.19      3873
        4.0       0.14      0.22      0.17      3607
        5.0       

In [40]:
# Average each metric over the cross-validation folds
# (linear regression, uniformly sampled training data).
print('mean accuracy %s' % np.mean(linreg2_accuracy_list))
print('mean MAE %s' % np.mean(linreg2_mae_list))
print('mean MSE %s' % np.mean(linreg2_mse_list))

mean accuracy 0.321752378491
mean MAE 1.1931498474
mean MSE 2.85838163219


### Bayes

In [41]:
%%time

# 4-fold cross-validation of the multinomial naive Bayes pipeline: train on a
# uniformly random (not class-balanced) subsample of each training fold,
# evaluate on the untouched test fold, and collect accuracy / MAE / MSE.
bayes2_accuracy_list = []
bayes2_mae_list = []
bayes2_mse_list = []

# Seed both the fold shuffling and numpy's global RNG (used by the
# subsampling) so the cell is reproducible and comparable with the other models.
cv_seed = 0
np.random.seed(cv_seed)
kf = KFold(n_splits=4, shuffle=True, random_state=cv_seed)
for train, test in kf.split(reviews):
    # KFold yields positional indices, so select the fold rows with .iloc.
    X_train = reviews.review_text.iloc[train]
    y_train = reviews.rating.iloc[train]

    X_test = reviews.review_text.iloc[test]
    y_test = reviews.rating.iloc[test]

    # Uniform subsample of 100000 training rows; `idx` holds index labels,
    # hence .loc below.
    idx = np.random.choice(y_train.index, size=100000, replace=False)
    X_train = X_train.loc[idx]
    y_train = y_train.loc[idx]

    # Uni/bigram counts -> TF-IDF -> multinomial naive Bayes.
    bayes2_text_clf_best_mae = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.3, min_df=0.001)),
                          ('tfidf', TfidfTransformer(use_idf=True)),
                          ('clf', MultinomialNB(alpha=0.2))
                    ])
    bayes2_text_clf_best_mae.fit(X_train, y_train)
    print('fit done')

    y_pred = bayes2_text_clf_best_mae.predict(X_test)
    print('predict done')
    print(y_pred)

    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

    bayes2_accuracy_list.append(accuracy)
    bayes2_mae_list.append(mae)
    bayes2_mse_list.append(mse)

    print('accuracy: %s' % accuracy)
    print('MAE: %s' % mae)
    print('MSE: %s' % mse)
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_test, y_pred))

    print('='*20)
       

fit done
predict done
[ 10.  10.  10. ...,   9.  10.  10.]
accuracy: 0.554513913052
MAE: 1.1301335136
MSE: 4.77293739872
[[ 6393    16    41    39    73    83   107   191    71  1219]
 [ 1116  1242    39    22    69    73    94   103    43   521]
 [ 1068    14  1421    31   113   140   184   189    81   750]
 [  739    25    31  1141   133   235   283   265    92   689]
 [  644     7    24    34  1771   323   499   491   160  1050]
 [  455     3    22    18   135  1790   695   838   324  1470]
 [  343     1    20     8   106   198  1957  1403   676  2825]
 [  241     3     7     6    80    83   399  3237  1412  5979]
 [  123     2     1     2    28    55   146  1171  3632 10374]
 [  267     0     0     0    30    21   106   876  1645 33194]]
             precision    recall  f1-score   support

        1.0       0.56      0.78      0.65      8233
        2.0       0.95      0.37      0.54      3322
        3.0       0.88      0.36      0.51      3991
        4.0       0.88      0.31   

In [42]:
# Average each metric over the cross-validation folds
# (multinomial naive Bayes, uniformly sampled training data).
print('mean accuracy %s' % np.mean(bayes2_accuracy_list))
print('mean MAE %s' % np.mean(bayes2_mae_list))
print('mean MSE %s' % np.mean(bayes2_mse_list))

mean accuracy 0.555090516856
mean MAE 1.13093131456
mean MSE 4.78166350197


### LogReg

In [43]:
%%time

# 4-fold cross-validation of the logistic-regression pipeline: train on a
# uniformly random (not class-balanced) subsample of each training fold,
# evaluate on the untouched test fold, and collect accuracy / MAE / MSE.
logreg2_accuracy_list = []
logreg2_mae_list = []
logreg2_mse_list = []

# Seed both the fold shuffling and numpy's global RNG (used by the
# subsampling) so the cell is reproducible and comparable with the other models.
cv_seed = 0
np.random.seed(cv_seed)
kf = KFold(n_splits=4, shuffle=True, random_state=cv_seed)
for train, test in kf.split(reviews):
    # KFold yields positional indices, so select the fold rows with .iloc.
    X_train = reviews.review_text.iloc[train]
    y_train = reviews.rating.iloc[train]

    X_test = reviews.review_text.iloc[test]
    y_test = reviews.rating.iloc[test]

    # Uniform subsample of 100000 training rows; `idx` holds index labels,
    # hence .loc below.
    idx = np.random.choice(y_train.index, size=100000, replace=False)
    X_train = X_train.loc[idx]
    y_train = y_train.loc[idx]

    # Uni/bigram counts -> normalised term frequencies (no IDF) ->
    # class-weighted, L2-penalised logistic regression.
    logreg2_text_clf = Pipeline([('vect', CountVectorizer(max_df=0.3, min_df=0.001, ngram_range=(1, 2))),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LogisticRegression(class_weight='balanced', penalty='l2')),
                    ])
    logreg2_text_clf.fit(X_train, y_train)
    print('fit done')

    y_pred = logreg2_text_clf.predict(X_test)
    print('predict done')

    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

    logreg2_accuracy_list.append(accuracy)
    logreg2_mae_list.append(mae)
    logreg2_mse_list.append(mse)

    print('accuracy: %s' % accuracy)
    print('MAE: %s' % mae)
    print('MSE: %s' % mse)
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_test, y_pred))

    print('='*20)
       

fit done
predict done
accuracy: 0.574993289525
MAE: 0.915070236308
MSE: 3.24486772908
[[ 5986   459   412   369   275   198   112    47    40   249]
 [  714  1807   236   223   177   101    58    33    17    73]
 [  633   211  1935   308   277   212   109    60    25    60]
 [  348   189   275  1620   347   335   201    78    22   103]
 [  356   179   312   383  2356   563   415   201    80   166]
 [  201   132   256   325   487  2606   844   458   144   245]
 [  205    96   141   204   429   846  3305  1145   549   673]
 [  164   110   131   183   279   600  1606  4310  1534  2355]
 [  204    98   153   158   226   412   994  2022  5197  6178]
 [  434   208   190   210   278   362   855  2083  3103 28716]]
             precision    recall  f1-score   support

        1.0       0.65      0.73      0.69      8147
        2.0       0.52      0.53      0.52      3439
        3.0       0.48      0.51      0.49      3830
        4.0       0.41      0.46      0.43      3518
        5.0      

In [44]:
# Average each metric over the cross-validation folds
# (logistic regression, uniformly sampled training data).
print('mean accuracy %s' % np.mean(logreg2_accuracy_list))
print('mean MAE %s' % np.mean(logreg2_mae_list))
print('mean MSE %s' % np.mean(logreg2_mse_list))

mean accuracy 0.574655280398
mean MAE 0.922247959518
mean MSE 3.29184105618


## Сравним

In [46]:
# Side-by-side summary of the cross-validated metrics for every model, once
# trained on the class-balanced ("normed") sample and once on the uniformly
# sampled ("not normed") one.
# Each entry: (model name, normed-train score lists, not-normed-train score
# lists); the score lists are ordered as accuracy, MAE, MSE.
comparison = [
    ('LinearRegression',
     (linreg_accuracy_list, linreg_mae_list, linreg_mse_list),
     (linreg2_accuracy_list, linreg2_mae_list, linreg2_mse_list)),
    ('MultinomialNB',
     (bayes_accuracy_list, bayes_mae_list, bayes_mse_list),
     (bayes2_accuracy_list, bayes2_mae_list, bayes2_mse_list)),
    ('LogisticRegression',
     (logreg_accuracy_list, logreg_mae_list, logreg_mse_list),
     (logreg2_accuracy_list, logreg2_mae_list, logreg2_mse_list)),
]

for model_name, normed_scores, plain_scores in comparison:
    print(model_name)
    for regime, fold_scores in (('  normed train', normed_scores),
                                ('  not normed train', plain_scores)):
        print(regime)
        for metric, values in zip(('accuracy', 'MAE', 'MSE'), fold_scores):
            # Mean over the folds, printed in the same layout as before.
            print('    mean %s %s' % (metric, np.mean(values)))

 LinearRegression
  normed train
    mean accuracy 0.280552545507
    mean MAE 1.3439019177
    mean MSE 3.36363568581
  not normed train
    mean accuracy 0.321752378491
    mean MAE 1.1931498474
    mean MSE 2.85838163219
MultinomialNB
  normed train
    mean accuracy 0.55507809005
    mean MAE 0.976856316297
    mean MSE 3.68161528597
  not normed train
    mean accuracy 0.555090516856
    mean MAE 1.13093131456
    mean MSE 4.78166350197
LogisticRegression
  normed train
    mean accuracy 0.558309059639
    mean MAE 0.93764228693
    mean MSE 3.3368907137
  not normed train
    mean accuracy 0.574655280398
    mean MAE 0.922247959518
    mean MSE 3.29184105618


# LinReg

In [None]:
# Linear-regression text pipeline with the settings used in the
# cross-validation cells: unigram + bigram counts (terms kept only if they
# occur in at least 0.3% and at most 40% of documents), normalised term
# frequencies without IDF weighting, then ordinary least squares.
linreg_best_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.4, min_df=0.003)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearRegression()),
]
linreg_best_text_clf = Pipeline(linreg_best_steps)

# Bayes

In [None]:
# Naive Bayes text pipeline: unigram + bigram counts (terms kept only if they
# occur in at least 0.1% and at most 30% of documents), TF-IDF weighting, then
# multinomial naive Bayes.
# NOTE(review): the cell was cut off after the 'tfidf' step; the classifier
# step is reconstructed from the Bayes pipelines in the cross-validation
# cells (MultinomialNB(alpha=0.2)) -- confirm this matches the intended model.
bayes_best_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.001, max_df=0.3)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', MultinomialNB(alpha=0.2)),
                    ])

# LogReg

In [146]:
# Logistic-regression text pipeline with the settings used in the
# cross-validation cells: unigram + bigram counts (terms kept only if they
# occur in at least 0.1% and at most 30% of documents), normalised term
# frequencies without IDF weighting, then a class-weighted, L2-penalised
# logistic regression.
logreg_best_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.001, max_df=0.3)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LogisticRegression(penalty='l2', class_weight='balanced')),
]
logreg_best_text_clf = Pipeline(logreg_best_steps)