# Sentiment Analysis

Download the Foursquare annotated comments in Brazilian Portuguese: https://www.kaggle.com/thaisalmeida/tips-foursquare/version/1

Place the files in subfolder 'docs/'

In [1]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
pd.set_option('max_colwidth',150)

In [2]:
df = pd.read_csv('docs/tips_scenario1_train.csv')
df.head(16)

Unnamed: 0,texto,rotulo
0,"A comida é deliciosa, mas pedi limonada suiça e me disseram que hoje estavam todos muito ocupados e que ninguém conseguiria me atender....melhor i...",-1.0
1,"A partir desta sexta feira dia 11 começam a abrir para jantar mas corre pois é só até as 22 hrs e no domingo dia das mães, estarão aberto durante ...",0.0
2,Joint burguer e brewdog,0.0
3,Agora de segunda a sexta o Habanero vai abrir no almoço com pratos mexicanos e tradicionais!,0.0
4,"Experimente o drink ""Dona Diabla"". Muito bom!",1.0
5,Nova senha do Wifi: 1129508219,0.0
6,Wi-fi 1129508219,0.0
7,"Adoramos a pizza carbonara e a paulistana. Não surpreendeu tanto, mas vale a pena por resgatar o tradicionalismo. Dica @Gourmet_For",1.0
8,"O diferencial desse Burger King é que você mesmo serve o refrigerante, e a vontade!",1.0
9,Unico defeito estacionamento pago!,-1.0


In [3]:
import re
def splitWithPunctuation(text):
    return re.findall(r"[\w']+|[.,!?;:\"]", text)    

In [4]:
splitWithPunctuation('mas que: "legal"')

['mas', 'que', ':', '"', 'legal', '"']

In [5]:
df.shape

(1714, 2)

# Baseline: Bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
df = df.fillna(0)

In [7]:
texts = df['texto'].astype(str).tolist()
categs = df['rotulo'].tolist()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(texts, categs, test_size=0.1, random_state=42)

In [9]:
countVec = CountVectorizer()
vectTexts_train = countVec.fit_transform(X_train)
vectTexts_test = countVec.transform(X_test)

In [10]:
vectTexts_train

<1542x4781 sparse matrix of type '<class 'numpy.int64'>'
	with 25423 stored elements in Compressed Sparse Row format>

In [11]:
mnb = MultinomialNB()
mnb.fit(vectTexts_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
mnb.score(vectTexts_train, y_train)

0.9137483787289234

In [13]:
mnb.score(vectTexts_test, y_test)

0.7790697674418605

In [14]:
mnbParams = { #'verbose' : [1],
             'alpha':[0.001, 0.1,1,10, 100],  
             'fit_prior' :[True, False]}
mnbRSCV = RandomizedSearchCV(mnb, mnbParams, verbose=1, return_train_score=True) #, n_jobs=-1)
mnbRSCV.fit(vectTexts_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'alpha': [0.001, 0.1, 1, 10, 100], 'fit_prior': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [15]:
pd.DataFrame(mnbRSCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_fit_prior,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.005208,0.007365134,0.005208,0.007365471,True,0.001,"{'fit_prior': True, 'alpha': 0.001}",0.753398,0.782524,0.748047,0.761349,0.015153,3,0.988315,0.986368,0.98932,0.988001,0.001226
1,0.0,0.0,0.0,0.0,False,0.001,"{'fit_prior': False, 'alpha': 0.001}",0.741748,0.768932,0.753906,0.754864,0.011129,4,0.991237,0.986368,0.98835,0.988651,0.001999
2,0.006229,0.007440478,0.000333,0.0004713704,True,0.1,"{'fit_prior': True, 'alpha': 0.1}",0.737864,0.765049,0.697266,0.733463,0.027834,5,0.982473,0.979552,0.986408,0.982811,0.002809
3,0.001999,1.123916e-07,0.001,2.247832e-07,False,0.1,"{'fit_prior': False, 'alpha': 0.1}",0.704854,0.728155,0.667969,0.700389,0.024762,7,0.978578,0.977605,0.983495,0.979893,0.002578
4,0.002045,6.479383e-05,0.000333,0.0004713704,True,1.0,"{'fit_prior': True, 'alpha': 1}",0.790291,0.8,0.791016,0.793774,0.004419,1,0.91334,0.910419,0.913592,0.91245,0.00144
5,0.0,0.0,0.0,0.0,False,1.0,"{'fit_prior': False, 'alpha': 1}",0.786408,0.803883,0.775391,0.788586,0.011728,2,0.926972,0.927945,0.93301,0.929309,0.002647
6,0.0,0.0,0.0,0.0,True,10.0,"{'fit_prior': True, 'alpha': 10}",0.68932,0.685437,0.693359,0.689364,0.003233,8,0.702045,0.696203,0.693204,0.69715,0.003671
7,0.0,0.0,0.0,0.0,False,10.0,"{'fit_prior': False, 'alpha': 10}",0.704854,0.700971,0.708984,0.704929,0.00327,6,0.741967,0.720545,0.72233,0.728281,0.009705
8,0.005208,0.007365358,0.0,0.0,True,100.0,"{'fit_prior': True, 'alpha': 100}",0.681553,0.681553,0.683594,0.682231,0.000961,10,0.682571,0.682571,0.681553,0.682232,0.00048
9,0.0,0.0,0.0,0.0,False,100.0,"{'fit_prior': False, 'alpha': 100}",0.68932,0.683495,0.689453,0.687419,0.002779,9,0.694255,0.691334,0.691262,0.692284,0.001394


In [16]:
mnbRSCV.best_estimator_.score(vectTexts_test, y_test)

0.7790697674418605

# Word Embedding Class

In [17]:
from Embeddings import WordEmbeddingBR, splitWithPunctuation
import numpy as np

In [18]:
WordEmbeddingBR.downloadNILCEmbeddings()
WordEmbeddingBR.getAvailableEmbeddings()

Downloading NILC word embeddings. More available at http://nilc.icmc.usp.br/embeddings
glove50 exists. Skipping.
cbow50_wang2vec exists. Skipping.
cbow50_fasttext exists. Skipping.
skip50_word2vec exists. Skipping.
Done!


['cbow50_fasttext',
 'cbow50_wang2vec',
 'glove50',
 'skip50_word2vec',
 'skip_s300_word2vec']

In [19]:
wee = WordEmbeddingBR('skip_s300_word2vec')

Reading embedding file: skip_s300_word2vec.zip


934967it [02:38, 5914.47it/s]


In [20]:
maxlen = max([len(x) for x in X_train])
def getSentenceVector(x, emb):
    wordArray = splitWithPunctuation(x)
    ans = np.zeros( (maxlen, emb.embDim) )
    for i,w in enumerate(wordArray):
        if len(w) > 2:
            ans[i] = emb.encodeWord(w.lower())
            ans[i] /= (np.linalg.norm(ans[i])+1e-4)
        
    return np.sum(ans,axis=0)

In [21]:
X_train[2]

'Muito caro pelo tamanho e sabor. Fomos em dois e gastamos R$60,00. Para fast food de tamanho normal, foi muito.'

In [22]:
getSentenceVector(X_train[2], wee).shape

(300,)

In [23]:
X_train[0]

'Achei a comida bem medíocre. Prato com muitas coisas, mas nada com sabor. Não vale o que custa.'

In [24]:
vectTexts_train = [getSentenceVector(x, wee).reshape((-1,)) for x in X_train]
vectTexts_test = [getSentenceVector(x, wee).reshape((-1,)) for x in X_test]
getSentenceVector(X_train[0], wee)

array([-0.01614947, -0.10834916, -0.05412925, -0.68149838, -0.34972678,
        0.07611305,  0.21173427, -0.16032821, -0.09424525, -0.49070412,
       -0.98033918, -0.55583156, -0.36792939, -0.25768241, -0.14422328,
       -0.18532498, -0.37150617,  0.33750424,  0.80553468,  0.57310063,
        1.1218442 , -0.23137208, -0.43360203, -0.37138272,  0.15888166,
        0.00609828,  0.65231598,  0.67753354, -0.08889965,  0.25209472,
       -0.82058552,  0.40230038, -0.24429756,  0.08174712, -0.27499635,
       -0.04251375, -0.10124676, -0.39328442, -0.13845402, -0.00286722,
        0.08634738,  0.31591046,  0.24507671,  0.57419708,  0.1381439 ,
       -0.28199261,  0.69349179,  0.0255523 ,  0.1906665 ,  0.04768168,
       -0.43295604, -0.11906724, -0.18540661,  0.11029822, -0.25131733,
        0.35577184, -0.13817987,  0.19460735,  0.20334827,  0.08174238,
        0.35756133,  1.24393274, -0.03804483,  0.85921946, -0.11411306,
       -0.2901375 ,  0.20000887, -1.02227981,  0.14509503,  0.21

In [25]:
#try some of scikit learn classifiers
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

gbc = GradientBoostingClassifier(verbose=1, learning_rate=0.1, n_estimators=320, max_depth=6)
gbc.fit(vectTexts_train, y_train)

      Iter       Train Loss   Remaining Time 
         1        1292.7515            1.24m
         2        1160.9407            1.30m
         3        1047.6714            1.29m
         4         955.1006            1.29m
         5         866.7839            1.29m
         6         794.9990            1.29m
         7         732.1851            1.30m
         8         675.2564            1.30m
         9         623.0586            1.30m
        10         575.3102            1.29m
        20         294.0879            1.23m
        30         175.8658            1.17m
        40         110.3201            1.13m
        50          71.8176            1.09m
        60          47.4958            1.05m
        70          33.5867            1.00m
        80          23.5397           57.74s
        90          17.4062           55.17s
       100          13.1753           52.60s
       200           4.6168           27.14s
       300           4.3792            3.77s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=320,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False)

In [26]:
print('Train score: {} Test score: {}'.format(gbc.score(vectTexts_train, y_train),gbc.score(vectTexts_test, y_test)))

Train score: 0.9980544747081712 Test score: 0.7093023255813954


In [27]:
gbParams = { #'verbose' : [1],
             'learning_rate':[0.1,0.05,0.15,0.25],  
             'n_estimators' :[50, 100, 200, 400], 
             'max_depth'    :[3,4,5,6,7,8]}
gbRSCV = RandomizedSearchCV(gbc, gbParams, verbose=1, return_train_score=True, n_iter=12) #, n_jobs=-1)
gbRSCV.fit(vectTexts_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
      Iter       Train Loss   Remaining Time 
         1         828.9811            5.66s
         2         730.0465            5.58s
         3         649.2909            5.72s
         4         579.4302            5.54s
         5         512.3739            5.52s
         6         453.0767            5.51s
         7         409.0992            5.41s
         8         369.8419            5.24s
         9         337.4782            5.17s
        10         306.7292            5.01s
        20         139.7330            3.76s
        30          70.8567            2.50s
        40          38.8864            1.24s
        50          22.4526            0.00s
      Iter       Train Loss   Remaining Time 
         1         827.8430            5.86s
         2         721.7885            5.67s
         3         634.7844            5.81s
         4         562.0605            5.62s
         5         505.3403          

        30         169.9454            1.49m
        40          98.4174            1.47m
        50          59.2979            1.43m
        60          36.7638            1.40m
        70          22.9115            1.36m
        80          14.7844            1.32m
        90           9.4897            1.27m
       100           6.0925            1.23m
       200           0.3004           40.46s
       300           0.2917           13.62s
       400           0.2917            0.00s
      Iter       Train Loss   Remaining Time 
         1         891.2424            1.35m
         2         827.4574            1.45m
         3         773.2979            1.47m
         4         720.9781            1.49m
         5         671.4364            1.51m
         6         628.3645            1.51m
         7         588.4769            1.51m
         8         551.9916            1.52m
         9         517.2990            1.54m
        10         486.1422            1.54m
        2

        10         510.8394            2.80s
        20         357.5080            2.04s
        30         271.9804            1.34s
        40         213.0237            0.66s
        50         173.9547            0.00s
      Iter       Train Loss   Remaining Time 
         1         876.7533            3.06s
         2         805.9608            3.37s
         3         747.4050            3.18s
         4         698.7659            3.22s
         5         657.7161            3.00s
         6         620.7328            3.06s
         7         588.8819            2.97s
         8         560.6086            2.88s
         9         532.9693            2.85s
        10         509.4466            2.73s
        20         358.3749            2.05s
        30         277.1252            1.36s
        40         221.6860            0.68s
        50         180.7958            0.00s
      Iter       Train Loss   Remaining Time 
         1         871.0254            3.33s
        

         8         538.3072            3.98s
         9         508.9305            3.88s
        10         483.1873            3.79s
        20         306.7554            2.83s
        30         216.6812            1.87s
        40         161.5602            0.92s
        50         122.1975            0.00s
      Iter       Train Loss   Remaining Time 
         1         791.3794            8.42s
         2         667.0974            8.96s
         3         568.0344            8.60s
         4         487.8983            8.65s
         5         413.9688            8.61s
         6         354.7553            8.46s
         7         307.3574            8.27s
         8         262.2731            8.14s
         9         228.1565            7.99s
        10         198.9460            7.76s
        20          53.2628            5.93s
        30          15.6800            3.98s
        40           5.3888            1.97s
        50           1.9565            0.00s
      Ite

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  9.9min finished


      Iter       Train Loss   Remaining Time 
         1        1258.4575            9.19s
         2        1106.8200            9.00s
         3         984.9409            8.81s
         4         885.6764            8.62s
         5         798.6325            8.44s
         6         724.7979            8.20s
         7         666.5253            8.02s
         8         610.7853            7.90s
         9         562.4058            7.63s
        10         518.3656            7.52s
        20         273.3905            5.59s
        30         166.9851            3.70s
        40         108.6725            1.84s
        50          71.5063            0.00s


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=320,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False),
          fit_params=None, iid=True, n_iter=12, n_jobs=1,
          param_distributions={'learning_rate': [0.1, 0.05, 0.15, 0.25], 'n_estimators': [50, 100, 200, 400], 'max_depth': [3, 4, 5, 6, 7, 8]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [28]:
pd.DataFrame(gbRSCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,6.186692,0.011586,0.013457,0.006113,50,5,0.15,"{'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.15}",0.768932,0.776699,0.769531,0.771725,0.003531,1,1.0,0.998053,0.998058,0.998704,0.000917
1,15.968433,0.064983,0.012676,0.007301,100,6,0.1,"{'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}",0.76699,0.761165,0.742188,0.756809,0.01058,12,1.0,0.998053,0.998058,0.998704,0.000917
2,13.281129,0.095497,0.008713,0.001767,200,3,0.1,"{'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1}",0.770874,0.78835,0.744141,0.767834,0.018167,3,1.0,0.998053,0.998058,0.998704,0.000917
3,52.719727,8.2684,0.023386,0.005521,400,8,0.05,"{'n_estimators': 400, 'max_depth': 8, 'learning_rate': 0.05}",0.745631,0.778641,0.761719,0.761997,0.013491,10,1.0,0.998053,0.998058,0.998704,0.000917
4,25.733291,0.172211,0.014068,0.001365,400,3,0.1,"{'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.1}",0.768932,0.784466,0.742188,0.76524,0.017449,6,1.0,0.998053,0.998058,0.998704,0.000917
5,4.679662,0.032846,0.004349,0.000496,50,4,0.25,"{'n_estimators': 50, 'max_depth': 4, 'learning_rate': 0.25}",0.770874,0.792233,0.75,0.771077,0.017234,2,1.0,0.998053,0.998058,0.998704,0.000917
6,3.349383,0.037126,0.004185,0.001052,50,3,0.15,"{'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.15}",0.765049,0.786408,0.742188,0.764591,0.018047,9,0.982473,0.983447,0.987379,0.984433,0.002121
7,18.063109,0.229219,0.011752,0.003073,200,4,0.15,"{'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.15}",0.76699,0.784466,0.748047,0.766537,0.014864,5,1.0,0.998053,0.998058,0.998704,0.000917
8,11.46565,0.16004,0.010648,0.006001,50,8,0.15,"{'n_estimators': 50, 'max_depth': 8, 'learning_rate': 0.15}",0.753398,0.780583,0.75,0.761349,0.01369,11,1.0,0.998053,0.998058,0.998704,0.000917
9,4.636918,0.022885,0.006923,0.006694,50,4,0.1,"{'n_estimators': 50, 'max_depth': 4, 'learning_rate': 0.1}",0.757282,0.778641,0.759766,0.76524,0.009544,6,1.0,0.994158,0.997087,0.997082,0.002385


In [29]:
gbRSCV.best_estimator_.score(vectTexts_test, y_test)

0.7267441860465116

In [62]:
from sklearn.svm import SVC

sksvc = SVC(verbose=1, gamma=0.1, tol=1e-5, C=2, kernel='rbf')
sksvc.fit(vectTexts_train, y_train)

[LibSVM]

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=1e-05, verbose=1)

In [63]:
print('Train score: {} Test score: {}'.format(sksvc.score(vectTexts_train, y_train),sksvc.score(vectTexts_test, y_test)))

Train score: 0.9870298313878081 Test score: 0.75


In [85]:
import scipy.stats

svmParams = { #'verbose' : [1],
             'gamma': scipy.stats.uniform(0.005,0.15),#[0.1,0.01,0.02,0.04,0.08],  
             'C' : scipy.stats.uniform(0.01,50),#[0.1,10,15, 20,25,40], 
             'shrinking'    :[True, False]}
svmRSCV = RandomizedSearchCV(sksvc, svmParams, verbose=1, return_train_score=True, n_iter=120) #, n_jobs=-1)
svmRSCV.fit(vectTexts_train, y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  4.3min finished


[LibSVM]

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=1e-05, verbose=1),
          fit_params=None, iid=True, n_iter=120, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018922952080>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000189229525C0>, 'shrinking': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [88]:
pd.DataFrame(svmRSCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.490553,0.008267,0.130203,7.365527e-03,19.2625,0.154259,True,"{'C': 19.26248752837037, 'gamma': 0.15425898456052756, 'shrinking': True}",0.739806,0.743689,0.730469,0.738003,0.005544,114,1.000000,0.998053,0.998058,0.998704,0.000917
1,0.357690,0.013125,0.098955,7.365527e-03,47.4678,0.0407651,True,"{'C': 47.467764705122015, 'gamma': 0.04076509881741229, 'shrinking': True}",0.778641,0.803883,0.767578,0.783398,0.015193,22,1.000000,0.998053,0.998058,0.998704,0.000917
2,0.358646,0.004679,0.114579,7.365639e-03,45.1782,0.0746224,False,"{'C': 45.17822268576651, 'gamma': 0.07462240280077816, 'shrinking': False}",0.765049,0.796117,0.765625,0.775616,0.014519,49,1.000000,0.998053,0.998058,0.998704,0.000917
3,0.400697,0.005669,0.114579,7.365358e-03,35.2666,0.101341,False,"{'C': 35.266567421761586, 'gamma': 0.10134095762731304, 'shrinking': False}",0.753398,0.768932,0.738281,0.753567,0.012508,79,1.000000,0.998053,0.998058,0.998704,0.000917
4,0.271918,0.004669,0.097434,5.215195e-03,27.4638,0.026813,False,"{'C': 27.463828618924303, 'gamma': 0.026813031995736276, 'shrinking': False}",0.790291,0.801942,0.763672,0.785344,0.016004,7,0.999026,0.993184,0.997087,0.996433,0.002430
5,0.262663,0.006621,0.088538,7.365583e-03,49.3714,0.0220207,True,"{'C': 49.371396677766676, 'gamma': 0.022020693132256117, 'shrinking': True}",0.766990,0.788350,0.759766,0.771725,0.012136,58,0.999026,0.995131,0.998058,0.997405,0.001656
6,0.447708,0.005450,0.109370,7.018853e-07,29.2905,0.0689445,True,"{'C': 29.290475010392882, 'gamma': 0.06894445193558148, 'shrinking': True}",0.776699,0.800000,0.769531,0.782101,0.013008,33,1.000000,0.998053,0.998058,0.998704,0.000917
7,0.477364,0.007812,0.119788,7.365808e-03,28.9505,0.102491,True,"{'C': 28.950492540808934, 'gamma': 0.1024906153826446, 'shrinking': True}",0.755340,0.765049,0.738281,0.752918,0.011056,81,1.000000,0.998053,0.998058,0.998704,0.000917
8,0.342583,0.005131,0.109371,2.973602e-07,41.9842,0.0639297,False,"{'C': 41.984178947732715, 'gamma': 0.06392974884132539, 'shrinking': False}",0.774757,0.807767,0.767578,0.783398,0.017503,22,1.000000,0.998053,0.998058,0.998704,0.000917
9,0.355870,0.008014,0.093746,3.371748e-07,14.8467,0.0350516,True,"{'C': 14.846698448499513, 'gamma': 0.03505164225045203, 'shrinking': True}",0.786408,0.801942,0.763672,0.784047,0.015705,15,0.997079,0.991237,0.996117,0.994811,0.002558


In [89]:
svmRSCV.best_estimator_.score(vectTexts_test, y_test)

0.8081395348837209