# Sentiment Analysis

Download the Foursquare annotated comments in Brazilian Portuguese: https://www.kaggle.com/thaisalmeida/tips-foursquare/version/1

Place the files in subfolder 'docs/'

In [1]:
import pandas as pd
import preProcessing

from sklearn.model_selection import RandomizedSearchCV
pd.set_option('max_colwidth',150)

In [2]:
df = pd.read_csv('docs/tips_scenario1_train.csv')
df.head(16)

Unnamed: 0,texto,rotulo
0,"A comida é deliciosa, mas pedi limonada suiça e me disseram que hoje estavam todos muito ocupados e que ninguém conseguiria me atender....melhor i...",-1.0
1,"A partir desta sexta feira dia 11 começam a abrir para jantar mas corre pois é só até as 22 hrs e no domingo dia das mães, estarão aberto durante ...",0.0
2,Joint burguer e brewdog,0.0
3,Agora de segunda a sexta o Habanero vai abrir no almoço com pratos mexicanos e tradicionais!,0.0
4,"Experimente o drink ""Dona Diabla"". Muito bom!",1.0
5,Nova senha do Wifi: 1129508219,0.0
6,Wi-fi 1129508219,0.0
7,"Adoramos a pizza carbonara e a paulistana. Não surpreendeu tanto, mas vale a pena por resgatar o tradicionalismo. Dica @Gourmet_For",1.0
8,"O diferencial desse Burger King é que você mesmo serve o refrigerante, e a vontade!",1.0
9,Unico defeito estacionamento pago!,-1.0


In [3]:
preProcessing.clean_text('Este é um teste de 354 números! Mas que: "interessante".')

'este é um teste de 000 números ! mas que : interessante .'

In [4]:
preProcessing.splitWithPunctuation('mas que: "legal"')

['mas', 'que', ':', '"', 'legal', '"']

In [5]:
df.shape

(1714, 2)

# Baseline: Bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
df = df.fillna(0)

In [7]:
texts = df['texto'].astype(str).tolist()
categs = df['rotulo'].tolist()
texts = [preProcessing.clean_text(t) for t in texts]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(texts, categs, test_size=0.1, random_state=42)

In [9]:
countVec = CountVectorizer()
vectTexts_train = countVec.fit_transform(X_train)
vectTexts_test = countVec.transform(X_test)

In [10]:
vectTexts_train

<1542x4708 sparse matrix of type '<class 'numpy.int64'>'
	with 25330 stored elements in Compressed Sparse Row format>

In [11]:
mnb = MultinomialNB()
mnb.fit(vectTexts_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
mnb.score(vectTexts_train, y_train)

0.9105058365758755

In [13]:
mnb.score(vectTexts_test, y_test)

0.7790697674418605

In [14]:
mnbParams = { #'verbose' : [1],
             'alpha':[0.001, 0.1,1,10, 100],  
             'fit_prior' :[True, False]}
mnbRSCV = RandomizedSearchCV(mnb, mnbParams, verbose=1, return_train_score=True) #, n_jobs=-1)
mnbRSCV.fit(vectTexts_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.1s finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'alpha': [0.001, 0.1, 1, 10, 100], 'fit_prior': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [15]:
pd.DataFrame(mnbRSCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_fit_prior,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.004159,0.0002354605,0.000665,0.0002351794,True,0.001,"{'fit_prior': True, 'alpha': 0.001}",0.751456,0.784466,0.753906,0.763294,0.015026,3,0.988315,0.986368,0.98932,0.988001,0.001226
1,0.002994,1.123916e-07,0.000665,0.0002355729,False,0.001,"{'fit_prior': False, 'alpha': 0.001}",0.737864,0.772816,0.757812,0.756161,0.01433,4,0.991237,0.986368,0.98835,0.988651,0.001999
2,0.003161,0.0002352357,0.000665,0.0002350108,True,0.1,"{'fit_prior': True, 'alpha': 0.1}",0.735922,0.76699,0.693359,0.732166,0.030162,5,0.982473,0.979552,0.986408,0.982811,0.002809
3,0.002994,2.973602e-07,0.000665,0.0002353481,False,0.1,"{'fit_prior': False, 'alpha': 0.1}",0.693204,0.72233,0.675781,0.697147,0.019198,7,0.977605,0.976631,0.980583,0.978273,0.001681
4,0.003327,0.0002356855,0.000832,0.0002350109,True,1.0,"{'fit_prior': True, 'alpha': 1}",0.786408,0.794175,0.787109,0.789235,0.00351,1,0.914314,0.906524,0.909709,0.910182,0.003198
5,0.006155,0.0006223965,0.001331,0.0002351799,False,1.0,"{'fit_prior': False, 'alpha': 1}",0.778641,0.801942,0.777344,0.785992,0.011307,2,0.927945,0.925998,0.928155,0.927366,0.000971
6,0.005991,0.001223391,0.001163,0.0004677745,True,10.0,"{'fit_prior': True, 'alpha': 10}",0.68932,0.685437,0.693359,0.689364,0.003233,8,0.702045,0.696203,0.692233,0.696827,0.00403
7,0.003992,0.001077286,0.000499,1.94668e-07,False,10.0,"{'fit_prior': False, 'alpha': 10}",0.699029,0.700971,0.712891,0.70428,0.006122,6,0.740019,0.717624,0.720388,0.726011,0.00997
8,0.003992,0.0008150747,0.000998,5.61958e-07,True,100.0,"{'fit_prior': True, 'alpha': 100}",0.681553,0.681553,0.683594,0.682231,0.000961,10,0.682571,0.682571,0.681553,0.682232,0.00048
9,0.004658,0.001025704,0.000499,2.973602e-07,False,100.0,"{'fit_prior': False, 'alpha': 100}",0.685437,0.683495,0.689453,0.686122,0.002479,9,0.694255,0.689387,0.68932,0.690987,0.002311


In [16]:
mnbRSCV.best_estimator_.score(vectTexts_test, y_test)

0.7790697674418605

# Word Embedding Class

In [17]:
from Embeddings import WordEmbeddingBR, splitWithPunctuation
import numpy as np

In [19]:
WordEmbeddingBR.downloadNILCEmbeddings()
WordEmbeddingBR.getAvailableEmbeddings()

['cbow50_wang2vec', 'glove50']

In [20]:
wee = WordEmbeddingBR('skip_s300_word2vec')

Reading embedding file: cbow50_wang2vec.zip


934967it [00:56, 16562.14it/s]


In [21]:
maxlen = max([len(x) for x in X_train])
def getSentenceVector(x, emb):
    wordArray = splitWithPunctuation(x)
    ans = np.zeros( (maxlen, emb.embDim) )
    for i,w in enumerate(wordArray):
        if len(w) > 2:
            ans[i] = emb.encodeWord(w.lower())
            ans[i] /= (np.linalg.norm(ans[i])+1e-4)
        
    return np.sum(ans,axis=0)

In [22]:
X_train[2]

'muito caro pelo tamanho e sabor . fomos em dois e gastamos r$00,00 . para fast food de tamanho normal , foi muito .'

In [23]:
getSentenceVector(X_train[2], wee).shape

(50,)

In [24]:
X_train[0]

'achei a comida bem medíocre . prato com muitas coisas , mas nada com sabor . não vale o que custa .'

In [25]:
vectTexts_train = [getSentenceVector(x, wee).reshape((-1,)) for x in X_train]
vectTexts_test = [getSentenceVector(x, wee).reshape((-1,)) for x in X_test]
getSentenceVector(X_train[0], wee)

array([-0.54267877, -2.4083582 ,  0.05858124, -0.50801632,  0.2093675 ,
       -2.41057827, -0.99857627, -0.35532332, -0.81334581, -0.72056427,
        1.39481846, -0.28752871, -0.28190608, -2.01494058,  0.91679327,
        0.15896422,  2.66689752, -1.15763982,  2.09417567,  0.91922997,
       -1.35736812,  1.84102141,  1.33348798, -1.53619593,  0.12388522,
        0.10981601, -0.39186563,  2.3167655 ,  1.03957657, -0.83999203,
       -0.75009092,  0.14593409, -0.88529186,  0.09057839,  1.9714189 ,
        0.65833036,  0.70094116,  0.80373321,  0.99129762, -1.38716639,
       -0.70253531,  0.66281073, -0.45017885,  1.34422549,  2.04371999,
       -0.03738796, -1.20501453, -0.53624573, -0.52358649, -0.71016648])

In [26]:
#try some of scikit learn classifiers
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

gbc = GradientBoostingClassifier(verbose=1, learning_rate=0.1, n_estimators=320, max_depth=6)
gbc.fit(vectTexts_train, y_train)

      Iter       Train Loss   Remaining Time 
         1        1320.3346           28.02s
         2        1209.6517           26.19s
         3        1115.1603           27.26s
         4        1037.1981           27.16s
         5         962.9793           27.60s
         6         903.4017           27.00s
         7         844.5276           27.13s
         8         795.1224           27.15s
         9         749.2693           27.33s
        10         704.9255           27.48s
        20         412.8127           25.80s
        30         273.1831           23.58s
        40         186.6105           22.10s
        50         131.8634           21.31s
        60          95.7391           20.37s
        70          71.1971           19.79s
        80          54.2056           18.85s
        90          41.3895           17.91s
       100          32.0871           17.25s
       200           6.3685            8.95s
       300           4.5454            1.36s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=320,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False)

In [27]:
print('Train score: {} Test score: {}'.format(gbc.score(vectTexts_train, y_train),gbc.score(vectTexts_test, y_test)))

Train score: 0.9980544747081712 Test score: 0.7616279069767442


In [28]:
gbParams = { #'verbose' : [1],
             'learning_rate':[0.1,0.05,0.15,0.25],  
             'n_estimators' :[50, 100, 200, 400], 
             'max_depth'    :[3,4,5,6,7,8]}
gbRSCV = RandomizedSearchCV(gbc, gbParams, verbose=1, return_train_score=True, n_iter=12) #, n_jobs=-1)
gbRSCV.fit(vectTexts_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
      Iter       Train Loss   Remaining Time 
         1         911.7711           12.91s
         2         866.9821           13.04s
         3         825.9867           12.22s
         4         787.9908           11.81s
         5         753.7134           11.81s
         6         722.1049           11.62s
         7         692.0052           11.85s
         8         664.0919           11.81s
         9         636.7105           11.72s
        10         611.2379           11.90s
        20         431.1345           10.53s
        30         309.5752            9.80s
        40         230.8327            8.93s
        50         178.7208            8.21s
        60         138.3163            7.57s
        70         110.2369            6.95s
        80          90.4521            6.35s
        90          74.8349            5.77s
       100          61.5676            5.22s
       200          10.6359           

         9         329.1382           25.50s
        10         299.5150           25.75s
        20         136.5623           23.74s
        30          74.9230           21.62s
        40          42.4078           21.07s
        50          24.6338           20.01s
        60          14.5993           18.99s
        70           9.0301           18.40s
        80           5.4786           17.74s
        90           3.4187           16.87s
       100           2.1518           16.17s
       200           0.2903            7.45s
       300           0.2903            2.57s
       400           0.2903            0.00s
      Iter       Train Loss   Remaining Time 
         1         828.7039           22.90s
         2         728.5044           24.93s
         3         642.4997           24.04s
         4         572.1883           26.58s
         5         508.2911           27.32s
         6         457.6240           27.07s
         7         411.9886           26.95s
         

         6         504.9476           29.76s
         7         455.7425           30.26s
         8         413.7717           30.32s
         9         375.9766           30.33s
        10         341.7176           30.36s
        20         148.6020           28.10s
        30          71.5428           26.31s
        40          38.5137           24.69s
        50          22.0967           23.73s
        60          13.6319           22.73s
        70           9.2042           21.84s
        80           6.5475           21.16s
        90           5.0127           20.31s
       100           4.0479           19.83s
       200           2.9359           10.28s
       300           2.8461            4.26s
       400           2.8057            0.00s
      Iter       Train Loss   Remaining Time 
         1         708.4170           15.10s
         2         523.6427           16.35s
         3         403.7190           17.40s
         4         311.1610           17.63s
         

         8         475.3743            2.56s
         9         444.8124            2.50s
        10         412.0239            2.44s
        20         207.7108            1.73s
        30         121.6741            1.11s
        40          75.9779            0.54s
        50          49.9147            0.00s
      Iter       Train Loss   Remaining Time 
         1         918.4076           10.13s
         2         879.1327           10.18s
         3         843.4398            9.70s
         4         810.8774            9.66s
         5         780.9824            9.50s
         6         752.8327            9.39s
         7         726.7030            9.37s
         8         701.5674            9.32s
         9         678.9798            9.26s
        10         657.8733            9.19s
        20         487.7309            7.95s
        30         381.7082            7.19s
        40         310.8836            6.72s
        50         256.4365            6.30s
        6

         5         598.2801           14.47s
         6         563.0051           13.89s
         7         533.1257           13.34s
         8         505.7751           12.91s
         9         481.4549           12.68s
        10         461.7804           12.57s
        20         323.1508           10.74s
        30         244.1836           10.04s
        40         190.9428            9.27s
        50         149.3276            8.79s
        60         118.9443            8.35s
        70          97.6773            8.12s
        80          81.3555            7.82s
        90          67.4391            7.62s
       100          55.9162            7.28s
       200          11.5267            4.30s
       300           4.6067            2.06s
       400           3.1792            0.00s


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  4.4min finished


      Iter       Train Loss   Remaining Time 
         1        1268.6124            3.90s
         2        1146.5145            3.72s
         3        1059.9787            3.74s
         4         996.5656            3.58s
         5         939.6297            3.55s
         6         892.3967            3.49s
         7         854.5186            3.51s
         8         821.3824            3.50s
         9         792.2583            3.44s
        10         765.8411            3.45s
        20         558.7046            2.90s
        30         443.4323            2.50s
        40         360.9052            2.08s
        50         303.5657            1.71s
        60         258.3610            1.34s
        70         220.5624            0.99s
        80         186.8658            0.65s
        90         160.0936            0.33s
       100         139.1432            0.00s


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=320,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False),
          fit_params=None, iid=True, n_iter=12, n_jobs=1,
          param_distributions={'learning_rate': [0.1, 0.05, 0.15, 0.25], 'n_estimators': [50, 100, 200, 400], 'max_depth': [3, 4, 5, 6, 7, 8]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [29]:
pd.DataFrame(gbRSCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,10.520084,0.216704,0.019131,0.00094,200,6,0.05,"{'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05}",0.739806,0.75534,0.726562,0.740597,0.011756,10,1.0,0.998053,0.998058,0.998704,0.000917
1,2.105207,0.048376,0.006821,0.000235,50,5,0.25,"{'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.25}",0.733981,0.745631,0.730469,0.736706,0.006481,11,1.0,0.998053,0.998058,0.998704,0.000917
2,7.958127,0.53473,0.016138,0.001697,200,6,0.15,"{'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.15}",0.726214,0.763107,0.734375,0.741245,0.015835,9,1.0,0.998053,0.998058,0.998704,0.000917
3,10.781545,1.971219,0.020627,0.003999,400,6,0.15,"{'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.15}",0.728155,0.768932,0.736328,0.744488,0.017628,4,1.0,0.998053,0.998058,0.998704,0.000917
4,8.341725,1.322557,0.016801,0.00201,400,4,0.25,"{'n_estimators': 400, 'max_depth': 4, 'learning_rate': 0.25}",0.749515,0.776699,0.720703,0.749027,0.022852,2,1.0,0.998053,0.998058,0.998704,0.000917
5,13.532249,2.239174,0.025285,0.00524,400,7,0.1,"{'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.1}",0.732039,0.763107,0.738281,0.744488,0.013429,4,1.0,0.998053,0.998058,0.998704,0.000917
6,6.416456,1.425685,0.012975,0.002852,200,8,0.25,"{'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.25}",0.737864,0.76699,0.728516,0.744488,0.016385,4,1.0,0.998053,0.998058,0.998704,0.000917
7,6.087736,0.314287,0.013142,0.000623,200,4,0.1,"{'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}",0.745631,0.76699,0.71875,0.743839,0.019725,8,1.0,0.998053,0.998058,0.998704,0.000917
8,2.649791,0.031695,0.007653,0.000471,50,6,0.1,"{'n_estimators': 50, 'max_depth': 6, 'learning_rate': 0.1}",0.718447,0.75534,0.722656,0.732166,0.0165,12,1.0,0.998053,0.998058,0.998704,0.000917
9,7.710826,0.183527,0.015804,0.000849,200,5,0.05,"{'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05}",0.743689,0.763107,0.726562,0.744488,0.014923,4,1.0,0.998053,0.998058,0.998704,0.000917


In [30]:
gbRSCV.best_estimator_.score(vectTexts_test, y_test)

0.7267441860465116

In [31]:
from sklearn.svm import SVC

sksvc = SVC(verbose=1, gamma=0.1, tol=1e-5, C=2, kernel='rbf')
sksvc.fit(vectTexts_train, y_train)

[LibSVM]

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=1e-05, verbose=1)

In [32]:
print('Train score: {} Test score: {}'.format(sksvc.score(vectTexts_train, y_train),sksvc.score(vectTexts_test, y_test)))

Train score: 0.9766536964980544 Test score: 0.7325581395348837


In [33]:
import scipy.stats

svmParams = { #'verbose' : [1],
             'gamma': scipy.stats.uniform(0.005,0.15),#[0.1,0.01,0.02,0.04,0.08],  
             'C' : scipy.stats.uniform(0.01,50),#[0.1,10,15, 20,25,40], 
             'shrinking'    :[True, False]}
svmRSCV = RandomizedSearchCV(sksvc, svmParams, verbose=1, return_train_score=True, n_iter=120) #, n_jobs=-1)
svmRSCV.fit(vectTexts_train, y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  1.8min finished


[LibSVM]

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=1e-05, verbose=1),
          fit_params=None, iid=True, n_iter=120, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000023BAD919940>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000023BAD919E80>, 'shrinking': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [34]:
pd.DataFrame(svmRSCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.206772,0.016503,0.037929,0.001078,26.8431,0.0625746,True,"{'C': 26.843146649758435, 'gamma': 0.06257463787001483, 'shrinking': True}",0.745631,0.768932,0.757812,0.757458,0.009525,47,1.000000,0.998053,0.997087,0.998380,0.001211
1,0.195129,0.004592,0.035267,0.002009,22.4475,0.0363221,False,"{'C': 22.44749386324993, 'gamma': 0.036322051852327356, 'shrinking': False}",0.735922,0.766990,0.759766,0.754215,0.013285,60,0.993184,0.990263,0.993204,0.992217,0.001382
2,0.205608,0.008843,0.038427,0.000816,38.0444,0.0670948,True,"{'C': 38.04436336060919, 'gamma': 0.06709482544061485, 'shrinking': True}",0.745631,0.768932,0.767578,0.760700,0.010685,38,1.000000,0.998053,0.998058,0.998704,0.000917
3,0.197146,0.005944,0.031607,0.000848,32.0874,0.0249578,False,"{'C': 32.08740962348253, 'gamma': 0.024957754470274603, 'shrinking': False}",0.724272,0.774757,0.763672,0.754215,0.021681,60,0.988315,0.983447,0.991262,0.987675,0.003223
4,0.186367,0.003871,0.041255,0.001026,37.8021,0.153958,False,"{'C': 37.80214090102072, 'gamma': 0.15395843390724673, 'shrinking': False}",0.735922,0.739806,0.724609,0.733463,0.006441,119,1.000000,0.998053,0.998058,0.998704,0.000917
5,0.201326,0.004043,0.040091,0.000624,32.3605,0.11183,True,"{'C': 32.36054590904825, 'gamma': 0.11183037825704656, 'shrinking': True}",0.741748,0.770874,0.742188,0.751621,0.013635,74,1.000000,0.998053,0.998058,0.998704,0.000917
6,0.199745,0.003705,0.038593,0.001025,25.605,0.0993973,True,"{'C': 25.605038158792023, 'gamma': 0.09939727196250232, 'shrinking': True}",0.747573,0.765049,0.744141,0.752270,0.009157,71,1.000000,0.998053,0.998058,0.998704,0.000917
7,0.173878,0.015276,0.032272,0.000623,28.4612,0.0323739,True,"{'C': 28.46116634918059, 'gamma': 0.032373927693916706, 'shrinking': True}",0.730097,0.766990,0.757812,0.751621,0.015695,74,0.993184,0.989289,0.993204,0.991892,0.001841
8,0.205110,0.003259,0.039923,0.000406,27.4488,0.11082,True,"{'C': 27.44884857837742, 'gamma': 0.11081997427336616, 'shrinking': True}",0.741748,0.770874,0.742188,0.751621,0.013635,74,1.000000,0.998053,0.998058,0.998704,0.000917
9,0.164664,0.004749,0.032106,0.000622,9.66537,0.0290361,False,"{'C': 9.665370580210976, 'gamma': 0.029036064169539157, 'shrinking': False}",0.747573,0.798058,0.765625,0.770428,0.020907,10,0.963973,0.966894,0.967961,0.966276,0.001686


In [35]:
svmRSCV.best_estimator_.score(vectTexts_test, y_test)

0.7558139534883721

In [89]:
svmRSCV.best_estimator_.score(vectTexts_test, y_test)

0.8081395348837209