# Sentiment Analysis

Read the Foursquare annotated comments in Brazilian Portuguese

In [1]:
import pandas as pd
pd.set_option('max_colwidth',150)

In [2]:
df = pd.read_csv('docs/tips_scenario1_train.csv')
df.head(16)

Unnamed: 0,texto,rotulo
0,"A comida é deliciosa, mas pedi limonada suiça e me disseram que hoje estavam todos muito ocupados e que ninguém conseguiria me atender....melhor i...",-1.0
1,"A partir desta sexta feira dia 11 começam a abrir para jantar mas corre pois é só até as 22 hrs e no domingo dia das mães, estarão aberto durante ...",0.0
2,Joint burguer e brewdog,0.0
3,Agora de segunda a sexta o Habanero vai abrir no almoço com pratos mexicanos e tradicionais!,0.0
4,"Experimente o drink ""Dona Diabla"". Muito bom!",1.0
5,Nova senha do Wifi: 1129508219,0.0
6,Wi-fi 1129508219,0.0
7,"Adoramos a pizza carbonara e a paulistana. Não surpreendeu tanto, mas vale a pena por resgatar o tradicionalismo. Dica @Gourmet_For",1.0
8,"O diferencial desse Burger King é que você mesmo serve o refrigerante, e a vontade!",1.0
9,Unico defeito estacionamento pago!,-1.0


In [3]:
import re
def splitWithPunctuation(text):
    return re.findall(r"[\w']+|[.,!?;:\"]", text)    

In [4]:
splitWithPunctuation('mas que: "legal"')

['mas', 'que', ':', '"', 'legal', '"']

In [5]:
df.shape

(1714, 2)

# Baseline: Bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
df = df.fillna(0)

In [7]:
texts = df['texto'].astype(str).tolist()
categs = df['rotulo'].tolist()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(texts, categs, test_size=0.1, random_state=42)

In [9]:
countVec = CountVectorizer()
vectTexts_train = countVec.fit_transform(X_train)
vectTexts_test = countVec.transform(X_test)

In [10]:
vectTexts_train

<1542x4781 sparse matrix of type '<class 'numpy.int64'>'
	with 25423 stored elements in Compressed Sparse Row format>

In [11]:
mnb = MultinomialNB()
mnb.fit(vectTexts_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
mnb.score(vectTexts_train, y_train)

0.9137483787289234

In [13]:
mnb.score(vectTexts_test, y_test)

0.7790697674418605

# Word Embedding Class

In [14]:
from Embeddings import WordEmbeddingBR, splitWithPunctuation
import numpy as np

In [2]:
WordEmbeddingBR.downloadNILCEmbeddings()
WordEmbeddingBR.getAvailableEmbeddings()

Downloading NILC word embeddings. More available at http://nilc.icmc.usp.br/embeddings
Downloading glove50 from http://143.107.183.175:22980/download.php?file=embeddings/glove/glove_s50.zip


178KKB [1:15:44, 39.2KB/s]                                                                    


Downloading cbow50_wang2vec from http://143.107.183.175:22980/download.php?file=embeddings/wang2vec/cbow_s50.zip


179KKB [50:24, 59.3KB/s]                                                                      


Downloading cbow50_fasttext from http://143.107.183.175:22980/download.php?file=embeddings/fasttext/cbow_s50.zip


  1%|▍                                                    | 1.29K/155K [00:25<44:31, 57.4KB/s]

KeyboardInterrupt: 

  1%|▍                                                  | 1.29K/155K [00:40<1:19:10, 32.3KB/s]

In [15]:
wee = WordEmbeddingBR('cbow50_wang2vec')

Reading embedding file: cbow50_wang2vec.zip


934967it [00:53, 17362.67it/s]


In [29]:
maxlen = max([len(x) for x in X_train])
def getSentenceVector(x, emb):
    wordArray = splitWithPunctuation(x)
    ans = np.zeros( (maxlen, emb.embDim) )
    for i,w in enumerate(wordArray):
        ans[i] = emb.encodeWord(w.lower())
        ans[i] /= (np.linalg.norm(ans[i])+1e-4)
        
    return ans

In [34]:
X_train[2]

'Muito caro pelo tamanho e sabor. Fomos em dois e gastamos R$60,00. Para fast food de tamanho normal, foi muito.'

In [38]:
getSentenceVector(X_train[2], wee)[0:10]

array([[-5.05464027e-02,  2.78965577e-02, -5.00163159e-03,
        -1.22586421e-01,  1.40743375e-01, -2.89156209e-01,
        -9.89202798e-02, -7.40342671e-02, -1.33117794e-01,
        -1.21884777e-01,  2.06737795e-02,  5.87740136e-02,
         2.26668426e-02, -2.09829375e-01,  3.32079798e-01,
         1.62030253e-02,  7.72076725e-02,  3.35773012e-02,
         2.33533860e-01,  4.39359321e-02, -4.30212260e-02,
         5.66354830e-02,  2.75821028e-01, -7.38374116e-02,
         7.16115215e-02,  1.02926170e-01, -7.39445357e-02,
         6.39970088e-02,  2.45025793e-01, -9.81186273e-02,
         9.45985507e-02,  8.40323915e-02, -1.05096324e-02,
         6.17323800e-02,  2.47888104e-01, -3.83512445e-03,
         8.51099579e-02,  6.55252082e-02,  4.06108531e-01,
        -1.97363047e-01, -1.65714771e-01,  1.85374626e-01,
        -1.17492686e-01,  1.13006911e-01,  1.08819976e-02,
        -7.86623476e-02, -1.27782931e-01,  1.15749448e-01,
         4.12016962e-02,  5.72331647e-02],
       [-2.48

In [41]:
vectTexts_train = [getSentenceVector(x, wee).reshape((-1,)) for x in X_train]
vectTexts_test = [getSentenceVector(x, wee).reshape((-1,)) for x in X_test]
getSentenceVector(X_train[0], wee)

array([[ 0.13147829, -0.15114349, -0.08199338, ..., -0.17250816,
        -0.00402809, -0.11158497],
       [ 0.09640924,  0.04780224,  0.0055655 , ...,  0.07664581,
         0.07899589,  0.08604658],
       [-0.23149888, -0.06592918,  0.06388329, ..., -0.07717097,
         0.29113467, -0.15354034],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [None]:
#try some of scikit learn classifiers
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
gbc = GradientBoostingClassifier(verbose=1, learning_rate=0.01, n_estimators=800, max_depth=4)
gbc.fit(vectTexts_train, y_train)

      Iter       Train Loss   Remaining Time 
         1        1436.2329           25.22m
         2        1427.9516           25.29m
         3        1419.7201           25.37m
         4        1411.7423           25.33m
         5        1403.7663           25.33m
         6        1395.9774           25.32m
         7        1388.3512           25.26m
         8        1380.7659           25.40m
         9        1373.4924           26.28m
        10        1366.1588           26.48m
        20        1295.3412           28.30m
        30        1228.3983           27.99m
        40        1170.1145           26.97m
        50        1117.3483           26.29m
        60        1069.6253           25.72m
        70        1026.0009           25.20m
        80         987.7744           24.82m
        90         951.8734           24.52m
       100         918.2048           24.46m
       200         700.3390           20.65m
       300         573.0794           16.70m
       40

In [None]:
print('Train score: {} Test score: {}'.format(gbc.score(vectTexts_train, y_train),gbc.score(vectTexts_test, y_test)))

In [53]:
gbParams = { #'verbose' : [1],
             'learning_rate':[0.1,0.01,0.001],  
             'n_estimators' :[10, 50, 100, 200], 
             'max_depth'    :[3,4,5]}
gbRSCV = RandomizedSearchCV(gbc, gbParams, verbose=1) #, n_jobs=-1)
gbRSCV.fit(vectTexts_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
      Iter       Train Loss   Remaining Time 
         1         911.6240            4.97s
         2         868.0945            5.63s
         3         830.5351            6.06s
         4         797.2125            5.87s
         5         768.6529            5.90s
         6         742.6890            6.02s
         7         721.1076            5.85s
         8         699.8268            5.76s
         9         681.4205            5.76s
        10         662.7929            5.67s
        20         543.6652            4.74s
        30         464.4371            4.22s
        40         404.8558            3.91s
        50         359.7200            3.84s
        60         322.4471            3.53s
        70         290.7074            3.25s
        80         262.9695            3.01s
        90         235.9893            2.70s
       100         213.4610            2.43s
       200          91.1978           

         7         582.0981            9.44s
         8         547.6361            9.45s
         9         516.5319            9.47s
        10         484.9291            9.42s
        20         293.5168            8.17s
        30         197.9318            7.39s
        40         136.9017            6.80s
        50         100.0028            6.27s
        60          74.0695            5.74s
        70          56.3086            5.26s
        80          43.5444            4.82s
        90          34.4314            4.40s
       100          27.7283            3.97s
       200           6.0209            0.00s
      Iter       Train Loss   Remaining Time 
         1         961.2794            0.32s
         2         960.6352            0.29s
         3         959.9922            0.26s
         4         959.3505            0.22s
         5         958.7086            0.19s
         6         958.0694            0.15s
         7         957.4316            0.11s
         

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.7min finished


      Iter       Train Loss   Remaining Time 
         1        1372.1220            6.95s
         2        1310.2252            7.56s
         3        1258.1945            7.70s
         4        1214.3457            7.85s
         5        1172.9817            7.80s
         6        1137.6678            7.81s
         7        1105.3929            7.75s
         8        1077.6997            8.01s
         9        1050.0682            7.95s
        10        1025.6831            7.78s
        20         862.7868            6.54s
        30         758.8188            5.86s
        40         686.2422            5.38s
        50         625.1313            4.95s
        60         576.0591            4.54s
        70         526.6949            4.19s
        80         487.6930            3.83s
        90         452.2790            3.50s
       100         423.9086            3.16s
       200         217.7912            0.00s


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [10, 50, 100, 200], 'max_depth': [3, 4, 5]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [54]:
pd.DataFrame(gbRSCV.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,4.373492,0.091101,0.011978,0.000705,200,3,0.1,"{'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1}",0.728155,0.741748,0.730469,0.733463,0.005942,1,0.999026,0.998053,0.997087,0.998055,0.000792
1,7.772204,0.248362,0.017135,0.000622,200,5,0.001,"{'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.001}",0.68932,0.695146,0.689453,0.69131,0.002717,5,0.718598,0.706913,0.71068,0.712064,0.00487
2,0.50687,0.01746,0.004325,0.000235,10,5,0.001,"{'n_estimators': 10, 'max_depth': 5, 'learning_rate': 0.001}",0.681553,0.681553,0.683594,0.682231,0.000961,6,0.682571,0.682571,0.681553,0.682232,0.00048
3,1.604947,0.018219,0.005989,1e-06,50,4,0.001,"{'n_estimators': 50, 'max_depth': 4, 'learning_rate': 0.001}",0.681553,0.681553,0.683594,0.682231,0.000961,6,0.682571,0.682571,0.681553,0.682232,0.00048
4,2.030638,0.043805,0.006654,0.000624,50,5,0.001,"{'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.001}",0.681553,0.681553,0.683594,0.682231,0.000961,6,0.682571,0.682571,0.681553,0.682232,0.00048
5,4.314938,0.017325,0.011146,0.000236,200,3,0.01,"{'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01}",0.702913,0.720388,0.71875,0.714008,0.007885,4,0.817916,0.806232,0.82233,0.815493,0.006792
6,7.450524,0.096367,0.01647,0.000814,200,5,0.1,"{'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1}",0.72233,0.745631,0.707031,0.725032,0.015866,2,0.999026,0.998053,0.997087,0.998055,0.000792
7,0.409721,0.022981,0.003827,0.000471,10,4,0.001,"{'n_estimators': 10, 'max_depth': 4, 'learning_rate': 0.001}",0.681553,0.681553,0.683594,0.682231,0.000961,6,0.682571,0.682571,0.681553,0.682232,0.00048
8,5.698329,0.013964,0.013808,0.000848,200,4,0.1,"{'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}",0.732039,0.735922,0.705078,0.724384,0.013704,3,0.999026,0.998053,0.997087,0.998055,0.000792
9,0.492895,0.00783,0.004325,0.000236,10,5,0.01,"{'n_estimators': 10, 'max_depth': 5, 'learning_rate': 0.01}",0.681553,0.681553,0.683594,0.682231,0.000961,6,0.682571,0.682571,0.681553,0.682232,0.00048


In [43]:
q = wee.encodeWord('tóquio')-wee.encodeWord('japão')+wee.encodeWord('alemanha')
q

array([-0.180629, -0.044899, -0.145163, -0.018974,  0.237544,  0.452695,
        0.004599, -0.091597, -0.993889,  0.360783, -0.386382, -0.512438,
       -0.658062,  0.420846,  0.162521,  0.553969,  0.099122,  0.755036,
       -0.471112,  0.830366,  0.08927 , -0.260255,  0.572333, -0.748436,
       -0.28951 ,  0.373733, -0.111502,  0.255532, -0.529217,  0.01092 ,
       -0.328379,  0.311962, -0.546522,  0.058784, -0.089845,  0.265421,
        0.26021 ,  0.060581,  0.492156, -0.294599,  0.00214 , -0.079839,
        0.056477,  0.535249, -0.26015 ,  0.226907, -1.349362,  0.774546,
        0.156021, -0.314995])

In [44]:
wee.wordFromEmbedding(q)

[{'berlim': 0.8206403394718784},
 {'zurique': 0.8176687002454563},
 {'frankfurt': 0.7990995590380496},
 {'munique': 0.7966431526280903},
 {'tóquio': 0.7872814224416539},
 {'bratislava': 0.7839682571121654},
 {'budapeste': 0.7817987924868405},
 {'viena': 0.7796727611854269},
 {'londres': 0.7750696819697582},
 {'cracóvia': 0.7741654426676957}]