# Sentiment Analysis

Download the Foursquare annotated comments in Brazilian Portuguese: https://www.kaggle.com/thaisalmeida/tips-foursquare/version/1

Place the downloaded CSV files in the `docs/` subfolder.

In [None]:
#!wget files if using Google Colab
!wget -q https://raw.githubusercontent.com/douglas125/TextClassification/master/preProcessing.py
!wget -q https://raw.githubusercontent.com/douglas125/TextClassification/master/Embeddings.py
!wget -q https://raw.githubusercontent.com/douglas125/TextClassification/master/requirements.txt
!pip install -r requirements.txt


#move CSVs to docs/ folder
from google.colab import files
files.upload()

!mkdir docs
!mv *.csv docs/
!ls

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import preProcessing

from sklearn.model_selection import RandomizedSearchCV
pd.set_option('max_colwidth',150)

In [2]:
# Load the training CSV from docs/ and preview its first rows
# (columns: 'texto' = the tip text, 'rotulo' = the numeric label).
df = pd.read_csv('docs/tips_scenario1_train.csv')
df.head(16)

Unnamed: 0,texto,rotulo
0,"A comida é deliciosa, mas pedi limonada suiça e me disseram que hoje estavam todos muito ocupados e que ninguém conseguiria me atender....melhor i...",-1.0
1,"A partir desta sexta feira dia 11 começam a abrir para jantar mas corre pois é só até as 22 hrs e no domingo dia das mães, estarão aberto durante ...",0.0
2,Joint burguer e brewdog,0.0
3,Agora de segunda a sexta o Habanero vai abrir no almoço com pratos mexicanos e tradicionais!,0.0
4,"Experimente o drink ""Dona Diabla"". Muito bom!",1.0
5,Nova senha do Wifi: 1129508219,0.0
6,Wi-fi 1129508219,0.0
7,"Adoramos a pizza carbonara e a paulistana. Não surpreendeu tanto, mas vale a pena por resgatar o tradicionalismo. Dica @Gourmet_For",1.0
8,"O diferencial desse Burger King é que você mesmo serve o refrigerante, e a vontade!",1.0
9,Unico defeito estacionamento pago!,-1.0


In [3]:
# Sanity check of clean_text on a sample sentence. In the output below the text is
# lower-cased, each digit is replaced by 0, punctuation is separated by spaces and
# the double quotes are dropped.
preProcessing.clean_text('Este é um teste de 354 números! Mas que: "interessante".')

'este é um teste de 000 números ! mas que : interessante .'

In [4]:
# Sanity check of the tokenizer: in the output below each punctuation mark
# (including the quotes) comes back as its own token.
preProcessing.splitWithPunctuation('mas que: "legal"')

['mas', 'que', ':', '"', 'legal', '"']

In [5]:
# (rows, columns) of the training frame.
df.shape

(1714, 2)

# Baseline: Bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
# Replace every missing value, in every column, with 0.
# NOTE(review): this means a row with a missing 'rotulo' silently gets label 0 and a row with
# a missing 'texto' becomes the text "0" once it is cast to str in the next cell.
# TODO confirm this is intended rather than dropping such rows.
df = df.fillna(0)

In [7]:
# Labels as a plain Python list, one entry per tip.
categs = df['rotulo'].tolist()
# Cast every tip to str, then normalise it with the project's clean_text helper.
texts = list(map(preProcessing.clean_text, df['texto'].astype(str).tolist()))

In [8]:
# Hold out 10% of the tips as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(texts, categs, test_size=0.1, random_state=42)

In [9]:
# Bag-of-words features. The vocabulary is learned from the training texts only;
# the test texts are then encoded with that same vocabulary.
countVec = CountVectorizer()
vectTexts_train = countVec.fit_transform(X_train)
vectTexts_test = countVec.transform(X_test)

In [10]:
# Display the training matrix summary (documents x vocabulary size, sparsity).
vectTexts_train

<1542x4708 sparse matrix of type '<class 'numpy.int64'>'
	with 25330 stored elements in Compressed Sparse Row format>

In [11]:
# Baseline classifier: multinomial Naive Bayes with default hyper-parameters,
# trained on the bag-of-words counts.
mnb = MultinomialNB()
mnb.fit(vectTexts_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Accuracy on the data the model was trained on (optimistic; compare with the test score).
mnb.score(vectTexts_train, y_train)

0.9105058365758755

In [13]:
# Accuracy on the held-out test split.
mnb.score(vectTexts_test, y_test)

0.7790697674418605

In [14]:
# Hyper-parameter candidates for the Naive Bayes baseline:
# smoothing strength and whether class priors are learned from the data.
mnbParams = {
    'alpha': [0.001, 0.1, 1, 10, 100],
    'fit_prior': [True, False],
}
# Seed the candidate sampler, as the train/test split is seeded, so that re-running the
# notebook evaluates the same candidates in the same order.
mnbRSCV = RandomizedSearchCV(mnb, mnbParams, verbose=1, return_train_score=True, random_state=42)
mnbRSCV.fit(vectTexts_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.1s finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'alpha': [0.001, 0.1, 1, 10, 100], 'fit_prior': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [15]:
# Cross-validation results as a table: one row per evaluated parameter combination,
# with per-fold and mean train/test scores.
pd.DataFrame(mnbRSCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_fit_prior,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.00549,0.0004073427,0.000832,0.0002350675,True,0.001,"{'fit_prior': True, 'alpha': 0.001}",0.751456,0.784466,0.753906,0.763294,0.015026,3,0.988315,0.986368,0.98932,0.988001,0.001226
1,0.004658,0.0002352357,0.000665,0.000235067,False,0.001,"{'fit_prior': False, 'alpha': 0.001}",0.737864,0.772816,0.757812,0.756161,0.01433,4,0.991237,0.986368,0.98835,0.988651,0.001999
2,0.002994,5.947204e-07,0.000499,3.893359e-07,True,0.1,"{'fit_prior': True, 'alpha': 0.1}",0.735922,0.76699,0.693359,0.732166,0.030162,5,0.982473,0.979552,0.986408,0.982811,0.002809
3,0.003493,0.0007061566,0.000665,0.0002349547,False,0.1,"{'fit_prior': False, 'alpha': 0.1}",0.693204,0.72233,0.675781,0.697147,0.019198,7,0.977605,0.976631,0.980583,0.978273,0.001681
4,0.003826,0.0008477645,0.000665,0.0002347299,True,1.0,"{'fit_prior': True, 'alpha': 1}",0.786408,0.794175,0.787109,0.789235,0.00351,1,0.914314,0.906524,0.909709,0.910182,0.003198
5,0.003826,0.0006225663,0.000831,0.0002352356,False,1.0,"{'fit_prior': False, 'alpha': 1}",0.778641,0.801942,0.777344,0.785992,0.011307,2,0.927945,0.925998,0.928155,0.927366,0.000971
6,0.003494,0.0004079274,0.000665,0.000234786,True,10.0,"{'fit_prior': True, 'alpha': 10}",0.68932,0.685437,0.693359,0.689364,0.003233,8,0.702045,0.696203,0.692233,0.696827,0.00403
7,0.002994,1.94668e-07,0.000666,0.0002357976,False,10.0,"{'fit_prior': False, 'alpha': 10}",0.699029,0.700971,0.712891,0.70428,0.006122,6,0.740019,0.717624,0.720388,0.726011,0.00997
8,0.002994,3.371748e-07,0.000665,0.0002355168,True,100.0,"{'fit_prior': True, 'alpha': 100}",0.681553,0.681553,0.683594,0.682231,0.000961,10,0.682571,0.682571,0.681553,0.682232,0.00048
9,0.003327,0.0002362491,0.000666,0.0002351794,False,100.0,"{'fit_prior': False, 'alpha': 100}",0.685437,0.683495,0.689453,0.686122,0.002479,9,0.694255,0.689387,0.68932,0.690987,0.002311


In [16]:
# Test accuracy of the best candidate found by the search. In the recorded run the best
# candidate is alpha=1, fit_prior=True (the defaults), so this matches the earlier test score.
mnbRSCV.best_estimator_.score(vectTexts_test, y_test)

0.7790697674418605

# Word Embedding Class

In [17]:
from Embeddings import WordEmbeddingBR, splitWithPunctuation
import numpy as np

In [19]:
# Presumably fetches the NILC pre-trained embedding files if needed
# (behaviour lives in Embeddings.py; verify there).
WordEmbeddingBR.downloadNILCEmbeddings()
# List the embedding names that can be passed to the WordEmbeddingBR constructor.
WordEmbeddingBR.getAvailableEmbeddings()

['cbow50_fasttext', 'cbow50_wang2vec', 'glove50']

In [21]:
# Load one of the available embeddings by name. In the recorded run this reads
# cbow50_wang2vec.zip and takes about a minute.
wee = WordEmbeddingBR('cbow50_wang2vec')

Reading embedding file: cbow50_wang2vec.zip


934967it [00:59, 15727.38it/s]


In [26]:
# Train the embedding-based baselines on the training split. Per the log below this runs a
# 3-fold hyper-parameter search for an SVM and for a gradient boosted tree model.
# NOTE(review): n_iter presumably controls the number of search candidates; the log shows
# 8 candidates for the SVM and 4 for the boosted trees, so confirm its meaning in Embeddings.py.
classifiers = wee.TrainBaselineClassifiers(X_train, y_train, n_iter=4)

Fitting Support Vector Machine...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    7.9s finished


[LibSVM]Fitting Gradient Boosted Tree...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
      Iter       Train Loss   Remaining Time 
         1         658.5195            6.26s
         2         495.4146            6.12s
         3         380.5770            6.32s
         4         297.1064            6.83s
         5         243.7370            6.74s
         6         212.2517            6.74s
         7         179.5781            6.47s
         8         152.4104            6.27s
         9         132.1538            6.12s
        10         115.4280            5.95s
        20          37.9122            4.85s
        30          13.3994            4.19s
        40           5.1836            3.59s
        50           1.8979            3.04s
        60           0.8089            2.58s
        70           0.3634            2.07s
        80           0.2790            1.48s
        90           0.2790            0.95s
       100           0.2790            0.52

        20         347.1043           10.30s
        30         244.9993            9.07s
        40         182.6824            8.28s
        50         140.6383            7.60s
        60         109.2661            7.08s
        70          88.5923            6.55s
        80          71.3518            6.05s
        90          57.6216            5.58s
       100          47.1368            5.15s
       200           8.3875            1.06s
      Iter       Train Loss   Remaining Time 
         1         663.8706           37.10s
         2         475.0937           38.23s
         3         346.4510           39.12s
         4         251.0832           38.49s
         5         185.3167           38.60s
         6         141.1186           38.26s
         7         106.7624           38.85s
         8          79.6621           38.81s
         9          58.7069           38.76s
        10          43.7401           38.67s
        20           3.4129           34.41s
        3

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  1.4min finished


      Iter       Train Loss   Remaining Time 
         1        1046.0737            7.97s
         2         848.6318            7.89s
         3         705.4500            7.73s
         4         598.5587            7.87s
         5         497.5283            7.63s
         6         436.0509            7.43s
         7         381.8273            7.55s
         8         335.0726            7.40s
         9         299.2200            7.25s
        10         270.3118            7.29s
        20         105.2294            6.11s
        30          52.3799            5.49s
        40          26.7224            4.86s
        50          15.5067            4.24s
        60          10.4264            3.51s
        70           7.7755            2.80s
        80           6.2265            2.13s
        90           5.3279            1.50s
       100           4.8589            0.89s


In [28]:
# Score each trained classifier on the training split (returns {classifier name: score};
# the metric is presumably accuracy, confirm in Embeddings.py).
# NOTE(review): this cell's execution count (28) is higher than the next cell's (27),
# so the cells were run out of order.
wee.TestBaselineClassifiers(X_train, y_train, classifiers)

{'SVM': 0.9980544747081712, 'GradientBoostingClassifier': 0.9980544747081712}

In [27]:
# Score each trained classifier on the held-out test split (returns {classifier name: score}).
wee.TestBaselineClassifiers(X_test, y_test, classifiers)

{'SVM': 0.7732558139534884, 'GradientBoostingClassifier': 0.7558139534883721}

In [89]:
# NOTE(review): `svmRSCV` is not created by any cell visible in this notebook, and this cell's
# execution count (89) is far out of sequence. It appears to rely on leftover kernel state and
# would raise NameError after Restart & Run All.
# TODO: restore the cell that fits svmRSCV (presumably a search over an SVM trained on the
# bag-of-words features, since it is scored on vectTexts_test) or remove this cell.
svmRSCV.best_estimator_.score(vectTexts_test, y_test)

0.8081395348837209