# Sentiment analysis
Read https://web.stanford.edu/~jurafsky/slp3/19.pdf

### Using Lexicon

In [1]:
from nlputils.lexical import Preprocessing
from nltk.corpus import stopwords
import re

stopwords = stopwords.words('portuguese')
normalizer = Preprocessing()

In [2]:
# download LIWC resource at http://143.107.183.175:21380/portlex/images/arquivos/liwc/LIWC2007_Portugues_win.dic.txt
# posemo = 126
# negemo = 127
# what more?
positives = []
negatives = []

with open('LIWC2007_Portugues_win.dic.txt', 'r', encoding='latin') as liwc_file:
    in_header = True
    for line in liwc_file.readlines():
        if not re.match('^\d+', line):
            parts = line.split()
            word = parts.pop(0)
            if '126' in parts:
                positives.append(word)
            elif '127' in parts:
                negatives.append(word)

In [3]:
'feliz' in positives

True

In [4]:
'triste' in negatives

True

In [5]:
def lexical_sentment_analysis(text, binary=False):
    text = normalizer.remove_punctuation(text)
    tokens = normalizer.tokenize_words(text)
    tokens = normalizer.remove_stopwords(tokens)
    
    polarity = 0
    
    for token in tokens:
        if token in positives:
            polarity += 1
        elif token in negatives:
            polarity -= 1
    if not binary:
        return polarity
    else:
        if polarity < 0:
            polarity = -1
        elif polarity > 0:
            polarity = 1
        
        return polarity

In [6]:
lexical_sentment_analysis('Eu estou muito triste e triste')

-2

### Training a classifier

In [7]:
# using dataset of IMDb, available at: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
import wget
import os
import tarfile
import pandas as pd

filename = "dataset/aclImdb_v1.tar.gz"


# Donwload data

dataset_link = "http://ai.stanford.edu/~amaas/data/sentiment/{}".format("aclImdb_v1.tar.gz")
try:
    os.mkdir("dataset")
except OSError:
    pass

if not os.path.isfile(filename):
    file = wget.download(dataset_link, out='dataset/aclImdb_v1.tar.gz')
    tar = tarfile.open(filename, "r:gz")
    tar.extractall("dataset")
    tar.close()


# read data

dataset_path = 'dataset/aclImdb'
train_positive_files = ['train/pos/'+f for f in os.listdir(dataset_path+'/train/pos') \
                        if os.path.isfile(os.path.join(dataset_path+'/train/pos', f))]

train_negative_files = ['train/neg/'+f for f in os.listdir(dataset_path+'/train/neg') \
                        if os.path.isfile(os.path.join(dataset_path+'/train/neg', f))]

test_positive_files = ['test/pos/'+f for f in os.listdir(dataset_path+'/test/pos') \
                       if os.path.isfile(os.path.join(dataset_path+'/test/pos', f))]

test_negative_files = ['test/neg/'+f for f in os.listdir(dataset_path+'/test/neg') \
                       if os.path.isfile(os.path.join(dataset_path+'/test/neg', f))]

all_files = list(set().union(train_positive_files,train_negative_files, test_positive_files, test_negative_files))

dataset = {'trainset':[], 'polarity':[], 'bin_polarity': [], 'review':[]}

for file in all_files:
    polarity = file.split('.')[0].split('_')[1]
    with open(os.path.join(dataset_path, file), 'r') as text_file:
        dataset['trainset'].append(file.split('/')[0])
        bin_polarity = 1 if int(polarity) > 5 else 0  # transform into binary polarity
        dataset['bin_polarity'].append(bin_polarity)
        dataset['polarity'].append(polarity)
        dataset['review'].append(text_file.readlines()[0])

        
# create dataframe

dataframe = pd.DataFrame(data=dataset)
dataframe.head()

Unnamed: 0,trainset,polarity,bin_polarity,review
0,test,10,1,This show is a show that is great for adults a...
1,test,1,0,This movie cannot be serious because it has a ...
2,test,8,1,This is a pretty good made for TV flick of the...
3,train,1,0,I've seen better production quality on YouTube...
4,train,7,1,"When it first came out, this work by the Meyse..."


#### Preprocessing

In [8]:
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

def preprocessing(text):
    text = normalizer.lowercase(text)
    text = normalizer.remove_punctuation(text)
    tokens = normalizer.tokenize_words(text)
    tokens = [token for token in tokens if token not in english_stopwords]
    return ' '.join(tokens)

dataframe['normalized_review'] = dataframe['review'].apply(preprocessing)
dataframe.head()

Unnamed: 0,trainset,polarity,bin_polarity,review,normalized_review
0,test,10,1,This show is a show that is great for adults a...,show show great adults children sit together w...
1,test,1,0,This movie cannot be serious because it has a ...,movie serious nerdy looking kid named curtis k...
2,test,8,1,This is a pretty good made for TV flick of the...,pretty good made tv flick variety terrorists e...
3,train,1,0,I've seen better production quality on YouTube...,ive seen better production quality youtube pit...
4,train,7,1,"When it first came out, this work by the Meyse...",first came work meysels brothers much criticiz...


### feature extraction

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
train_reviews = dataframe[dataframe['trainset'] == 'train']['normalized_review'].values.tolist()
train_classes = dataframe[dataframe['trainset'] == 'train']['bin_polarity'].values.tolist()
test_reviews = dataframe[dataframe['trainset'] == 'test']['normalized_review'].values.tolist()
test_classes = dataframe[dataframe['trainset'] == 'test']['bin_polarity'].values.tolist()

transformer = TfidfVectorizer()
transformer.fit(train_reviews)
X = transformer.transform(train_reviews)
X_test = transformer.transform(test_reviews)

### training classifier

In [25]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [12]:
classifier = SVC()
classifier.fit(X, train_classes)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [28]:
accuracy_score(test_classes, classifier.predict(X_test))

0.65392

In [24]:
classifier_lr = LogisticRegression()
classifier_lr.fit(X, train_classes)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
accuracy_score(test_classes, classifier_lr.predict(X_test))

0.88452

# using the classifier

In [30]:
sentence = "This film was really bad!"
preprocessed_sentence = preprocessing(sentence)
print(preprocessed_sentence)
instance = transformer.transform([preprocessing(sentence)])
print(instance)
classifier.predict(instance)

film really bad
  (0, 87080)	0.5961343855298462
  (0, 39451)	0.43855537677700446
  (0, 9716)	0.6725273049393103


array([0])

In [31]:
sentence = "Good film!"
preprocessed_sentence = preprocessing(sentence)
print(preprocessed_sentence)
instance = transformer.transform([preprocessing(sentence)])
print(instance)
classifier.predict(instance)

good film
  (0, 45171)	0.7750865021715609
  (0, 39451)	0.6318551369985488


array([1])