## Importando Bibliotecas

In [43]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re

## Carregando a base de dados

In [44]:
# Load the labelled sentences; 'data.csv' is a relative path, so it is resolved against the current working directory.
dataset = pd.read_csv('data.csv')

In [45]:
# Preview the loaded frame (pandas elides the middle rows of a long frame).
dataset

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


## Explorando a base de dados

In [46]:
# Summarise the column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB


In [47]:
# Per-column count, number of distinct values, most frequent value and its frequency.
dataset.describe()

Unnamed: 0,Sentence,Sentiment
count,5842,5842
unique,5322,3
top,Managing Director 's comments : `` Net sales f...,neutral
freq,2,3130


In [48]:
# True if at least one cell of the frame is missing.
dataset.isnull().values.any()

False

## Tratando a base de dados

In [49]:
# Removing duplicates (currently disabled).
# NOTE(review): describe() above reports 5322 unique sentences in 5842 rows, so repeated
# sentences are still present in the data - confirm that leaving this step off is intentional.
# dataset = dataset.drop_duplicates()
# dataset.head()

In [50]:
# Fetch the NLTK resources used below: the stopword lists, the 'punkt'
# tokenizer models and the WordNet data used by the lemmatizer.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# English stopword list and the lemmatizer shared by the normalizer function.
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pichau\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pichau\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pichau\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [51]:
def normalizer(texto):
    """Turn a raw sentence into a list of lower-cased, lemmatized tokens.

    Steps, in order: hyphens become spaces, every character that is neither a
    word character nor whitespace is removed, digits are removed, the text is
    tokenized, tokens are lower-cased, tokens present in the module-level
    ``stop`` list are discarded, and each remaining token is lemmatized as a
    verb.

    Args:
        texto: The sentence to normalize (a string).

    Returns:
        list: The normalized tokens, in their original order.
    """
    # Ordered clean-up rules. The hyphen rule runs first so that a compound
    # such as "Finnish-Russian" ends up as "finnish russian" rather than as a
    # single fused token once punctuation is stripped.
    cleanup_rules = (
        (r'-', ' '),         # hyphen -> space
        (r'[^\w\s]', ''),    # strip punctuation
        (r'[\d]', ''),       # strip digits
    )
    for pattern, replacement in cleanup_rules:
        texto = re.sub(pattern, replacement, texto)

    lowered = (token.lower() for token in word_tokenize(texto))
    # Stopwords are filtered on the lower-cased form; survivors are lemmatized
    # with pos="v", i.e. treated as verbs.
    return [lemmatizer.lemmatize(token, pos="v") for token in lowered if token not in stop]



In [52]:
# Quick sanity check of normalizer on a short example sentence.
normalizer("Here is text about an airline I like.")

['text', 'airline', 'like']

In [53]:
# Store the token list returned by normalizer for every sentence in a new 'normalizer' column.
dataset["normalizer"] = dataset["Sentence"].apply(normalizer)
dataset.head()

Unnamed: 0,Sentence,Sentiment,normalizer
0,The GeoSolutions technology will leverage Bene...,positive,"[geosolutions, technology, leverage, benefon, ..."
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,"[esi, low, bk, real, possibility]"
2,"For the last quarter of 2010 , Componenta 's n...",positive,"[last, quarter, componenta, net, sales, double..."
3,According to the Finnish-Russian Chamber of Co...,neutral,"[accord, finnish, russian, chamber, commerce, ..."
4,The Swedish buyout firm has sold its remaining...,neutral,"[swedish, buyout, firm, sell, remain, percent,..."


In [54]:
nltk.download('averaged_perceptron_tagger')

# Tag every token with its part of speech (noun, verb, adjective, ...).
# NOTE(review): the tagger receives tokens that were already lower-cased,
# stripped of stopwords and lemmatized, so it sees no sentence context or
# capitalisation - confirm that this is acceptable for the analysis below.
dataset['pos_tags'] = dataset['normalizer'].apply(pos_tag)
dataset['pos_tags']


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pichau\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


0       [(geosolutions, NNS), (technology, NN), (lever...
1       [(esi, RB), (low, JJ), (bk, NN), (real, JJ), (...
2       [(last, JJ), (quarter, NN), (componenta, VBD),...
3       [(accord, NN), (finnish, JJ), (russian, JJ), (...
4       [(swedish, JJ), (buyout, IN), (firm, NN), (sel...
                              ...                        
5837    [(rise, NN), (cost, NN), (force, NN), (package...
5838    [(nordic, JJ), (walk, NN), (first, RB), (use, ...
5839    [(accord, NN), (ship, NN), (company, NN), (vik...
5840    [(build, VB), (home, NN), (improvement, NN), (...
5841    [(helsinki, NN), (afx, NN), (kci, NN), (konecr...
Name: pos_tags, Length: 5842, dtype: object

In [55]:
# Verb tags kept by the filter below:
#   VB  - verb, base form
#   VBD - verb, past tense
#   VBG - verb, gerund or present participle
#   VBN - verb, past participle
#   VBP - verb, non-3rd person singular present
#   VBZ - verb, 3rd person singular present
# For reference: nouns are tagged "NN" and adjectives "JJ".
verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

# Keep only the tokens whose tag is one of the verb tags.
dataset['only_vb'] = dataset['pos_tags'].apply(
    lambda tagged: [token for token, tag in tagged if tag in verb_tags]
)
dataset

Unnamed: 0,Sentence,Sentiment,normalizer,pos_tags,only_vb
0,The GeoSolutions technology will leverage Bene...,positive,"[geosolutions, technology, leverage, benefon, ...","[(geosolutions, NNS), (technology, NN), (lever...","[benefon, provide, platform]"
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,"[esi, low, bk, real, possibility]","[(esi, RB), (low, JJ), (bk, NN), (real, JJ), (...",[]
2,"For the last quarter of 2010 , Componenta 's n...",positive,"[last, quarter, componenta, net, sales, double...","[(last, JJ), (quarter, NN), (componenta, VBD),...",[componenta]
3,According to the Finnish-Russian Chamber of Co...,neutral,"[accord, finnish, russian, chamber, commerce, ...","[(accord, NN), (finnish, JJ), (russian, JJ), (...",[]
4,The Swedish buyout firm has sold its remaining...,neutral,"[swedish, buyout, firm, sell, remain, percent,...","[(swedish, JJ), (buyout, IN), (firm, NN), (sel...","[remain, take]"
...,...,...,...,...,...
5837,RISING costs have forced packaging producer Hu...,negative,"[rise, cost, force, package, producer, huhtama...","[(rise, NN), (cost, NN), (force, NN), (package...",[]
5838,Nordic Walking was first used as a summer trai...,neutral,"[nordic, walk, first, use, summer, train, meth...","[(nordic, JJ), (walk, NN), (first, RB), (use, ...",[use]
5839,"According shipping company Viking Line , the E...",neutral,"[accord, ship, company, viking, line, eu, deci...","[(accord, NN), (ship, NN), (company, NN), (vik...",[viking]
5840,"In the building and home improvement trade , s...",neutral,"[build, home, improvement, trade, sales, decre...","[(build, VB), (home, NN), (improvement, NN), (...","[build, decrease]"


In [56]:
# Tally, for every verb in 'only_vb', how many times it occurs overall ('cout')
# and how many of those occurrences fall in positive / negative / neutral sentences.
#
# The verb lists and the sentiment labels are walked together, position by
# position, so the tally stays aligned even when the frame's index is not
# 0..n-1 (e.g. after rows have been dropped).
#
# NOTE: this cell expects the original string labels in 'Sentiment'. Any value
# other than 'positive' or 'negative' is counted as 'neutral', so running it
# after the labels have been converted to integers puts every occurrence in
# the 'neutral' bucket.
cout = {}
for verbs, sentiment in zip(dataset['only_vb'], dataset['Sentiment']):
    bucket = sentiment if sentiment in ('positive', 'negative') else 'neutral'
    for word in verbs:
        key = f'{word}'
        if key not in cout:
            cout[key] = {
                'cout': 0,
                'positive': 0,
                'negative': 0,
                'neutral': 0
            }
        cout[key]['cout'] += 1
        cout[key][bucket] += 1

# Most frequent verbs first; the sort is stable, so ties keep first-seen order.
cout = dict(sorted(cout.items(), key=lambda item: item[1]['cout'], reverse=True))

cout
    

{'say': {'cout': 578, 'positive': 221, 'negative': 73, 'neutral': 284},
 'include': {'cout': 179, 'positive': 17, 'negative': 5, 'neutral': 157},
 'expect': {'cout': 138, 'positive': 53, 'negative': 12, 'neutral': 73},
 'make': {'cout': 130, 'positive': 31, 'negative': 17, 'neutral': 82},
 'fell': {'cout': 97, 'positive': 3, 'negative': 44, 'neutral': 50},
 'go': {'cout': 93, 'positive': 40, 'negative': 24, 'neutral': 29},
 'take': {'cout': 85, 'positive': 18, 'negative': 14, 'neutral': 53},
 'continue': {'cout': 83, 'positive': 30, 'negative': 14, 'neutral': 39},
 'buy': {'cout': 80, 'positive': 53, 'negative': 5, 'neutral': 22},
 'sell': {'cout': 80, 'positive': 24, 'negative': 15, 'neutral': 41},
 'eur': {'cout': 80, 'positive': 21, 'negative': 13, 'neutral': 46},
 'hold': {'cout': 78, 'positive': 17, 'negative': 4, 'neutral': 57},
 'finland': {'cout': 77, 'positive': 18, 'negative': 9, 'neutral': 50},
 'provide': {'cout': 72, 'positive': 20, 'negative': 4, 'neutral': 48},
 'develop

In [57]:
def sentiment2target(sentiment):
    """Map a sentiment label to its integer class target.

    'negative' -> 0, 'neutral' -> 1, 'positive' -> 2.

    A value that is already one of the integer targets is returned unchanged,
    so applying the mapping to an already-encoded column (for example when the
    cell is executed a second time) is a no-op instead of an error.

    Args:
        sentiment: A label string, or an already-encoded target (0, 1 or 2).

    Returns:
        The integer target for the label.

    Raises:
        KeyError: If ``sentiment`` is neither a known label nor a known target.
    """
    targets = {
        'negative': 0,
        'neutral': 1,
        'positive': 2
    }
    # Pass through values that were encoded by an earlier call.
    if not isinstance(sentiment, str) and sentiment in targets.values():
        return sentiment
    return targets[sentiment]
# Replace the labels in 'Sentiment' with their integer targets and show the result.
dataset['Sentiment'] = dataset['Sentiment'].apply(sentiment2target)
dataset['Sentiment']

0       2
1       0
2       2
3       1
4       1
       ..
5837    0
5838    1
5839    1
5840    1
5841    2
Name: Sentiment, Length: 5842, dtype: int64

In [58]:

from sklearn.model_selection import train_test_split

def com(lista):
    """Join a sequence of tokens into one space-separated string.

    Args:
        lista: Iterable of string tokens, or a string that was already joined.

    Returns:
        str: The tokens separated by single spaces, with no trailing space.
        A string argument is returned unchanged: joining it again would
        insert a space between every character, which is what happened when
        the cell that applies this function was executed twice.
    """
    if isinstance(lista, str):
        return lista
    return ' '.join(lista)

# Collapse each token list in 'normalizer' into a single string, which is the
# input format the text vectorizer works on.
dataset['normalizer'] = dataset['normalizer'].apply(com)
print(dataset['normalizer'])

# Hold out 40% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    dataset['normalizer'],
    dataset['Sentiment'],
    test_size=0.4,
    random_state=0,
)

0       geosolutions technology leverage benefon gps s...
1                            esi low bk real possibility 
2       last quarter componenta net sales double eurm ...
3       accord finnish russian chamber commerce major ...
4       swedish buyout firm sell remain percent stake ...
                              ...                        
5837    rise cost force package producer huhtamaki axe...
5838    nordic walk first use summer train method cros...
5839    accord ship company viking line eu decision si...
5840    build home improvement trade sales decrease eu...
5841    helsinki afx kci konecranes say order four hot...
Name: normalizer, Length: 5842, dtype: object


In [59]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm

# Bag-of-words features over unigrams and bigrams (ngram_range=(1, 2));
# the vocabulary is learned from the training split only.
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
x_train_vectorized = count_vectorizer.fit_transform(x_train)

# Train a one-vs-rest wrapper around a degree-2 polynomial-kernel SVM.
svc_params = dict(
    gamma='scale',
    C=0.01,
    probability=True,
    class_weight='balanced',
    kernel='poly',
    degree=2,
    coef0=54.9,
    cache_size=1000,
)
clf = OneVsRestClassifier(svm.SVC(**svc_params))
clf_output = clf.fit(x_train_vectorized, y_train)

# Encode the test split with the vectorizer fitted above (no re-fitting).
x_test_vectorized = count_vectorizer.transform(x_test)

# Accuracy of the model on the test split.
accuracy = clf.score(x_test_vectorized, y_test)
print(f'Acurácia no conjunto de teste: {accuracy}')





Acurácia no conjunto de teste: 0.6893453145057766


In [60]:
def test(coef):
    """Fit the one-vs-rest polynomial SVM with a given ``coef0`` and print its test accuracy.

    Uses ``x_train_vectorized``, ``y_train``, ``x_test_vectorized`` and
    ``y_test`` from the notebook's global scope.

    Args:
        coef: Value passed to ``svm.SVC`` as ``coef0``.
    """
    svc = svm.SVC(
        gamma='scale',
        C=0.01,
        probability=True,
        class_weight='balanced',
        kernel='poly',
        degree=2,
        coef0=coef,
    )
    clf = OneVsRestClassifier(svc)
    clf_output = clf.fit(x_train_vectorized, y_train)
    accuracy = clf.score(x_test_vectorized, y_test)
    print(f'Acurácia no conjunto de teste: {accuracy} como o {coef}')