In [1]:
import re
import pandas as pd
import unicodedata
import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Classifiers
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

# Load spaCy's small Portuguese pipeline once at start-up; the model package
# "pt_core_news_sm" must already be installed for this call to succeed.
# NOTE(review): `nlp` is not referenced by any of the cells visible here --
# confirm it is still needed before paying the load cost.
nlp = spacy.load("pt_core_news_sm")

In [2]:
# Read the tweet dataset. header=None makes pandas treat the first line of the
# file as data instead of using it for column names.
df = pd.read_csv('Tweets_Mg.csv', header=None)
# Name the columns: tweet text and sentiment label. This assignment raises if
# the file does not contain exactly two columns.
df.columns = ['texto', 'sentimento']
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,texto,sentimento
0,���⛪ @ Catedral de Santo Antônio - Governador ...,Neutro
1,"� @ Governador Valadares, Minas Gerais https:/...",Neutro
2,"�� @ Governador Valadares, Minas Gerais https:...",Neutro
3,��� https://t.co/BnDsO34qK0,Neutro
4,��� PSOL vai questionar aumento de vereadores ...,Negativo


In [3]:
def preprocess(text, stop_words=None):
    """Clean a raw tweet so it can be fed to a bag-of-words vectorizer.

    Steps, in order:
      1. Drop stop words (compared case-insensitively, so "De" is removed
         just like "de").
      2. Strip accents and discard every non-ASCII character (emoji etc.)
         via NFKD decomposition followed by an ASCII encode with 'ignore'.
      3. Delete the characters ``? ! . , @ # :`` outright.
      4. Replace the separators ``{ } \\ ( ) / | -`` with a space.
      5. Collapse every run of whitespace into a single space and trim.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas produces
        for an empty CSV cell) is treated as an empty document.
    stop_words : collection of str, optional
        Lower-case words to remove. Defaults to the module-level
        ``STOP_WORDS`` imported from spaCy's Portuguese language data.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing is left or the input was not
        a string.
    """
    # Missing values arrive as float NaN / None; they have no .split().
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = STOP_WORDS

    # Stop-word lists are lower-case, so compare on the lower-cased token;
    # otherwise capitalised stop words at the start of a sentence survive.
    kept_words = [word for word in text.split() if word.lower() not in stop_words]
    text = ' '.join(kept_words)

    # Accent stripping happens after stop-word removal because the stop-word
    # list itself contains accented forms.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    # Punctuation that is removed without leaving a gap.
    text = re.sub(r'[?!.,@#:]', '', text)

    # Separators that become a space. Both braces are listed, and '|' is
    # listed explicitly: an earlier revision only removed it by accident,
    # through an alternation written inside a character class.
    text = re.sub(r'[{}\\()/|-]', ' ', text)

    # split()/join collapses the doubled spaces left by the substitutions
    # above (and any remaining line breaks) and trims both ends.
    return ' '.join(text.split())

In [4]:
# Clean every tweet with preprocess(), overwriting the raw text column.
df['texto'] = df['texto'].map(preprocess)

# Encode the sentiment labels as integers. pd.factorize numbers the labels in
# order of first appearance, so the codes follow the row order of the file.
df['sentimento'] = pd.factorize(df['sentimento'])[0]

# Preview the transformed frame as the cell output.
df.head()

Unnamed: 0,texto,sentimento
0,Catedral Santo Antonio Governador Valadares ...,0
1,Governador Valadares Minas Gerais https tco B...,0
2,Governador Valadares Minas Gerais https tco d...,0
3,https tco BnDsO34qK0,0
4,PSOL questionar aumento vereadores e prefeito ...,1


In [5]:
# Hold out 30% of the tweets for evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['texto'],
    df['sentimento'],
    test_size=0.3,
    random_state=42,
)

In [6]:
def classifications(X_train, X_test, y_train, y_test, random_state=42):
    """Train several text classifiers on TF-IDF features and print their reports.

    The TF-IDF vocabulary and weights are fitted on ``X_train`` only and then
    applied to ``X_test``. Every classifier is fitted on the training features
    and evaluated on the test features; once all of them have been trained,
    one ``classification_report`` per model is printed, in a fixed order.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Training and test documents (already preprocessed text).
    y_train, y_test : array-like
        Target labels aligned with ``X_train`` and ``X_test``.
    random_state : int or None, default 42
        Seed handed to the estimators whose training is randomised (SGD,
        Gradient Boosting, Random Forest, Multilayer Perceptron) so that the
        printed scores are reproducible. Pass ``None`` to leave them unseeded.

    Returns
    -------
    None
        The reports are printed to standard output.
    """
    vectorizer = TfidfVectorizer()

    # Fit on the training split only; the test split is merely transformed so
    # that no information from it influences the features.
    train_features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    # (display name, unfitted estimator), listed in the order they are reported.
    models = [
        ('SVC', SVC(kernel='linear')),
        ('SGD', SGDClassifier(random_state=random_state)),
        ('Naive Bayes', MultinomialNB()),
        ('Gradient Boosting', GradientBoostingClassifier(random_state=random_state)),
        ('Random Forest', RandomForestClassifier(random_state=random_state)),
        ('Multilayer Perceptron', MLPClassifier(random_state=random_state)),
        ('Logistic Regression', LogisticRegression()),
    ]

    # Train everything first and print afterwards, so warnings emitted during
    # fitting do not end up interleaved with the reports.
    reports = []
    for name, model in models:
        model.fit(train_features, y_train)
        predictions = model.predict(test_features)
        reports.append((name, classification_report(y_test, predictions)))

    last_position = len(reports) - 1
    for position, (name, report) in enumerate(reports):
        print('--> {}\n'.format(name))
        print(report)
        # Blank-line separator between consecutive reports, not after the last.
        if position != last_position:
            print('\n\n')

In [7]:
# Train and evaluate every classifier on the held-out split.
classifications(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


--> SVC

             precision    recall  f1-score   support

          0       0.94      0.95      0.95       758
          1       0.97      0.96      0.97       722
          2       0.98      0.98      0.98       980

avg / total       0.96      0.96      0.96      2460




--> SGD

             precision    recall  f1-score   support

          0       0.95      0.94      0.95       758
          1       0.97      0.97      0.97       722
          2       0.97      0.98      0.98       980

avg / total       0.96      0.96      0.96      2460




--> Naive Bayers

             precision    recall  f1-score   support

          0       0.96      0.89      0.92       758
          1       0.95      0.97      0.96       722
          2       0.94      0.98      0.96       980

avg / total       0.95      0.95      0.95      2460




--> Gradient Boosting

             precision    recall  f1-score   support

          0       0.90      0.90      0.90       758
          1       0.9