In [1]:
import numpy as np 
import unidecode
import pandas as pd 
import re
import nltk 
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from pycaret.classification import *
from nltk.stem.rslp import RSLPStemmer
from helpers import (
    clean_text
)
import warnings
warnings.filterwarnings('ignore')

# RSLP stemmer instance (a stemmer, despite the variable name). Its only visible
# use is the commented-out stemming step inside clean_text.
lemmatizer = RSLPStemmer()

In [2]:
# Location of the labelled tweets CSV.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
TWEETS_CSV_PATH = "/home/daholive/Documents/twitter_ellection_brazil/datasource/TweetsWithTheme.csv"

# Raw tweets table; later cells read its "tweet_text" and "sentiment" columns.
dataframe = pd.read_csv(TWEETS_CSV_PATH, sep=",")

In [3]:
# Textual sentiment labels and their numeric class ids.
SENTIMENT_TO_LABEL = {
    "Negativo": 0,
    "Positivo": 1
}


def _encode_sentiment(value):
    """Return the numeric class for one raw `sentiment` cell.

    Known text labels are translated through SENTIMENT_TO_LABEL. Values that
    are already 0/1 are passed through unchanged, which keeps this cell safe
    to re-run: a plain dict `.map` would turn already-encoded labels into NaN
    on the second execution. Anything else becomes NaN, exactly as a dict
    `.map` would produce for an unknown key.
    """
    if value in SENTIMENT_TO_LABEL:
        return SENTIMENT_TO_LABEL[value]
    if value in (0, 1):
        return value
    return np.nan


dataframe["sentiment"] = dataframe["sentiment"].map(_encode_sentiment)

In [4]:
# De-duplicated Portuguese stop words from NLTK, read once and reused below.
_portuguese_stop_words = set(stopwords.words("portuguese"))

# Stop words as provided by NLTK (accents preserved).
stop_words_br = list(_portuguese_stop_words)

# Same words passed through unidecode, so they can be compared against text
# that has already been stripped of accents (see clean_text).
stop_words_br_no_accent = [
    unidecode.unidecode(stop_word) for stop_word in _portuguese_stop_words
]

In [5]:
# Translation table that turns every ASCII punctuation character into a space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))

# Character class of emoji / pictographs to strip. Built once here instead of
# being recompiled on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing .. miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very broad span; also covers CJK and other scripts in between
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"                 # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"                 # variation selector-16
    "\u3030"
    "]+",
    re.UNICODE,
)


def clean_text(texto, stop_words=None):
    """Normalise one tweet into a lowercase, accent-free, space-separated string.

    NOTE(review): this definition shadows the `clean_text` imported from
    `helpers` in the first cell of the notebook.

    Steps, in order:
      1. remove digit runs and replace the ``&gt;`` / ``&lt;`` entities with a space;
      2. remove http(s) URLs, @mentions and #hashtags;
      3. replace a leading ``"RT "`` with a space;
      4. turn ASCII punctuation and newlines into spaces;
      5. remove characters matched by the emoji class, replace curly double
         quotes with spaces, strip and lowercase;
      6. transliterate to ASCII with ``unidecode``;
      7. drop stop words and any token that is not purely alphanumeric.
    Stemming is currently disabled.

    Parameters
    ----------
    texto : object
        Raw tweet. Non-string values are converted with ``str``; ``None`` and
        float NaN (missing values) yield an empty string instead of the
        literal tokens "none" / "nan".
    stop_words : iterable of str, optional
        Accent-free stop words to remove. Defaults to the notebook-level
        ``stop_words_br_no_accent``.

    Returns
    -------
    str
        The cleaned text; tokens are separated by single spaces.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if texto is None or (isinstance(texto, float) and texto != texto):
        return ""

    if stop_words is None:
        stop_words = stop_words_br_no_accent
    if not isinstance(stop_words, (set, frozenset)):
        # Hash-based membership instead of scanning a list for every token.
        stop_words = frozenset(stop_words)

    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    texto = re.sub(r"\d+", "", str(texto)).replace("&gt;", " ").replace("&lt;", " ")
    texto = re.sub(r"https?:\/\/\S+", "", texto)
    texto = re.sub(r"@[A-Za-z0-9\w]+", "", texto)
    texto = re.sub(r"#[A-Za-z0-9\w]+", "", texto)
    texto = re.sub("^RT ", " ", texto)
    texto = texto.translate(_PUNCT_TO_SPACE).replace("\n", " ")
    texto = _EMOJI_PATTERN.sub("", texto).replace("“", " ").replace("”", " ").strip().lower()
    texto = unidecode.unidecode(texto)

    # Stop words are compared after transliteration, hence the accent-free list.
    tokens = [
        word
        for word in texto.split()
        if word not in stop_words and word.isalnum()
    ]
    # tokens = [lemmatizer.stem(word) for word in tokens]

    return " ".join(tokens)

In [6]:
# Clean every tweet, then expose the result as a plain array for the vectoriser.
cleaned_tweets = dataframe["tweet_text"].map(clean_text)
features = cleaned_tweets.values

# Encoded sentiment as an array (not referenced again in the visible notebook).
labels = dataframe["sentiment"].values

In [7]:
# Vocabulary pruning options of TfidfVectorizer:
#
# max_df removes terms that appear too frequently ("corpus-specific stop words").
#   max_df = 0.50 -> ignore terms that appear in more than 50% of the documents.
#   max_df = 25   -> ignore terms that appear in more than 25 documents.
#   The default (1.0) ignores nothing.
#
# min_df removes terms that appear too infrequently.
#   min_df = 0.01 -> ignore terms that appear in less than 1% of the documents.
#   min_df = 5    -> ignore terms that appear in less than 5 documents.
#   The default (1) ignores nothing.
#
# Previously tried configuration: TfidfVectorizer(min_df=0.004, max_df=0.7);
# max_features=3000 was also considered.
#
# NOTE(review): the vectoriser is fitted on every row before setup() makes its
# train/test split, so held-out rows influence the vocabulary and IDF weights.
# Confirm this is acceptable for the reported scores.
vectorizer = TfidfVectorizer(
    min_df=10,
    max_df=0.8,
    ngram_range=(1,2),
    analyzer='word'
)

# fit_transform learns the vocabulary/IDF weights and vectorises in one pass;
# a separate .fit(features) beforehand would only repeat the same work.
processed_features = vectorizer.fit_transform(features)


In [8]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf_columns = vectorizer.get_feature_names_out()
else:
    tfidf_columns = vectorizer.get_feature_names()

# Dense TF-IDF matrix as a DataFrame: one row per tweet (same index as
# `dataframe`), one column per vocabulary term.
df_tfidf = pd.DataFrame(
    processed_features.toarray(),
    columns=tfidf_columns,
    index=dataframe.index)

In [9]:
# Modelling table: the encoded target first, then one column per TF-IDF term.
# Both pieces carry dataframe's index, so rows line up one-to-one.
target_and_features = [dataframe["sentiment"], df_tfidf]
df_setup = pd.concat(target_and_features, axis="columns")

In [None]:
# Initialise the PyCaret experiment on df_setup with `sentiment` as the target.
# NOTE(review): fold=10 is requested here while compare_models and tune_model
# below pass fold=5 — confirm which value is intended.
# NOTE(review): use_gpu=True presumably needs GPU-enabled libraries on the
# machine running the notebook — verify before running elsewhere.
pce_1 = setup(
    data = df_setup, 
    target = 'sentiment', 
    session_id = 5, 
    train_size = 0.80,
    fold=10,
    transformation = True,
    use_gpu = True,
    verbose=True)

IntProgress(value=0, description='Processing: ', max=3)

In [None]:
# Compare the candidate models with fold=5, ranked by F1; the result is kept in `best`.
best = compare_models(
    fold=5,
    sort='F1')  
# Presumably the best F1 observed in an earlier run: 0.7637 — TODO confirm.
# Disabled option: include=['gbc','lda','ridge','ada','lr','lightgbm'],

In [None]:
# Output recorded from an earlier run:
# RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
#                 max_iter=None, normalize=False, random_state=5, solver='auto',
#                 tol=0.001)
#
# `best` must be the last expression of the cell to be displayed; a trailing
# string literal would be shown in its place.
best

In [None]:
# Tune the hyperparameters of the selected model with fold=5.
# NOTE(review): no `optimize` argument is passed although compare_models ranked
# by F1 — confirm which metric the tuning should target.
# NOTE(review): the `dt` suffix does not describe the model; the recorded best
# model in the cell above is a RidgeClassifier.
tuned_dt = tune_model(best, fold=5)

In [None]:
# PyCaret 'auc' plot for the tuned model.
plot_model(tuned_dt, plot = 'auc')

In [None]:
# PyCaret 'pr' (precision-recall) plot for the tuned model.
plot_model(tuned_dt, plot = 'pr')

In [None]:
# PyCaret 'feature' plot for the tuned model; the features here are TF-IDF terms.
plot_model(tuned_dt, plot='feature')

In [None]:
# PyCaret confusion-matrix plot for the tuned model.
plot_model(tuned_dt, plot = 'confusion_matrix')

In [None]:
# Score the tuned model without passing new data (per the PyCaret docs this uses
# the hold-out split created by setup). The trailing ';' hides the returned frame.
predict_model(tuned_dt);

In [None]:
# Finalise the tuned model for deployment.
# NOTE(review): the `rf` suffix suggests a random forest, but the model comes
# from tune_model(best) — confirm the naming.
final_rf = finalize_model(tuned_dt)

In [None]:
# Persist the finalised model under the given name.
# NOTE(review): the name mentions Linear Discriminant Analysis while the recorded
# best model earlier in the notebook is a RidgeClassifier — confirm the label.
save_model(final_rf,'Final Linear Discriminant Analysis')

In [None]:
# Reload the model that the previous cell saved, using the same name.
saved_final_rf = load_model('Final Linear Discriminant Analysis')

In [None]:
# Display the reloaded object to check that loading worked.
saved_final_rf

In [None]:
# Two hand-written sample tweets used to try the model on unseen text.
texto = [
    "Bolsonaro você é um idiota e vai perder a eleição",
    "Lula é o melhor, e ganha no primeiro turno",
]

# Same cleaning as the training tweets.
# NOTE: this rebinds `features`, which earlier held the cleaned training tweets.
features = list(map(clean_text, texto))
features

In [None]:
# Vectorise the sample texts with the vectoriser that was already fitted on the
# training tweets. Fitting a fresh TfidfVectorizer on these two sentences would
# learn a different vocabulary, so the resulting columns would not match the
# features the model was trained on, and it would also overwrite the fitted
# `vectorizer`.
processed_features = vectorizer.transform(features)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old name on older installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Preview only the first names; the vocabulary can be large.
list(feature_names[:20])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old name on older installations.
unseen_columns = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Dense TF-IDF frame for the sample texts: one row per text, one column per term.
df_tfidf_v2 = pd.DataFrame(
    processed_features.toarray(),
    columns=unseen_columns)

df_tfidf_v2

In [None]:

# Predict sentiment for the sample texts with the tuned model. The frame passed
# as `data` needs the same TF-IDF columns the model was trained on.
unseen_predictions = predict_model(tuned_dt, data=df_tfidf_v2)
