In [None]:
import unidecode
import re
import numpy as np
import nltk 
import string
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, IntegerType
import pyspark.sql.functions as F

# model tools
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from nltk.stem.rslp import RSLPStemmer

# plot
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Silence every warning category so library warnings do not clutter cell output.
warnings.filterwarnings('ignore')

# Shared RSLP stemmer instance; text_preprocessing reads this module-level name.
stemmer = RSLPStemmer()

In [None]:
# Per-process cache so the stopword corpus is read and normalised only once.
_STOPWORDS_CACHE = {}


def _portuguese_stopwords():
    """Return the accent-stripped NLTK Portuguese stopwords as a frozenset (built once)."""
    if 'portuguese' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['portuguese'] = frozenset(
            unidecode.unidecode(w) for w in nltk.corpus.stopwords.words('portuguese')
        )
    return _STOPWORDS_CACHE['portuguese']


def text_preprocessing(instancia):
    """Normalise one raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: lower-case; drop digits and the ``&gt;`` / ``&lt;``
    entities; drop URLs, @mentions and #hashtags; drop a leading retweet
    marker; collapse letters repeated three or more times; transliterate to
    ASCII; turn ASCII punctuation into spaces; remove Portuguese stopwords and
    stray ``rt`` tokens; collapse repeated "ha" / "uha" laughter; stem each
    token with the module-level ``stemmer``.

    Parameters
    ----------
    instancia : str or None
        Raw tweet text. ``None`` (e.g. a null CSV field) is treated as empty.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``"-"`` when nothing
        is left after cleaning.
    """
    if instancia is None:
        return "-"

    text = str(instancia).lower()
    text = re.sub(r'\d+', '', text).replace("&gt;", " ").replace("&lt;", " ")
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    # The text is already lower-cased, so the retweet marker must be matched as "rt".
    text = re.sub(r'^rt\s+', ' ', text)
    text = re.sub(r'([A-Za-z])\1{2,}', r'\1', text)

    # Transliterate BEFORE stripping punctuation: unidecode can turn Unicode
    # punctuation into ASCII punctuation, which must also become a separator.
    text = unidecode.unidecode(text)
    punct = string.punctuation
    text = text.translate(str.maketrans(punct, ' ' * len(punct)))

    stopwords = _portuguese_stopwords()
    # Drop "rt" as a whole token only; a substring replace would damage real words.
    palavras = [w for w in text.split() if w not in stopwords and w != 'rt']

    palavras = [re.sub(r'(ha)\1+', r'\1', word) for word in palavras]
    palavras = [re.sub(r'(uha)\1+', r'\1', word) for word in palavras]
    palavras = [stemmer.stem(word) for word in palavras]

    resultado = " ".join(palavras).strip()
    return resultado if resultado else "-"

In [None]:
# SPARK INSTANCE
# Local-mode session with the Delta Lake package, SQL extension and catalog configured.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.1.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.memory", "4G")
    .config("spark.driver.memory", "4G")
    .config("spark.executor.cores", "12")
    .getOrCreate()
)

In [None]:
# DATA
# Semicolon-delimited CSV whose first row holds the column names.
dataframe = (
    spark.read
    .option("delimiter", ";")
    .option("header", "True")
    .csv("/home/daholive/Documents/twitter_ellection_brazil/datasource/TweetsWithTheme_v2.csv")
)

In [None]:
###############################################################################
## PREPROCESSING WITH SPARK
###############################################################################
from pyspark.sql.types import StringType, IntegerType

# Binary target: "Negativo" maps to 0, every other sentiment value maps to 1.
dataframe = dataframe.withColumn(
    "sentiment_map",
    F.when(F.col("sentiment") == "Negativo", 0).otherwise(1),
)

# Clean each tweet on the executors and pair it with its target.
rdd2 = dataframe.rdd.map(
    lambda row: (text_preprocessing(row.tweet_text), row.sentiment_map)
)

# Both columns are declared as strings; the later sampleBy cell keys on '0' / '1'.
schema = StructType([
    StructField('features', StringType(), True),
    StructField('label', StringType(), True),
])

# create metadata dataframe
df_features = spark.createDataFrame(rdd2, schema=schema)

# UDF returning the number of whitespace-separated tokens in a cleaned tweet.
count_map = F.udf(lambda text: len(text.split()), IntegerType())

# Discard tweets that cleaned down to the "-" placeholder, then de-duplicate on text.
df_features = (
    df_features
    .filter(F.col("features") != "-")
    .dropDuplicates(subset=['features'])
)

In [None]:
# Token count of every cleaned tweet, brought to the driver as an array.
feature_counts = (
    df_features
    .select(count_map(F.col('features')).alias('features_count'))
    .toPandas()['features_count']
    .values
)

In [None]:
# Box plot of tokens per tweet, before the length filter is applied.
plt.rcParams.update({'figure.figsize': [10, 8]})
ax = sns.boxplot(x=feature_counts)
plt.show()

In [None]:
# Histogram (with KDE overlay) of tokens per tweet, before the length filter.
plt.rcParams.update({'figure.figsize': [10, 8]})
ax2 = sns.histplot(x=feature_counts, kde=True)
plt.show()

In [None]:
# Keep only tweets with fewer than 30 tokens after cleaning.
df_features = df_features.filter(count_map(F.col("features")) < 30)

In [None]:
# Token counts recomputed on the length-filtered data.
new_feature_counts = (
    df_features
    .select(count_map(F.col('features')).alias('features_count'))
    .toPandas()['features_count']
    .values
)

In [None]:
# Box plot of tokens per tweet, after the length filter.
plt.rcParams.update({'figure.figsize': [10, 8]})
ax = sns.boxplot(x=new_feature_counts)
plt.show()

In [None]:
# Histogram (with KDE overlay) of tokens per tweet, after the length filter.
plt.rcParams.update({'figure.figsize': [10, 8]})
ax2 = sns.histplot(x=new_feature_counts, kde=True)
plt.show()

In [None]:
# Number of rows per label in the filtered data.
(
    df_features
    .groupBy('label')
    .count()
    .show()
)

In [None]:
# Balanced dataset: keep every '0' row and sample 87% of the '1' rows (fixed seed).
train = df_features.sampleBy(
    "label",
    fractions={'0': 1, '1': 0.87},
    seed=10,
)
(
    train
    .groupBy('label')
    .count()
    .show()
)

In [None]:
###############################################################################
## FEATURE AND LABEL DEFINITION
###############################################################################
# Collect both columns in ONE Spark action. Two separate collects re-run the
# whole pipeline twice and give no guarantee that rows come back in the same
# order, which could silently pair a tweet with another tweet's label.
_train_rows = train.select('features', 'label').collect()

features = [row['features'] for row in _train_rows]

labels = [row['label'] for row in _train_rows]

In [None]:
###############################################################################
## TFIDF
###############################################################################
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Unigram + bigram counts, vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/validation
# split done in the model cells — confirm that this is intended.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    max_features=5000,
)
X_train_bigram = bigram_vectorizer.fit_transform(features)

# Tf-Idf weighting applied on top of the count matrix.
bigram_tf_idf_transformer = TfidfTransformer()
X_train_bigram_tf_idf = bigram_tf_idf_transformer.fit_transform(X_train_bigram)

In [None]:
###############################################################################
## MODEL TESTS  LogisticRegression
###############################################################################
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import RandomizedSearchCV

def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> dict:
    """Tune a class-balanced LogisticRegression and report train/validation scores.

    The data is split 70/30 (seeded), hyper-parameters are searched with a
    seeded 10-fold stratified cross-validation on the training part, and the
    imbalanced classification report is printed for both parts.

    Parameters
    ----------
    X : csr_matrix
        Feature matrix, one row per sample.
    y : array-like
        Labels aligned with the rows of ``X``.
    title : str
        Name of the feature set; printed as a header above the reports.

    Returns
    -------
    dict
        Keys ``'y_train'``, ``'y_pred_train'``, ``'y_valid'``, ``'y_pred_valid'``
        holding the true and predicted labels of each split.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.7, random_state=42
    )

    clf = LogisticRegression(
        class_weight='balanced'
    )

    grid = dict(
        solver=['newton-cg', 'lbfgs', 'liblinear'],
        penalty=['l2'],
        C=[100, 10, 1.0, 0.1, 0.01],
        max_iter=[10000],
    )
    # The grid is finite: never request more draws than it has combinations.
    n_candidates = int(np.prod([len(values) for values in grid.values()]))

    # Seeded so that repeated runs select the same folds and candidates.
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    random_search_cv = RandomizedSearchCV(
        estimator=clf,
        param_distributions=grid,
        n_jobs=-1,
        cv=kfold,
        n_iter=min(20, n_candidates),
        random_state=42,
    )

    random_search_cv.fit(X_train, y_train)
    y_pred_train = random_search_cv.predict(X_train)
    y_pred_valid = random_search_cv.predict(X_valid)

    print(title)
    print(classification_report_imbalanced(y_train, y_pred_train))
    print(classification_report_imbalanced(y_valid, y_pred_valid))

    return {
        'y_train': y_train,
        'y_pred_train': y_pred_train,
        'y_valid': y_valid,
        'y_pred_valid': y_pred_valid
    }

# Fit on the Tf-Idf bigram matrix; keep the returned labels/predictions for the
# confusion-matrix cells.
y_train = labels

data_return = train_and_show_scores(
    X=X_train_bigram_tf_idf,
    y=y_train,
    title='Bigram Tf-Idf',
)

### CONFUSION MATRIX - LogisticRegression

In [None]:
from pretty_confusion_matrix import pp_matrix_from_data

In [None]:
# Confusion matrix on the training split.
pp_matrix_from_data(
    data_return['y_train'],
    data_return['y_pred_train'],
    columns=['negative', 'positive'],
)

In [None]:
# Confusion matrix on the validation split.
pp_matrix_from_data(
    data_return['y_valid'],
    data_return['y_pred_valid'],
    columns=['negative', 'positive'],
)

In [None]:
###############################################################################
## MODEL TESTS  MultinomialNB
###############################################################################
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import RandomizedSearchCV

def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    """Tune a MultinomialNB classifier and print train/validation scores.

    NOTE: this re-binds the name already used by the LogisticRegression cell.

    The data is split 70/30 (seeded), ``alpha`` is searched with a seeded
    10-fold stratified cross-validation on the training part, and the
    imbalanced classification report is printed for both parts.

    Parameters
    ----------
    X : csr_matrix
        Feature matrix, one row per sample.
    y : array-like
        Labels aligned with the rows of ``X``.
    title : str
        Name of the feature set; printed as a header above the reports.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.7, random_state=42
    )

    clf = MultinomialNB()

    grid = dict(alpha=[100, 10, 1.0, 0.1, 0.01])
    # The grid is finite: never request more draws than it has combinations.
    n_candidates = int(np.prod([len(values) for values in grid.values()]))

    # Seeded so that repeated runs select the same folds and candidates.
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    random_search_cv = RandomizedSearchCV(
        estimator=clf,
        param_distributions=grid,
        n_jobs=-1,
        cv=kfold,
        n_iter=min(20, n_candidates),
        random_state=42,
    )

    random_search_cv.fit(X_train, y_train)
    y_pred_train = random_search_cv.predict(X_train)
    y_pred_valid = random_search_cv.predict(X_valid)

    print(title)
    print(classification_report_imbalanced(y_train, y_pred_train))
    print(classification_report_imbalanced(y_valid, y_pred_valid))

# Evaluate on the same Tf-Idf bigram matrix.
y_train = labels

train_and_show_scores(
    X=X_train_bigram_tf_idf,
    y=y_train,
    title='Bigram Tf-Idf',
)

### APLICAÇÃO DO TENSORFLOW E REDES NEURAIS

In [None]:
# Tensorflow and Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

In [None]:

def tensorflow_preprocessing_test_train_split(features, labels, max_words, max_len):
    """Tokenise and pad the texts, one-hot encode the labels, and split them.

    Parameters
    ----------
    features : iterable of str
        Texts to tokenise.
    labels : array-like
        Class labels, encoded into two one-hot columns.
    max_words : int
        Passed to the Keras ``Tokenizer`` as ``num_words``.
    max_len : int
        Passed to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from a seeded ``train_test_split``.
    """
    # Two-column one-hot targets.
    one_hot_labels = tf.keras.utils.to_categorical(labels, 2, dtype="float32")

    # Fit the vocabulary on the texts, then turn them into fixed-length index rows.
    text_tokenizer = Tokenizer(num_words=max_words)
    text_tokenizer.fit_on_texts(features)
    padded_tweets = pad_sequences(
        text_tokenizer.texts_to_sequences(features),
        maxlen=max_len,
    )

    # Splitting the data
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        padded_tweets,
        one_hot_labels,
        random_state=42,
    )
    return x_fit, x_holdout, y_fit, y_holdout


def tensorflow_model_fit(max_words, max_len, X_train, y_train, X_test, y_test):
    """Build, compile and train the bidirectional LSTM classifier.

    Parameters
    ----------
    max_words : int
        First argument (vocabulary size) of the embedding layer.
    max_len : int
        ``input_length`` of the embedding layer.
    X_train, y_train
        Training inputs and one-hot targets.
    X_test, y_test
        Data passed to ``fit`` as ``validation_data``.

    Returns
    -------
    tuple
        ``(history, model)``: the object returned by ``model.fit`` and the
        trained model. The checkpoint callback also writes the model with the
        best ``val_acc`` to ``models/best_model.hdf5``.
    """
    # Bidirectional LSTM model
    model = Sequential([
        layers.Embedding(max_words, 64, input_length=max_len),
        layers.Bidirectional(layers.LSTM(20, dropout=0.2)),
        layers.Dense(2, activation='softmax'),
    ])
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['acc'],
    )

    model.summary()

    # Save a checkpoint only when validation accuracy improves.
    best_checkpoint = ModelCheckpoint(
        filepath="models/best_model.hdf5",
        monitor='val_acc',
        verbose=2,
        save_best_only=True,
        mode='auto',
    )

    history = model.fit(
        X_train,
        y_train,
        epochs=10,
        validation_data=(X_test, y_test),
        callbacks=[best_checkpoint],
    )

    return history, model


def check_model(model, x_eval=None, y_eval=None):
    """Evaluate a trained model and plot its row-normalised confusion matrix.

    Parameters
    ----------
    model
        Trained model exposing ``evaluate`` and ``predict``.
    x_eval : array-like, optional
        Evaluation inputs. When omitted, the notebook-level ``X_test`` is used,
        which is what the function always did before this parameter existed.
    y_eval : array-like, optional
        One-hot evaluation targets. When omitted, the notebook-level ``y_test``
        is used.

    Returns
    -------
    None
        Prints the accuracy and draws a heatmap on a new matplotlib figure.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    # Backward-compatible fallback to the globals the original body read.
    if x_eval is None:
        x_eval = X_test
    if y_eval is None:
        y_eval = y_test

    test_loss, test_acc = model.evaluate(x_eval, y_eval, verbose=2)
    print('Model accuracy: ', test_acc)

    predictions = model.predict(x_eval)

    # Confusion matrix; labels are pinned so it is 2x2 even if a class is absent.
    matrix = confusion_matrix(
        y_eval.argmax(axis=1),
        predictions.argmax(axis=1),
        labels=[0, 1],
    )

    conf_matrix = pd.DataFrame(matrix, index=[0, 1], columns=[0, 1])
    # Normalise each row by its total; DataFrame.div avoids the Series[:, None]
    # indexing that recent pandas releases reject.
    conf_matrix = conf_matrix.astype('float').div(conf_matrix.sum(axis=1), axis=0)
    plt.figure(figsize=(15, 15))
    sns.heatmap(conf_matrix, annot=True, annot_kws={"size": 15})


def tensorflow_model_test(best_model, tokenizer, token, sentiment_list, max_len):
    """Print the predicted sentiment label for a single text.

    Parameters
    ----------
    best_model
        Trained model exposing ``predict``.
    tokenizer
        Tokenizer that was already fitted on the training texts. It is used
        as-is: re-fitting it here would rebuild its word index and break the
        mapping the model was trained with.
    token : str
        The text to classify.
    sentiment_list : sequence
        Label names, indexed by predicted class index.
    max_len : int
        Passed to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    None
        The predicted label is printed.
    """
    # Encode with the existing vocabulary only (no fit_on_texts on inference input).
    sequence = tokenizer.texts_to_sequences([token])

    test = pad_sequences(sequence, maxlen=max_len)

    predicted_class = best_model.predict(test).argmax(axis=1)[0]
    print(sentiment_list[predicted_class])