In [17]:
from helpers import (
    preprocessing
)
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, DateType, FloatType, ArrayType, LongType, MapType
import warnings
import numpy as np 
import tensorflow as tf
from tensorflow.keras import layers
import pandas as pd
warnings.filterwarnings('ignore')

In [2]:
# base de dados do twitter ja classificada com sentimentos
path = "/home/daholive/Documents/twitter_ellection_brazil_v2/datasource/raw_kaggle/NoThemeTweets.csv"

In [19]:
# dataframe twitter com sentimentos classificados
df = pd.read_csv(path, error_bad_lines=False, sep=',')

In [25]:
def get_treated_data(
    dataset, 
    cols, 
    cols_drop = [], 
    col_to_change='sentiment', 
    val_col_change = {"Negativo": 0, "Positivo":1}
):
 
    # 2. Rename columns
    dataset.columns = cols
    
    # 3. Drop columns not needed
    dataset.drop(cols_drop, axis=1, inplace=True)
    
    # 3.1 Drop all rows with at least one element is missing
    dataset.dropna()
    
    # 4. Convert setiments from "Negative/Positive" to "0/1" 
    dataset.replace({col_to_change: val_col_change}, inplace=True)
    
    # Return our dataset
    return dataset

In [26]:
default_cols = ["id", "text", "date", "sentiment", "query"]

default_drop_cols = ["id", "date", "query"]

# class_names = ['Negativo', 'Positive']
df = get_treated_data(df, default_cols, cols_drop = default_drop_cols)

In [28]:
# Take a peek at the dataset
df["sentiment"].value_counts(normalize=True)

Negativo    0.665179
Positivo    0.334821
Name: sentiment, dtype: float64

In [29]:
def preprocess_text(text):
    
    # Not needed to be imported globally
    from bs4 import BeautifulSoup
    import re
    text = BeautifulSoup(text, "lxml").get_text()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) # Remove urls
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-Z.!?']", ' ', text)
    text = re.sub(r" +", ' ', text)
    
    return text

In [30]:
df['text'] = df['text'].apply(lambda text: preprocess_text(text))

In [32]:
df.head()

Unnamed: 0,text,sentiment
0,para eu ir,Positivo
1,O meu like eu j dei na poca,Positivo
2,Eu s queria conseguir comer alguma coisa pra p...,Positivo
3,D que lindo dia !,Positivo
4,Resmungao Pq da pr jeito!! uma oferta ha q ap...,Positivo


In [33]:
print("The number of rows and columns in the dataset is: {}".format(df.shape))

The number of rows and columns in the dataset is: (785814, 2)


In [34]:
# Identify missing values
df.apply(lambda x: sum(x.isnull()), axis=0)

text         0
sentiment    0
dtype: int64

In [35]:
# Check the target class balance
df["sentiment"].value_counts(normalize=True)

Negativo    0.665179
Positivo    0.334821
Name: sentiment, dtype: float64