# EDA e Preprocessamento de Dados

## Configurações iniciais

In [225]:
# Importações de libraries importantes
import pandas as pd
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [226]:
# Downloads de packages do NLTK
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Carregamento do dataset
df = pd.read_csv("../data/raw/IMDB Dataset.csv")
df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kayky\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kayky\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Kayky\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [227]:
# Substituindo 'positive' e 'negative' para 1 e 0, respectivamente.
df["sentiment"] = np.where(df["sentiment"] == 'positive', 1, 0)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


## Funções Auxiliadoras

In [228]:
class Preprocessa:
    
    """
    Uma classe que contém métodos para pré-processar dados de texto.
    """

    def limpa_dados(self, string: str) -> str:
        """
        Limpa a string de entrada removendo caracteres não alfanuméricos e convertendo os
        caracteres restantes em minúsculas.

        Args:
            string (str): A string de entrada a ser limpa.

        Returns:
            str: A string limpa.

        Raises:
            TypeError: Se o tipo de dado de entrada não for uma string.
        """
        self.string = string

        if not isinstance(string, str):
            raise TypeError("O tipo do valor de entrada deveria ser string.")

        final_data = re.sub(r'[^a-zA-Z0-9\s]', '', string)
        final_data = final_data.lower()

        return final_data
    
    def remover_stopwords(self, df: pd.DataFrame, coluna: str) -> pd.DataFrame:
        
        """
        Remove as palavras irrelevantes (stopwords) da coluna especificada em um DataFrame.

        Args:
            df (pd.DataFrame): O DataFrame contendo os dados a serem pré-processados.
            coluna (str): A coluna do DataFrame que contém os dados a serem pré-processados.

        Returns:
            pd.DataFrame: O DataFrame com as stopwords removidas.

        """
        self.df = df
        self.coluna = coluna

        stopwords_ = set(stopwords.words('english'))

        df[coluna] = df[coluna].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_)]))

        return df
    
    def lematizar_texto(self, string: str) -> str:
        """
        Lematiza a string fornecida usando o lematizador WordNetLemmatizer da biblioteca NLTK.

        Args:
            string (str): A string a ser lematizada.

        Returns:
            str: A string lematizada.

        Raises:
            Nenhum.

        Exemplo:
            >>> texto = "Os gatos estão caçando ratos"
            >>> lematizador = Lematizador()
            >>> lematizador.lematizar_texto(texto)
            'O gato estar caçar rato '
        """

        self.string = string
        
        tokenizador = nltk.tokenize.WhitespaceTokenizer()
        lematizador = nltk.stem.WordNetLemmatizer()
        
        st = ""
        
        for word in tokenizador.tokenize(string):
            st = st + lematizador.lemmatize(word) + " "
        
        return st
        

In [229]:
# Limpa dados
df["review"] = df["review"].apply(lambda string: Preprocessa().limpa_dados(string))
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1
...,...,...
49995,i thought this movie did a down right good job...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,i am a catholic taught in parochial elementary...,0
49998,im going to have to disagree with the previous...,0


In [230]:
# Remove palavras desnecessárias do dataset
df = Preprocessa().remover_stopwords(df, "review")
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1
...,...,...
49995,thought movie right good job wasnt creative or...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary schools n...,0
49998,im going disagree previous comment side maltin...,0


In [231]:
df["review"] = df["review"].apply(lambda x: Preprocessa().lematizar_texto(x))
df

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically there family little boy jake think t...,0
4,petter matteis love time money visually stunni...,1
...,...,...
49995,thought movie right good job wasnt creative or...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary school nu...,0
49998,im going disagree previous comment side maltin...,0


## Train Test Split

In [232]:
texto = df["review"].to_numpy()
label = df["sentiment"].to_numpy()

In [233]:
texto

array(['one reviewer mentioned watching 1 oz episode youll hooked right exactly happened mebr br first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordbr br called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home manyaryans muslim gangsta latino christian italian irish moreso scuffle death stare dodgy dealing shady agreement never far awaybr br would say main appeal show due fact go show wouldnt dare forget pretty picture painted mainstream audience forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard wholl sold nickel inmate wholl kill order get away well mannered middle class inmate

In [234]:
label

array([1, 1, 1, ..., 0, 0, 0])

In [235]:
X_train, X_test, Y_train, Y_test = train_test_split(texto, label, test_size=0.15)

In [211]:
print(f"O formato do dataset X_train é {X_train.shape}")
print(f"O formato do dataset X_test é {X_test.shape}")
print(f"O formato do dataset Y_train é {Y_train.shape}")
print(f"O formato do dataset Y_test é {Y_test.shape}")

O formato do dataset X_train é (42500,)
O formato do dataset X_test é (7500,)
O formato do dataset Y_train é (42500,)
O formato do dataset Y_test é (7500,)


In [236]:
pickle.dump(X_train, open('../data/processed/X_train_deploy.pkl', 'wb'))

## Tokenização

In [212]:
# Define variáveis como tamanho do vocabulário e seta o Tokenizer
vocabulario_tamanho = 13000
oov_tok = "<OOV>"
tokenizer = Tokenizer(num_words=vocabulario_tamanho, oov_token=oov_tok)

In [213]:
tokenizer.fit_on_texts(X_train)

In [214]:
tokenizer.word_counts

OrderedDict([('rarely', 498),
             ('shut', 275),
             ('movie', 83684),
             ('first', 14546),
             ('10', 3576),
             ('minute', 5926),
             ('one', 45051),
             ('turned', 1531),
             ('obvious', 1682),
             ('purpose', 834),
             ('expose', 144),
             ('much', 16114),
             ('skin', 353),
             ('many', 11404),
             ('b', 1096),
             ('actress', 2433),
             ('possible', 1605),
             ('nothing', 6967),
             ('else', 3134),
             ('really', 19490),
             ('mattersbr', 14),
             ('br', 96328),
             ('dont', 14338),
             ('get', 20783),
             ('wrong', 2882),
             ('like', 33807),
             ('pretty', 6155),
             ('sex', 2756),
             ('scene', 17460),
             ('sexploitation', 27),
             ('scale', 327),
             ('merit', 318),
             ('director', 7860),
 

### Tokenização do dataset Train

In [215]:
train_sequencia = tokenizer.texts_to_sequences(X_train)

In [216]:
train_sequencia

[[1582,
  2611,
  3,
  24,
  207,
  100,
  5,
  547,
  496,
  992,
  3,
  4313,
  19,
  2121,
  37,
  772,
  326,
  519,
  78,
  246,
  16,
  1,
  2,
  25,
  11,
  268,
  6,
  91,
  326,
  284,
  17,
  1,
  3,
  2254,
  2317,
  65,
  78,
  246,
  5014,
  2,
  319,
  52,
  17,
  38,
  537,
  172,
  5,
  69,
  5,
  4288,
  172,
  1337,
  1774,
  6385,
  563,
  1904,
  24,
  266,
  424,
  260,
  3282,
  118,
  124,
  5,
  1,
  47,
  1374,
  599,
  239,
  1,
  4714,
  12192,
  2,
  18,
  137,
  1710,
  51,
  24,
  207,
  100,
  453,
  34,
  1,
  5289,
  1967,
  162,
  59,
  51],
 [1045,
  106,
  113,
  550,
  1011,
  1127,
  102,
  507,
  10,
  1204,
  662,
  904,
  115,
  3,
  21,
  12,
  41,
  277,
  8,
  5,
  479,
  3,
  540,
  1,
  2,
  41,
  21,
  910,
  12,
  3915,
  243,
  45,
  868,
  23,
  1469,
  4,
  1361,
  912,
  67,
  15,
  198,
  474,
  637,
  1,
  1678,
  198,
  1968,
  165,
  6599,
  2,
  4,
  31,
  577,
  165,
  826,
  198,
  1255,
  566,
  225,
  145,
  454,
  152,
  666

In [217]:
# Padding para adequar cada sequência a um tamanho padrão de 250 tokens.
tamanho_frases = 250
train_padding = pad_sequences(train_sequencia, maxlen=tamanho_frases, padding='post', truncating='post')

In [218]:
train_padding

array([[ 1582,  2611,     3, ...,     0,     0,     0],
       [ 1045,   106,   113, ...,     0,     0,     0],
       [ 1375,   173,    38, ...,     0,     0,     0],
       ...,
       [ 1374,   184,    13, ...,     0,     0,     0],
       [  359,   336,  3798, ...,  3588,  8856,  6632],
       [10751,    40,  7412, ...,     0,     0,     0]])

### Tokenização do dataset Test

In [219]:
test_sequencia = tokenizer.texts_to_sequences(X_test)

In [220]:
test_sequencia

[[317,
  717,
  217,
  90,
  893,
  12237,
  2,
  30,
  24,
  7,
  65,
  31,
  135,
  51,
  186,
  838,
  93,
  1740,
  6,
  208,
  1,
  29,
  4093,
  47,
  701,
  296,
  1385,
  7309,
  1,
  1,
  39,
  4006,
  2084,
  223,
  254,
  1521,
  39,
  863,
  290,
  224,
  1211,
  1,
  439,
  186,
  1,
  12,
  23,
  16,
  160,
  572,
  333,
  242,
  527,
  96,
  671,
  756,
  260,
  56,
  15,
  6129,
  1,
  17,
  1194,
  280,
  780,
  307,
  543,
  13,
  854,
  163,
  854,
  379,
  7,
  847,
  42,
  3530,
  1218,
  261,
  16,
  23,
  772,
  4,
  137,
  56,
  527,
  12,
  5,
  235,
  1837,
  476,
  327,
  7],
 [231,
  98,
  4546,
  4,
  732,
  24,
  6099,
  724,
  1,
  1,
  11410,
  1,
  206,
  452,
  2466,
  29,
  6019,
  49,
  819,
  110,
  54,
  59,
  558,
  4,
  718,
  24,
  53,
  603,
  1,
  64,
  617,
  5560,
  98,
  266,
  213,
  54,
  254,
  1691,
  732,
  1,
  1,
  1677,
  1,
  5537,
  732,
  22,
  1232,
  1,
  305,
  848,
  1022,
  10606,
  1,
  375,
  96,
  424,
  1402,
  264,
  1,

In [221]:
test_padding = pad_sequences(test_sequencia, maxlen=tamanho_frases, padding='post', truncating='post')

In [222]:
test_padding

array([[ 317,  717,  217, ...,    0,    0,    0],
       [ 231,   98, 4546, ...,    0,    0,    0],
       [ 309,  120, 1861, ...,    0,    0,    0],
       ...,
       [   1, 1888, 5488, ...,    0,    0,    0],
       [  21,  789,    3, ...,    0,    0,    0],
       [ 194,  342,   15, ...,    0,    0,    0]])

## Serialização dos datasets preprocessados 

In [224]:
# Salva os arrays em formato pickle
pickle.dump(train_padding, open('../data/processed/X_train.pkl', 'wb'))
pickle.dump(test_padding, open('../data/processed/X_test.pkl', 'wb'))
pickle.dump(Y_train, open('../data/processed/Y_train.pkl', 'wb'))
pickle.dump(Y_test, open('../data/processed/Y_test.pkl', 'wb'))