In [1]:
#! pip install -U pip setuptools wheel
#! pip install -U 'spacy[cuda12x]'
#! python -m spacy download en_core_web_trf
#! python -m spacy download en_core_web_sm
#! pip install pandas

### Baixar CSV DAtaset

Para realizar o tratamento de dados a seguir, o IMDB Dataset deve ser baixado, e colocado na pasta raiz do projeto

<a href="https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews" target="_blank">Conjunto de dados IMDB de 50 mil críticas de filmes</a>


In [2]:
import re
import pandas as pd
import spacy
from spacy.tokens import Doc
from tqdm import tqdm


In [3]:
# Carregar o modelo em inglês
# nlp = spacy.load("en_core_web_trf", exclude=["parser", "ner"])
nlp = spacy.load("en_core_web_sm", exclude=["parser", "ner"])
FILE_PATH = "D:/tcc2/guilherme/1webscrapper/data/reviews_scraper_original.csv"
df = pd.read_csv(FILE_PATH)
df = df[['review','rating']]
# df = df[:1000]

In [4]:
df.head()

Unnamed: 0,review,rating
0,This movie takes place off in fantasy land som...,1
1,"Too sentimental, too pathetic, too slow, too c...",1
2,There are some things in this life we will nev...,1
3,This movie was inspired by the brilliant Stir ...,1
4,"For the life of me, I can't understand all the...",1


In [5]:
df["rating"].value_counts()

rating
10    6236
8     6226
9     6223
7     5800
6     4651
5     3745
1     3455
4     2964
3     2731
2     2421
Name: count, dtype: int64

In [6]:
df.isnull().sum()

review    0
rating    0
dtype: int64

In [7]:
df.duplicated().sum()

16

In [8]:
# Remove linhas de reviews_treated duplicadoss
df = df.drop_duplicates()

In [9]:
df.duplicated().sum()

0

In [10]:
df["rating"].value_counts()

rating
10    6231
8     6226
9     6218
7     5799
6     4651
5     3745
1     3452
4     2962
3     2731
2     2421
Name: count, dtype: int64

In [11]:
def obter_maior_texto(df):
    return max(df["review"], key=lambda x: len(x.split()))

In [12]:
maior_string = obter_maior_texto(df)
tam_maior_string = len(maior_string)
print(tam_maior_string)

9942


In [13]:
# funcoes de tratamento comuns
def remove_extra_spaces(text):
    r"Remover espacos, quebra de linhas e tabulações do inicio e fim de frases e ubstituir sequências de espaços por um espaço"
    return re.sub(r"\s+", " ", text).strip()
    


def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile("<.*?>"), "", raw_text)
    return cleaned_text


def normalize_lowercase(text: str) -> str:
    return text.lower()


# funcoes de tratamento spacy
def lemmatize_text_remove_stopwords_and_puntuaction(doc: Doc | str):
    if not isinstance(doc, Doc):
        doc = nlp(doc)
    return " ".join(
        [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    )


def remove_stopwords_and_puntuaction(doc: Doc | str):
    if not isinstance(doc, Doc):
        doc = nlp(doc)
    return " ".join(
        [token.text for token in doc if not token.is_stop and not token.is_punct]
    )


# def remove_punctuation(doc):
#     return " ".join([token.text for token in doc if not token.is_punct])


# def remove_stopwords(doc):
#     return " ".join([token.text for token in doc if not token.is_stop])


# def lemmatize_text(doc):
#     return " ".join([token.lemma_ for token in doc])

In [16]:
def execute_all_trataments(series_review, FINAL_SPACY_FORMAT):
    reviews_treated = df["review"]

    reviews_treated = reviews_treated.apply(remove_tags)
    reviews_treated = reviews_treated.apply(normalize_lowercase)
    reviews_treated = reviews_treated.apply(remove_extra_spaces)

    len_texts = len(reviews_treated)
    docs = list(
        tqdm(
            nlp.pipe(
                reviews_treated,
                batch_size=500,
                disable=["parser", "ner"],
            ),
            colour="green",
            desc="Progresso: ",
            total=len_texts,
        )
    )

    if FINAL_SPACY_FORMAT == "lemma":
        reviews_treated = [
            lemmatize_text_remove_stopwords_and_puntuaction(doc) for doc in docs
        ]
    elif FINAL_SPACY_FORMAT == "text":
        reviews_treated = [remove_stopwords_and_puntuaction(doc) for doc in docs]
    reviews_treated = [remove_extra_spaces(review) for review in reviews_treated]
    return reviews_treated

In [17]:
# Aplicar as funções de str ao DataFrame
# definir como True para saída em forma de token.lemma, ou False para token.text
FINAL_SPACY_FORMAT= "lemma" 
reviews_treated= execute_all_trataments(df["review"],FINAL_SPACY_FORMAT)

Progresso: 100%|[32m██████████[0m| 44436/44436 [08:36<00:00, 85.97it/s] 


In [18]:
df["review"]= reviews_treated

In [19]:
df


Unnamed: 0,review,rating
0,movie take place fantasy land absolutely ridic...,1
1,sentimental pathetic slow conventionally voice...,1
2,thing life understand universe begin happen so...,1
3,movie inspire brilliant stir crazy star gene w...,1
4,life understand gush cornball sentimental phon...,1
...,...,...
44447,uma thurman return bride time look continue st...,10
44448,quentin tarantino kill bill vol 2 complete 2 v...,10
44449,vol 2 completely different tone feature film a...,10
44450,movie great mix different genre movie martial ...,10


In [20]:
maior_string = obter_maior_texto(df)
tam_maior_string= len(maior_string)
print(tam_maior_string)

6022


In [21]:
df.duplicated().sum()

13

In [22]:
df = df.drop_duplicates(ignore_index=True)
df.duplicated().sum()

0

In [23]:
print("value counts df['rating']:")
print(df["rating"].value_counts())

value counts df['rating']:
rating
10    6227
8     6224
9     6217
7     5799
6     4650
5     3744
1     3450
4     2961
3     2731
2     2420
Name: count, dtype: int64


In [24]:
DATA_PATH= f"D:/tcc2/guilherme/2tratamento_dados/dataset_tratado/Scraper_Dataset_tretead_{FINAL_SPACY_FORMAT}.csv"
df.to_csv(DATA_PATH, index=False)

In [None]:
print(df.groupby("rating"))

In [None]:
# # Iterar sobre cada categoria
# def balance_dataframe_groups(df, num_rows_per_rating):
#     # Lista para armazenar os DataFrames de cada categoria
#     df_list_rating = []
#     for rating, group in df.groupby("rating"):
#         rating_str = str(rating)  # Convertendo para string
#         num_samples = min(num_rows_per_rating[rating_str], len(group))
#         # Selecionar aleatoriamente o número correto de linhas para cada categoria
#         sampled_rows = group.sample(n=num_samples, random_state=42)
#         # Adicionar os dados selecionados à lista
#         df_list_rating.append(sampled_rows)
#     balanced_df =pd.concat(df_list_rating)
#     return balanced_df , df_list_rating

In [None]:
# # Defina o número de linhas desejado para cada categoria
# num_rows_per_rating = {
#     "1": 2000,
#     "2": 2000,
#     "3": 2000,
#     "4": 2000,
#     "5": 3744,
#     "6": 4256,
#     "7": 2000,
#     "8": 2000,
#     "9": 2000,
#     "10": 2000,
# }

# # Concatenar os DataFrames de cada categoria em um único DataFrame
# balanced_df, df_list_rating = balance_dataframe_groups(df, num_rows_per_rating)

# # # Mostrar as primeiras linhas do DataFrame balanceado
# # print(balanced_df.head())

# # Verificar se as categorias estão balanceadas
# print(balanced_df["rating"].value_counts())

In [None]:
# DATA_PATH="D:/tcc2/guilherme/2tratamento_dados/dataset_tratado/Scraper_Dataset_tretead_text_balanced_10classes.csv"
# balanced_df.to_csv(DATA_PATH, index=False)

In [None]:
# def categorize_rating(rating):
#     if int(rating) <= 4:
#         return 0
#     elif int(rating) <= 6:
#         return 1
#     else:
#         return 2

In [None]:
# balanced_df["rating"] = balanced_df["rating"].apply(categorize_rating)
# print(balanced_df["rating"].value_counts())

In [None]:
# DATA_PATH="D:/tcc2/guilherme/2tratamento_dados/dataset_tratado/Scraper_Dataset_tretead_text_balanced_3classes.csv"
# balanced_df.to_csv(DATA_PATH, index=False)