In [5]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.tokenize import regexp_tokenize
import os
import joblib
import numpy as np
import spacy
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [6]:
# Downloads necessários do NLTK
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Configurações gerais para visualização
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

In [7]:
# Carregar o modelo do spaCy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Stopwords
sw = set(stopwords.words("english"))

# 1. CARREGAR O DATASET COMPLETO

In [None]:
chunksize = 100000  # Processar em lotes de 100.000 linhas
data_chunks = []
for chunk in pd.read_csv('complaints.csv', chunksize=chunksize):
    chunk = chunk.dropna()
    data_chunks.append(chunk)
data = pd.concat(data_chunks, ignore_index=True)

print("\nTotal de linhas no dataset completo:", len(data))
print("Distribuição inicial das classes:")
print(data["product_5"].value_counts())

# 2. PRÉ-PROCESSAMENTO (Lemmatization)

In [None]:
REGEX_PATTERN = r'''(?x)  # Retirado do Notebook do Professor "preprocessing.ipynb"
    (?:[A-Z]\.)+         
    | (?!x+\b)\w+(?:-\w+)*  # Exclui palavras só com "x"
    | \$?\d+(?:\.\d+)?%? 
    | \.\.\.             
    | [][.,;"'?():-_`]  
'''

In [None]:
def preprocess_lemmatization(text):
    text = str(text) if pd.notna(text) else ''
    tokens = regexp_tokenize(text.lower(), REGEX_PATTERN)
    doc = nlp(' '.join(tokens))
    tokens = [token.lemma_ for token in doc if token.text not in sw and token.is_alpha]
    return ' '.join(tokens)

In [None]:
from tqdm import tqdm  # For progress tracking

file_name = "consumer_complaints_full_lem.csv"
column_name = "processed_text_lem"
chunksize = 100000

corpus = []
for i, chunk in enumerate(pd.read_csv('complaints.csv', chunksize=chunksize)):
    
    # Use .loc to avoid SettingWithCopyWarning and add progress bar
    chunk.loc[:, column_name] = [
        preprocess_lemmatization(review) 
        for review in tqdm(chunk["narrative"], desc=f"Processing chunk {i+1}")
    ]
    
    corpus.append(chunk)
    print(f"Processed chunk {i+1}...")

# Concatenate all chunks into a single DataFrame
data_processed = pd.concat(corpus, ignore_index=True)

# Save to CSV
data_processed.to_csv(file_name, index=False)
print(f"Corpus completo saved as '{file_name}'.")

Processing chunk 1: 100%|██████████| 96979/96979 [18:21<00:00, 88.00it/s]  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 1...


Processing chunk 2: 100%|██████████| 95151/95151 [15:40<00:00, 101.13it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 2...


Processing chunk 3: 100%|██████████| 94205/94205 [15:06<00:00, 103.89it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 3...


Processing chunk 4: 100%|██████████| 92633/92633 [14:44<00:00, 104.76it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 4...


Processing chunk 5:  53%|█████▎    | 49268/93535 [07:59<08:31, 86.59it/s] 

# 3. VETORIZAÇÃO (TF-IDF com N-gramas (1,3))

In [None]:
print("Vetorizando o dataset completo com TF-IDF...")
tfidf = TfidfVectorizer(max_features=4000, ngram_range=(1, 3), stop_words=list(sw))
X = tfidf.fit_transform(data_processed[column_name])
y = data_processed["product_5"]
joblib.dump(tfidf, 'tfidf_vectorizer_full.joblib')
print(f"Tamanho do espaço de features: {X.shape}")

In [None]:
# Visualização com PCA antes do SMOTE
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())  # Pode ser necessário amostrar se for muito grande
df_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
df_pca["Class"] = y.values

plt.figure()
sns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_pca, alpha=0.6)
plt.title("Distribuição das Classes Antes do SMOTE (Dataset Completo)")
plt.legend()
plt.tight_layout()
plt.savefig("smote_before_full_pca.png")
plt.show()

# 4. OVERSAMPLING COM SMOTE APÓS O TF-IDF

In [None]:
smote = SMOTE(random_state=42, k_neighbors=5)
X_smote, y_smote = smote.fit_resample(X, y)

print("\nDistribuição das classes após SMOTE:")
print(pd.Series(y_smote).value_counts())

In [None]:
# Visualização com PCA após o SMOTE
X_smote_pca = pca.transform(X_smote.toarray())
df_smote_pca = pd.DataFrame(X_smote_pca, columns=["PC1", "PC2"])
df_smote_pca["Class"] = y_smote

plt.figure()
sns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_smote_pca, alpha=0.6)
plt.title("Distribuição das Classes Após o SMOTE (Dataset Completo)")
plt.legend()
plt.tight_layout()
plt.savefig("smote_after_full_pca.png")
plt.show()

# 5. TREINAMENTO E AVALIAÇÃO COM Logistic Regression

In [None]:
def train_and_evaluate_lr(X, y, scenario_name):    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Modelo LogisticRegression com L2
    lr_l2 = LogisticRegression(penalty='l2', max_iter=1000, solver='liblinear', random_state=42)
    lr_l2.fit(X_train, y_train)
    y_pred_enc = lr_l2.predict(X_test)
    
    print(f"Logistic Regression ({scenario_name}):\n", classification_report(y_test, y_pred_enc, zero_division=0))
    joblib.dump(lr_l2, f'lr_l2_{scenario_name.lower().replace(" ", "_")}_3percent.joblib')

train_and_evaluate_lr(X_smote, y_smote, "SMOTE Oversampling Full Dataset")