In [None]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.tokenize import regexp_tokenize
import os
import joblib
import numpy as np
import spacy
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

# Fetch each NLTK resource by name; quiet=True suppresses the download log.
for resource in ('punkt_tab', 'stopwords', 'punkt'):
    nltk.download(resource, quiet=True)

In [None]:
# General plotting configuration: seaborn "whitegrid" style and a 12x6 default figure size
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# Load the spaCy model with the "ner" and "parser" components disabled
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# English stopwords from NLTK, stored as a set for membership tests
sw = set(stopwords.words("english"))

In [None]:
# Verbose-mode token pattern passed to regexp_tokenize. Alternatives are tried
# left to right: dotted capital-letter abbreviations, words (optionally
# hyphenated, but not words made only of "x"), numbers with optional "$" / "%",
# an ellipsis, and single punctuation characters.
#
# The punctuation class must end with "-" so the hyphen is a literal. Written
# as ":-_" it is a character range from ":" to "_", which matches "<", "=",
# ">", "@", "\", "^" and A-Z while failing to match "-" itself.
REGEX_PATTERN = r'''(?x)  # Retirado do Notebook do Professor "preprocessing.ipynb"
    (?:[A-Z]\.)+
    | (?!x+\b)\w+(?:-\w+)*  # Exclui palavras só com "x"
    | \$?\d+(?:\.\d+)?%?
    | \.\.\.
    | [][.,;"'?():_`-]
'''

In [None]:
def preprocess_lemmatization(text):
    """Return the space-joined lemmas of the alphabetic, non-stopword tokens in `text`.

    Missing values (as judged by ``pd.notna``) are treated as an empty string.
    The text is lowercased, tokenized with ``REGEX_PATTERN``, re-joined with
    single spaces and passed through the spaCy pipeline ``nlp``. A token is
    kept when its text is not in ``sw`` and it is purely alphabetic.
    """
    if pd.notna(text):
        raw = str(text)
    else:
        raw = ''

    pieces = regexp_tokenize(raw.lower(), REGEX_PATTERN)
    parsed = nlp(' '.join(pieces))

    lemmas = []
    for tok in parsed:
        if tok.text in sw:
            continue
        if not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

In [None]:
# Parameters
chunksize = 100000  # rows per chunk when the raw CSV is read with pd.read_csv
file_name = "consumer_complaints_full_lem.csv"  # CSV cache of the processed corpus
column_name = "processed_text_lem"  # column that holds the lemmatized text

# 1. CARREGAR E PROCESSAR O DATASET COMPLETO

In [None]:
# Build the lemmatized corpus once and cache it as CSV; later runs reload the cache.
if not os.path.exists(file_name):
    corpus = []
    for i, chunk in enumerate(pd.read_csv('complaints.csv', chunksize=chunksize)):
        # Drop every row that has a missing value in any column
        chunk = chunk.dropna()
        lemmatized = [
            preprocess_lemmatization(review)
            for review in tqdm(chunk["narrative"], desc=f"Processing chunk {i+1}")
        ]
        # assign() builds a new frame instead of writing into the dropna() result in place
        chunk = chunk.assign(**{column_name: lemmatized})
        corpus.append(chunk)
        print(f"Processed chunk {i+1}...")

    # Concatenate all chunks and persist the result
    data_processed = pd.concat(corpus, ignore_index=True)
    data_processed.to_csv(file_name, index=False)
    print(f"Corpus completo salvo como '{file_name}'.")
else:
    data_processed = pd.read_csv(file_name)
    print(f"Carregado '{file_name}' existente.")

# A document whose lemmatized text is '' is written as an empty CSV field, and
# read_csv parses empty fields as NaN by default. Normalise back to '' so the
# cached path yields the same text column as the freshly built one (no-op on
# the first run).
data_processed[column_name] = data_processed[column_name].fillna("")

NameError: name 'os' is not defined

In [None]:
# Show dataset information: total row count and the value counts of the "product_5" column
print("\nTotal de linhas no dataset completo:", len(data_processed))
print("Distribuição inicial das classes:")
print(data_processed["product_5"].value_counts())

# 2. VETORIZAÇÃO (TF-IDF com N-gramas (1,3))

In [None]:
print("Vetorizando o dataset completo com TF-IDF...")
# TF-IDF over 1- to 3-grams, capped at 4000 features; the NLTK stopword set is
# passed again as a list.
# NOTE(review): the vectorizer is fitted on every row of data_processed, i.e.
# before any train/test split is made — confirm this is intended.
tfidf = TfidfVectorizer(max_features=4000, ngram_range=(1, 3), stop_words=list(sw))
X = tfidf.fit_transform(data_processed[column_name])
y = data_processed["product_5"]
# Persist the fitted vectorizer to disk
joblib.dump(tfidf, 'tfidf_vectorizer_full.joblib')
print(f"Tamanho do espaço de features: {X.shape}")

In [None]:
# Disabled cell: the PCA scatter plot of the classes before SMOTE is wrapped in
# a string literal, so none of it executes (in particular `pca` is never
# defined). As written it calls X.toarray(), which converts the whole TF-IDF
# matrix to a dense array — presumably the reason it was disabled; confirm
# before re-enabling.
'''# Visualização com PCA antes do SMOTE
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())
df_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
df_pca["Class"] = y.values

plt.figure()
sns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_pca, alpha=0.6)
plt.title("Distribuição das Classes Antes do SMOTE (Dataset Completo)")
plt.legend()
plt.tight_layout()
plt.savefig("smote_before_full_pca.png")
plt.show()'''

# 3. OVERSAMPLING COM SMOTE

In [None]:
# Oversample with SMOTE (5 nearest neighbours, fixed seed) and print the
# resulting class counts.
# NOTE(review): X and y here are the full dataset, so the resampling happens
# before any train/test split. A later split of X_smote/y_smote puts synthetic
# rows in the test fold and lets neighbours of test rows into training.
smote = SMOTE(random_state=42, k_neighbors=5)
X_smote, y_smote = smote.fit_resample(X, y)
print("\nDistribuição das classes após SMOTE:")
print(pd.Series(y_smote).value_counts())

In [None]:
# Disabled cell: the PCA scatter plot of the classes after SMOTE is wrapped in
# a string literal, so none of it executes. The code inside uses `pca`, which
# is only assigned inside the other disabled (string-literal) PCA cell, and it
# calls X_smote.toarray(), which converts the resampled matrix to a dense array.
'''# Visualização com PCA após o SMOTE
X_smote_pca = pca.transform(X_smote.toarray())
df_smote_pca = pd.DataFrame(X_smote_pca, columns=["PC1", "PC2"])
df_smote_pca["Class"] = y_smote

plt.figure()
sns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_smote_pca, alpha=0.6)
plt.title("Distribuição das Classes Após o SMOTE (Dataset Completo)")
plt.legend()
plt.tight_layout()
plt.savefig("smote_after_full_pca.png")
plt.show()'''

# 4. TREINAR E AVALIAR COM LOGISTIC REGRESSION L2

In [None]:
def train_and_evaluate_lr(X, y, scenario_name, resampler=None):
    """Train an L2 logistic regression on an 80/20 split, report and save it.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``LogisticRegression``.
    y : target labels aligned with ``X``.
    scenario_name : str
        Label used in the printed report and, lowercased with spaces replaced
        by underscores, in the saved model's file name.
    resampler : object with ``fit_resample(X, y)``, optional
        When given, it is applied to the training fold only, after the split.
        The test fold is never resampled, so the report reflects original
        (non-synthetic) rows. With the default ``None`` the data is used
        exactly as passed in.

    Returns
    -------
    None. The classification report is printed and the fitted model is dumped
    with joblib.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if resampler is not None:
        # Resample after splitting: oversampling before the split would place
        # synthetic rows in the test fold and leak test neighbours into training.
        X_train, y_train = resampler.fit_resample(X_train, y_train)

    lr_l2 = LogisticRegression(penalty='l2', max_iter=1000, solver='liblinear', random_state=42)
    lr_l2.fit(X_train, y_train)
    y_pred = lr_l2.predict(X_test)
    print(f"Logistic Regression ({scenario_name}):\n", classification_report(y_test, y_pred, zero_division=0))
    joblib.dump(lr_l2, f'lr_l2_{scenario_name.lower().replace(" ", "_")}_3percent.joblib')


# Pass the original (un-resampled) features and labels; SMOTE is applied to the
# training fold inside the function so the evaluation fold stays untouched.
train_and_evaluate_lr(
    X,
    y,
    "SMOTE Oversampling Full Dataset",
    resampler=SMOTE(random_state=42, k_neighbors=5),
)