In [5]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.tokenize import regexp_tokenize
import os
import joblib
import numpy as np
import spacy
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [6]:
# Downloads necessários do NLTK
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Configurações gerais para visualização
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

In [7]:
# Carregar o modelo do spaCy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Stopwords
sw = set(stopwords.words("english"))

# 1. CARREGAR O DATASET COMPLETO

In [8]:
chunksize = 100000  # Processar em lotes de 100.000 linhas
data_chunks = []
for chunk in pd.read_csv('complaints.csv', chunksize=chunksize):
    chunk = chunk.dropna()
    data_chunks.append(chunk)
data = pd.concat(data_chunks, ignore_index=True)

print("\nTotal de linhas no dataset completo:", len(data))
print("Distribuição inicial das classes:")
print(data["product_5"].value_counts())


Total de linhas no dataset completo: 1757155
Distribuição inicial das classes:
product_5
Credit Reporting              1160112
Debt Collection                266514
Credit Card Services           140699
Bank Accounts and Services      99530
Loans                           90300
Name: count, dtype: int64


# 2. PRÉ-PROCESSAMENTO (Lemmatization)

In [9]:
REGEX_PATTERN = r'''(?x)  # Retirado do Notebook do Professor "preprocessing.ipynb"
    (?:[A-Z]\.)+         
    | (?!x+\b)\w+(?:-\w+)*  # Exclui palavras só com "x"
    | \$?\d+(?:\.\d+)?%? 
    | \.\.\.             
    | [][.,;"'?():-_`]  
'''

In [10]:
def preprocess_lemmatization(text):
    text = str(text) if pd.notna(text) else ''
    tokens = regexp_tokenize(text.lower(), REGEX_PATTERN)
    doc = nlp(' '.join(tokens))
    tokens = [token.lemma_ for token in doc if token.text not in sw and token.is_alpha]
    return ' '.join(tokens)

In [11]:
from tqdm import tqdm  # For progress tracking

file_name = "consumer_complaints_full_lem.csv"
column_name = "processed_text_lem"
chunksize = 100000

corpus = []
for i, chunk in enumerate(pd.read_csv('complaints.csv', chunksize=chunksize)):
    
    # Use .loc to avoid SettingWithCopyWarning and add progress bar
    chunk.loc[:, column_name] = [
        preprocess_lemmatization(review) 
        for review in tqdm(chunk["narrative"], desc=f"Processing chunk {i+1}")
    ]
    
    corpus.append(chunk)
    print(f"Processed chunk {i+1}...")

# Concatenate all chunks into a single DataFrame
data_processed = pd.concat(corpus, ignore_index=True)

# Save to CSV
data_processed.to_csv(file_name, index=False)
print(f"Corpus completo saved as '{file_name}'.")

Processing chunk 1: 100%|██████████| 100000/100000 [15:06<00:00, 110.34it/s]


Processed chunk 1...


Processing chunk 2: 100%|██████████| 100000/100000 [16:16<00:00, 102.38it/s]


Processed chunk 2...


Processing chunk 3: 100%|██████████| 100000/100000 [15:54<00:00, 104.75it/s]


Processed chunk 3...


Processing chunk 4: 100%|██████████| 100000/100000 [15:36<00:00, 106.78it/s]


Processed chunk 4...


Processing chunk 5: 100%|██████████| 100000/100000 [15:54<00:00, 104.82it/s]


Processed chunk 5...


Processing chunk 6: 100%|██████████| 100000/100000 [16:15<00:00, 102.54it/s]


Processed chunk 6...


Processing chunk 7: 100%|██████████| 100000/100000 [16:49<00:00, 99.03it/s]


Processed chunk 7...


Processing chunk 8: 100%|██████████| 100000/100000 [16:19<00:00, 102.07it/s]


Processed chunk 8...


Processing chunk 9: 100%|██████████| 100000/100000 [15:54<00:00, 104.75it/s]


Processed chunk 9...


Processing chunk 10: 100%|██████████| 100000/100000 [15:52<00:00, 104.94it/s]


Processed chunk 10...


Processing chunk 11: 100%|██████████| 100000/100000 [16:44<00:00, 99.57it/s] 


Processed chunk 11...


Processing chunk 12: 100%|██████████| 100000/100000 [16:59<00:00, 98.07it/s]


Processed chunk 12...


Processing chunk 13: 100%|██████████| 100000/100000 [16:48<00:00, 99.18it/s]


Processed chunk 13...


Processing chunk 14: 100%|██████████| 100000/100000 [16:23<00:00, 101.63it/s]


Processed chunk 14...


Processing chunk 15: 100%|██████████| 100000/100000 [16:39<00:00, 100.02it/s]


Processed chunk 15...


Processing chunk 16: 100%|██████████| 100000/100000 [16:34<00:00, 100.58it/s]


Processed chunk 16...


Processing chunk 17: 100%|██████████| 100000/100000 [16:45<00:00, 99.47it/s]


Processed chunk 17...


Processing chunk 18: 100%|██████████| 100000/100000 [17:06<00:00, 97.43it/s]


Processed chunk 18...


Processing chunk 19: 100%|██████████| 100000/100000 [17:09<00:00, 97.16it/s]


Processed chunk 19...


Processing chunk 20: 100%|██████████| 100000/100000 [17:16<00:00, 96.44it/s]


Processed chunk 20...


Processing chunk 21: 100%|██████████| 23066/23066 [04:20<00:00, 88.63it/s] 


Processed chunk 21...
Corpus completo saved as 'consumer_complaints_full_lem.csv'.


# 3. VETORIZAÇÃO (TF-IDF com N-gramas (1,3))

In [12]:
print("Vetorizando o dataset completo com TF-IDF...")
tfidf = TfidfVectorizer(max_features=4000, ngram_range=(1, 3), stop_words=list(sw))
X = tfidf.fit_transform(data_processed[column_name])
y = data_processed["product_5"]
joblib.dump(tfidf, 'tfidf_vectorizer_full.joblib')
print(f"Tamanho do espaço de features: {X.shape}")

Vetorizando o dataset completo com TF-IDF...
Tamanho do espaço de features: (2023066, 4000)


In [13]:
# Visualização com PCA antes do SMOTE
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())  # Pode ser necessário amostrar se for muito grande
df_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
df_pca["Class"] = y.values

plt.figure()
sns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_pca, alpha=0.6)
plt.title("Distribuição das Classes Antes do SMOTE (Dataset Completo)")
plt.legend()
plt.tight_layout()
plt.savefig("smote_before_full_pca.png")
plt.show()

MemoryError: Unable to allocate 60.3 GiB for an array with shape (2023066, 4000) and data type float64

# 4. OVERSAMPLING COM SMOTE APÓS O TF-IDF

In [14]:
smote = SMOTE(random_state=42, k_neighbors=5)
X_smote, y_smote = smote.fit_resample(X, y)

print("\nDistribuição das classes após SMOTE:")
print(pd.Series(y_smote).value_counts())


Distribuição das classes após SMOTE:
product_5
Credit Reporting              1205275
Debt Collection               1205275
Loans                         1205275
Bank Accounts and Services    1205275
Credit Card Services          1205275
Name: count, dtype: int64


In [None]:
# Visualização com PCA após o SMOTE
X_smote_pca = pca.transform(X_smote.toarray())
df_smote_pca = pd.DataFrame(X_smote_pca, columns=["PC1", "PC2"])
df_smote_pca["Class"] = y_smote

plt.figure()
sns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_smote_pca, alpha=0.6)
plt.title("Distribuição das Classes Após o SMOTE (Dataset Completo)")
plt.legend()
plt.tight_layout()
plt.savefig("smote_after_full_pca.png")
plt.show()

# 5. TREINAR E AVALIAÇÃO COM Logistic Regression

In [15]:
def train_and_evaluate_lr(X, y, scenario_name):    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Modelo LogisticRegression com L2
    lr_l2 = LogisticRegression(penalty='l2', max_iter=1000, solver='liblinear', random_state=42)
    lr_l2.fit(X_train, y_train)
    y_pred_enc = lr_l2.predict(X_test)
    
    print(f"Logistic Regression ({scenario_name}):\n", classification_report(y_test, y_pred_enc, zero_division=0))
    joblib.dump(lr_l2, f'lr_l2_{scenario_name.lower().replace(" ", "_")}_3percent.joblib')

train_and_evaluate_lr(X_smote, y_smote, "SMOTE Oversampling Full Dataset")

Logistic Regression (SMOTE Oversampling Full Dataset):
                             precision    recall  f1-score   support

Bank Accounts and Services       0.88      0.91      0.89    240982
      Credit Card Services       0.86      0.83      0.84    241025
          Credit Reporting       0.87      0.85      0.86    241234
           Debt Collection       0.84      0.84      0.84    241267
                     Loans       0.89      0.89      0.89    240767

                  accuracy                           0.87   1205275
                 macro avg       0.87      0.87      0.87   1205275
              weighted avg       0.87      0.87      0.87   1205275

