In [None]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.tokenize import regexp_tokenize
import os
import joblib
import spacy
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

True

In [3]:
# Configurações gerais para visualização
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# Carregar o modelo do spaCy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Stopwords
sw = set(stopwords.words("english"))

In [4]:
REGEX_PATTERN = r'''(?x)  # Retirado do Notebook do Professor "preprocessing.ipynb"
    (?:[A-Z]\.)+         
    | (?!x+\b)\w+(?:-\w+)*  # Exclui palavras só com "x"
    | \$?\d+(?:\.\d+)?%? 
    | \.\.\.             
    | [][.,;"'?():-_`]  
'''

In [5]:
def preprocess_lemmatization(text):
    text = str(text) if pd.notna(text) else ''
    tokens = regexp_tokenize(text.lower(), REGEX_PATTERN)
    doc = nlp(' '.join(tokens))
    tokens = [token.lemma_ for token in doc if token.text not in sw and token.is_alpha]
    return ' '.join(tokens)

In [6]:
# Parâmetros
chunksize = 100000
file_name = "consumer_complaints_full_lem.csv"
column_name = "processed_text_lem"

# 1. CARREGAR E PROCESSAR O DATASET COMPLETO

In [7]:
if not os.path.exists(file_name):
    corpus = []
    for i, chunk in enumerate(pd.read_csv('complaints.csv', chunksize=chunksize)):
        chunk = chunk.dropna()  # Remove NaN
        chunk.loc[:, column_name] = [
            preprocess_lemmatization(review) 
            for review in tqdm(chunk["narrative"], desc=f"Processing chunk {i+1}")
        ]
        corpus.append(chunk)
        print(f"Processed chunk {i+1}...")
    
    # Concatenar todos os chunks
    data_processed = pd.concat(corpus, ignore_index=True)
    data_processed.to_csv(file_name, index=False)
    print(f"Corpus completo salvo como '{file_name}'.")
else:
    data_processed = pd.read_csv(file_name)
    print(f"Carregado '{file_name}' existente.")

Processing chunk 1: 100%|██████████| 96979/96979 [14:27<00:00, 111.78it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 1...


Processing chunk 2: 100%|██████████| 95151/95151 [14:55<00:00, 106.27it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 2...


Processing chunk 3: 100%|██████████| 94205/94205 [14:21<00:00, 109.34it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 3...


Processing chunk 4: 100%|██████████| 92633/92633 [13:50<00:00, 111.60it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 4...


Processing chunk 5: 100%|██████████| 93535/93535 [14:14<00:00, 109.41it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 5...


Processing chunk 6: 100%|██████████| 91932/91932 [14:11<00:00, 107.94it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 6...


Processing chunk 7: 100%|██████████| 90606/90606 [14:24<00:00, 104.83it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 7...


Processing chunk 8: 100%|██████████| 91648/91648 [14:08<00:00, 108.06it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 8...


Processing chunk 9: 100%|██████████| 92556/92556 [13:57<00:00, 110.52it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 9...


Processing chunk 10: 100%|██████████| 91931/91931 [13:44<00:00, 111.56it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 10...


Processing chunk 11: 100%|██████████| 89461/89461 [13:57<00:00, 106.77it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 11...


Processing chunk 12: 100%|██████████| 88531/88531 [13:53<00:00, 106.24it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 12...


Processing chunk 13: 100%|██████████| 87475/87475 [13:24<00:00, 108.67it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 13...


Processing chunk 14: 100%|██████████| 90065/90065 [13:27<00:00, 111.52it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 14...


Processing chunk 15: 100%|██████████| 88817/88817 [13:38<00:00, 108.51it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 15...


Processing chunk 16: 100%|██████████| 88481/88481 [13:40<00:00, 107.80it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 16...


Processing chunk 17: 100%|██████████| 86010/86010 [13:26<00:00, 106.66it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 17...


Processing chunk 18: 100%|██████████| 71350/71350 [11:10<00:00, 106.43it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 18...


Processing chunk 19: 100%|██████████| 56564/56564 [08:33<00:00, 110.16it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 19...


Processing chunk 20: 100%|██████████| 58685/58685 [09:01<00:00, 108.28it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 20...


Processing chunk 21: 100%|██████████| 20540/20540 [03:20<00:00, 102.52it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, column_name] = [


Processed chunk 21...
Corpus completo salvo como 'consumer_complaints_full_lem.csv'.


In [8]:
# Exibir informações do dataset
print("\nTotal de linhas no dataset completo:", len(data_processed))
print("Distribuição inicial das classes:")
print(data_processed["product_5"].value_counts())


Total de linhas no dataset completo: 1757155
Distribuição inicial das classes:
product_5
Credit Reporting              1160112
Debt Collection                266514
Credit Card Services           140699
Bank Accounts and Services      99530
Loans                           90300
Name: count, dtype: int64


# 2. VETORIZAÇÃO (TF-IDF com N-gramas (1,3))

In [9]:
print("Vetorizando o dataset completo com TF-IDF...")
tfidf = TfidfVectorizer(max_features=4000, ngram_range=(1, 3), stop_words=list(sw))
X = tfidf.fit_transform(data_processed[column_name])
y = data_processed["product_5"]
joblib.dump(tfidf, 'tfidf_vectorizer_full.joblib')
print(f"Tamanho do espaço de features: {X.shape}")

Vetorizando o dataset completo com TF-IDF...
Tamanho do espaço de features: (1757155, 4000)


In [10]:
'''# Visualização com PCA antes do SMOTE
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())
df_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
df_pca["Class"] = y.values

plt.figure()
sns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_pca, alpha=0.6)
plt.title("Distribuição das Classes Antes do SMOTE (Dataset Completo)")
plt.legend()
plt.tight_layout()
plt.savefig("smote_before_full_pca.png")
plt.show()'''

'# Visualização com PCA antes do SMOTE\npca = PCA(n_components=2)\nX_pca = pca.fit_transform(X.toarray())\ndf_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2"])\ndf_pca["Class"] = y.values\n\nplt.figure()\nsns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_pca, alpha=0.6)\nplt.title("Distribuição das Classes Antes do SMOTE (Dataset Completo)")\nplt.legend()\nplt.tight_layout()\nplt.savefig("smote_before_full_pca.png")\nplt.show()'

# 3. OVERSAMPLING COM SMOTE

In [11]:
smote = SMOTE(random_state=42, k_neighbors=5)
X_smote, y_smote = smote.fit_resample(X, y)
print("\nDistribuição das classes após SMOTE:")
print(pd.Series(y_smote).value_counts())


Distribuição das classes após SMOTE:
product_5
Credit Reporting              1160112
Debt Collection               1160112
Loans                         1160112
Bank Accounts and Services    1160112
Credit Card Services          1160112
Name: count, dtype: int64


In [12]:
'''# Visualização com PCA após o SMOTE
X_smote_pca = pca.transform(X_smote.toarray())
df_smote_pca = pd.DataFrame(X_smote_pca, columns=["PC1", "PC2"])
df_smote_pca["Class"] = y_smote

plt.figure()
sns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_smote_pca, alpha=0.6)
plt.title("Distribuição das Classes Após o SMOTE (Dataset Completo)")
plt.legend()
plt.tight_layout()
plt.savefig("smote_after_full_pca.png")
plt.show()'''

'# Visualização com PCA após o SMOTE\nX_smote_pca = pca.transform(X_smote.toarray())\ndf_smote_pca = pd.DataFrame(X_smote_pca, columns=["PC1", "PC2"])\ndf_smote_pca["Class"] = y_smote\n\nplt.figure()\nsns.scatterplot(x="PC1", y="PC2", hue="Class", data=df_smote_pca, alpha=0.6)\nplt.title("Distribuição das Classes Após o SMOTE (Dataset Completo)")\nplt.legend()\nplt.tight_layout()\nplt.savefig("smote_after_full_pca.png")\nplt.show()'

# 4. TREINAR E AVALIAR COM LOGISTIC REGRESSION L2

In [13]:
def train_and_evaluate_lr(X, y, scenario_name):    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    lr_l2 = LogisticRegression(penalty='l2', max_iter=1000, solver='liblinear', random_state=42)
    lr_l2.fit(X_train, y_train)
    y_pred = lr_l2.predict(X_test)
    print(f"Logistic Regression ({scenario_name}):\n", classification_report(y_test, y_pred, zero_division=0))
    joblib.dump(lr_l2, f'lr_l2_{scenario_name.lower().replace(" ", "_")}_3percent.joblib')

train_and_evaluate_lr(X_smote, y_smote, "SMOTE Oversampling Full Dataset")

Logistic Regression (SMOTE Oversampling Full Dataset):
                             precision    recall  f1-score   support

Bank Accounts and Services       0.89      0.92      0.91    231263
      Credit Card Services       0.87      0.83      0.85    232681
          Credit Reporting       0.87      0.85      0.86    232040
           Debt Collection       0.85      0.82      0.84    232277
                     Loans       0.85      0.90      0.87    231851

                  accuracy                           0.87   1160112
                 macro avg       0.87      0.87      0.87   1160112
              weighted avg       0.87      0.87      0.87   1160112

