In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('complaints.csv')

  df = pd.read_csv('complaints.csv')


In [3]:
df.head(2).T

Unnamed: 0,0,1
Date received,2023-11-18,2023-11-18
Product,Credit reporting or other personal consumer re...,Credit reporting or other personal consumer re...
Sub-product,Credit reporting,Credit reporting
Issue,Incorrect information on your report,Incorrect information on your report
Sub-issue,Account status incorrect,Information belongs to someone else
Consumer complaint narrative,,
Company public response,,
Company,"EQUIFAX, INC.","EQUIFAX, INC."
State,MI,GA
ZIP code,49037,30052


In [4]:
# leave only Product and Consumer complaint narrative

df = df[['Product', 'Consumer complaint narrative']]

df.shape

(4310354, 2)

In [5]:
pd.DataFrame(df.Product.unique()).values

array([['Credit reporting or other personal consumer reports'],
       ['Debt collection'],
       ['Credit card'],
       ['Checking or savings account'],
       ['Payday loan, title loan, personal loan, or advance loan'],
       ['Prepaid card'],
       ['Student loan'],
       ['Mortgage'],
       ['Money transfer, virtual currency, or money service'],
       ['Vehicle loan or lease'],
       ['Debt or credit management'],
       ['Credit reporting, credit repair services, or other personal consumer reports'],
       ['Payday loan, title loan, or personal loan'],
       ['Credit card or prepaid card'],
       ['Credit reporting'],
       ['Bank account or service'],
       ['Consumer Loan'],
       ['Payday loan'],
       ['Other financial service'],
       ['Money transfers'],
       ['Virtual currency']], dtype=object)

In [6]:
df.replace({'Product':
             {
              'Credit reporting':'Credit reporting, credit repair services, or other personal consumer reports',
              'Credit reporting, repair, or other':'Credit reporting, credit repair services, or other personal consumer reports',
              'Credit reporting or other personal consumer reports':'Credit reporting, credit repair services, or other personal consumer reports',
              'Credit card': 'Credit card or prepaid card',
              'Prepaid card': 'Credit card or prepaid card',
              'Payday loan': 'Payday loan, title loan, or personal loan',
              'Payday loan, title loan, or personal loan':'Payday loan, title loan, personal loan, or advance loan',
              'Money transfer': 'Money transfer, virtual currency, or money service',
              'Virtual currency': 'Money transfer, virtual currency, or money service',
              'Bank account or service':'Money transfer, virtual currency, or money service',
              'Debt or credit management':'Credit reporting, credit repair services, or other personal consumer reports',
              'Consumer Loan':'Credit reporting, credit repair services, or other personal consumer reports'}},
            inplace= True)

In [7]:
df.Product.value_counts()

Credit reporting, credit repair services, or other personal consumer reports    2582005
Debt collection                                                                  516787
Mortgage                                                                         387297
Credit card or prepaid card                                                      313407
Checking or savings account                                                      189772
Money transfer, virtual currency, or money service                               147209
Student loan                                                                      80271
Vehicle loan or lease                                                             49515
Payday loan, title loan, personal loan, or advance loan                           32138
Payday loan, title loan, or personal loan                                          5541
Money transfers                                                                    5354
Other financial service         

In [8]:
df.head()

Unnamed: 0,Product,Consumer complaint narrative
0,"Credit reporting, credit repair services, or o...",
1,"Credit reporting, credit repair services, or o...",
2,"Credit reporting, credit repair services, or o...",
3,"Credit reporting, credit repair services, or o...",
4,"Credit reporting, credit repair services, or o...",


In [9]:
# number of nulls in each column

df.isnull().sum(), len(df)

(Product                               0
 Consumer complaint narrative    2743355
 dtype: int64,
 4310354)

In [10]:
df['category_id'] = df['Product'].factorize()[0]

df.head()

Unnamed: 0,Product,Consumer complaint narrative,category_id
0,"Credit reporting, credit repair services, or o...",,0
1,"Credit reporting, credit repair services, or o...",,0
2,"Credit reporting, credit repair services, or o...",,0
3,"Credit reporting, credit repair services, or o...",,0
4,"Credit reporting, credit repair services, or o...",,0


In [11]:
del df['Product']

df.head()

Unnamed: 0,Consumer complaint narrative,category_id
0,,0
1,,0
2,,0
3,,0
4,,0


In [12]:
# filter not null

df2 = df[df['Consumer complaint narrative'].notnull()]

df2.head()

Unnamed: 0,Consumer complaint narrative,category_id
11,I am XXXX XXXXt that was XXXX with the XXXX ...,1
12,I request original copy of account from pro co...,0
58,It has come to my attention. That my personal ...,0
106,"As of today XX/XX/XXXX, XXXX the current balan...",8
108,"XXXX, XXXX XXXX XXXX XXXX at XXXX XXXX XXXX XX...",0


In [13]:
X = df2['Consumer complaint narrative']
y = df2['category_id']

In [14]:
len(X)

1566999

In [15]:
from sklearn.model_selection import train_test_split

X, _, y, _ = train_test_split(X, y, train_size=10_0000)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
from itertools import chain

def clean_text(text):
    return re.sub(r'[^a-zA-Z]', '', text)

X_train_cleaned = [clean_text(text) for text in X_train]
X_test_cleaned = [clean_text(text) for text in X_test]

In [18]:
all_text = ''.join(X_train_cleaned)
vocab = sorted(set(all_text))
char_to_int = {ch: i for i, ch in enumerate(vocab)}

def text_to_int(text):
    return [char_to_int[ch] for ch in text]

X_train_encoded = [text_to_int(text) for text in X_train_cleaned]
X_test_encoded = [text_to_int(text) for text in X_test_cleaned]

max_len = 200

X_train_encoded = [text[:max_len] for text in X_train_encoded]
X_test_encoded = [text[:max_len] for text in X_test_encoded]

In [19]:
max_length = max(len(text) for text in X_train_encoded)
X_train_padded = torch.tensor([i + [0]*(max_length-len(i)) for i in X_train_encoded])
y_train = torch.tensor(y_train.to_numpy())

X_test_padded = torch.tensor([i + [0]*(max_length-len(i)) for i in X_test_encoded])
y_test = torch.tensor(y_test.to_numpy())

max_length, X_train_padded.shape, y_train.shape, X_test_padded.shape, y_test.shape

(200,
 torch.Size([80000, 200]),
 torch.Size([80000]),
 torch.Size([20000, 200]),
 torch.Size([20000]))

In [20]:
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(X_train_padded, y_train)
train_loader = DataLoader(train_data, batch_size=50, shuffle=True)

test_data = TensorDataset(X_test_padded, y_test)
test_loader = DataLoader(test_data, batch_size=50, shuffle=True)

In [21]:
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(CNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(embedding_dim, hidden_dim, kernel_size=5)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = self.relu(x)
        x = torch.max(x, dim=2).values
        x = self.fc(x)
        return x


In [22]:
model = CNNModel(len(vocab), 50, 100, len(set(y))).to('cuda')

In [23]:
def one_epoch(model, criterion, optimizer, dl):
    losses = []
    accs = []

    for x, y in dl:
        x = x.to('cuda')
        y = y.to('cuda')
        
        loss = criterion(model(x), y)

        if optimizer:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        losses.append(loss.item())

        acc = (model(x).argmax(1) == y).float().mean()

        accs.append(acc.item())
    
    return np.mean(losses), np.mean(accs)

In [24]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.004)

In [25]:
from tqdm.notebook import tqdm

epochs = 20

train_losses = []
test_losses = []
accs = []

for epoch in tqdm(range(epochs)):
    model.train()
    train_loss, _ = one_epoch(model, criterion, optimizer, train_loader)

    with torch.no_grad():
        model.eval()
        test_loss, acc = one_epoch(model, criterion, None, test_loader)

    print(f'Epoch {epoch+1} | Train loss: {train_loss:.3f} | Test loss: {test_loss:.3f} | Test acc: {acc:.3f}')

    train_losses.append(train_loss)
    test_losses.append(test_loss)
    accs.append(acc)

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1 | Train loss: 0.804 | Test loss: 0.696 | Test acc: 0.774
Epoch 2 | Train loss: 0.679 | Test loss: 0.654 | Test acc: 0.784
Epoch 3 | Train loss: 0.661 | Test loss: 0.640 | Test acc: 0.788
Epoch 4 | Train loss: 0.648 | Test loss: 0.650 | Test acc: 0.787
Epoch 5 | Train loss: 0.641 | Test loss: 0.646 | Test acc: 0.788
Epoch 6 | Train loss: 0.634 | Test loss: 0.638 | Test acc: 0.790
Epoch 7 | Train loss: 0.627 | Test loss: 0.640 | Test acc: 0.792
Epoch 8 | Train loss: 0.622 | Test loss: 0.644 | Test acc: 0.786
Epoch 9 | Train loss: 0.614 | Test loss: 0.648 | Test acc: 0.792
Epoch 10 | Train loss: 0.610 | Test loss: 0.645 | Test acc: 0.787
Epoch 11 | Train loss: 0.603 | Test loss: 0.644 | Test acc: 0.791
Epoch 12 | Train loss: 0.601 | Test loss: 0.638 | Test acc: 0.791
Epoch 13 | Train loss: 0.597 | Test loss: 0.643 | Test acc: 0.790
Epoch 14 | Train loss: 0.594 | Test loss: 0.641 | Test acc: 0.786
Epoch 15 | Train loss: 0.590 | Test loss: 0.648 | Test acc: 0.790
Epoch 16 | Train lo