In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [2]:
# Built-in libraries
import re
import string
import warnings

# Third-party libraries for data handling and processing
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tqdm import tqdm

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import KeyedVectors

# Pre-processing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, MaxAbsScaler
from imblearn.over_sampling import SMOTE

# Model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Machine Learning Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


# Deep Learning Libraries
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, LSTM
from keras.callbacks import EarlyStopping

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt


# Miscellaneous
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tqdm.pandas()
warnings.filterwarnings("ignore")


from google.colab import drive
drive.mount('/content/drive/')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive/


In [11]:
# files
file_path_01 = '/content/drive/My Drive/Colab Notebooks/assets/complaints.csv'

file_path_03 = '/content/drive/My Drive/Colab Notebooks/assets/GoogleNews-vectors-negative300.bin.gz'

# import DATA
DATA = pd.read_csv(file_path_01)

print(DATA.shape)

(4028530, 18)


In [12]:
# import word2vec model (this take a while to load)
word2vec = KeyedVectors.load_word2vec_format(file_path_03, binary=True)

In [13]:
# helper func: preprocess the narrative column

def preprocess_narrative(text):

    # Lowercase
    text = text.lower()

    # Remove XXXX like pattern
    text = re.sub(r'x{2,}', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stopwords
    text = " ".join([word for word in text.split() if word not in stop_words])

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove leading and trailing spaces
    text = text.strip()

    return text

### Approach: Mixture of categorical columns and TFIDF w2v matrix

In [14]:
DATA.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [15]:
dfm = DATA.copy()

# drop na on Consumer complaint narrative column, Consumer disputed? column
dfm.dropna(subset=['Consumer complaint narrative', 'Consumer disputed?'], inplace=True)

# drop Tags, ZIP code, Complaint ID, Timely response? columns
dfm.drop(['ZIP code', 'Complaint ID', 'Timely response?'], axis=1, inplace=True)

# dfm unique values
dfm.nunique()

Date received                      766
Product                             12
Sub-product                         46
Issue                               91
Sub-issue                           57
Consumer complaint narrative    160940
Company public response             10
Company                           3148
State                               62
Tags                                 3
Consumer consent provided?           1
Submitted via                        1
Date sent to company               862
Company response to consumer         5
Consumer disputed?                   2
dtype: int64

In [16]:
# drop Consumer consent provided? column, submitted via column. no variance.

dfm.drop(['Consumer consent provided?', 'Submitted via'], axis=1, inplace=True)

dfm.nunique()

Date received                      766
Product                             12
Sub-product                         46
Issue                               91
Sub-issue                           57
Consumer complaint narrative    160940
Company public response             10
Company                           3148
State                               62
Tags                                 3
Date sent to company               862
Company response to consumer         5
Consumer disputed?                   2
dtype: int64

In [17]:
# convert date received and date sent to company to datetime format, and compute date sent - date received, store in column 'days' as int

dfm['Date received'] = pd.to_datetime(dfm['Date received'])
dfm['Date sent to company'] = pd.to_datetime(dfm['Date sent to company'])

dfm['days'] = (dfm['Date sent to company'] - dfm['Date received']).dt.days

# drop Date received and Date sent to company columns
dfm.drop(['Date received', 'Date sent to company'], axis=1, inplace=True)
dfm.nunique()

Product                             12
Sub-product                         46
Issue                               91
Sub-issue                           57
Consumer complaint narrative    160940
Company public response             10
Company                           3148
State                               62
Tags                                 3
Company response to consumer         5
Consumer disputed?                   2
days                               269
dtype: int64

In [18]:
# fill na with 'Unknown' in all columns

dfm.fillna('Unknown', inplace=True)

dfm.isna().sum()

Product                         0
Sub-product                     0
Issue                           0
Sub-issue                       0
Consumer complaint narrative    0
Company public response         0
Company                         0
State                           0
Tags                            0
Company response to consumer    0
Consumer disputed?              0
days                            0
dtype: int64

In [19]:
# encode disputed column to 0 and 1

dfm['Consumer disputed?'] = dfm['Consumer disputed?'].map({'No': 0, 'Yes': 1})

dfm['Consumer disputed?'].value_counts()

0    128227
1     35807
Name: Consumer disputed?, dtype: int64

In [20]:
# preprocess narrative column and store in narrative_processed column

dfm['narrative_processed'] = dfm['Consumer complaint narrative'].progress_apply(preprocess_narrative)

100%|██████████| 164034/164034 [00:16<00:00, 10068.13it/s]


In [22]:
# instantiate tfidf

tfidf = TfidfVectorizer()

# fit and transform tfidf vectorizer on narrative_processed column
tfidf.fit(dfm.narrative_processed)

# get tfidf feature names
tfidf_features = tfidf.get_feature_names_out()

# get tfidf weights
tfidf_weights = tfidf.idf_


In [23]:
# define a function get mean word2vec vector for a narrative

def get_mean_word2vec(narrative):

    # initialize vector
    vector = np.zeros(300)

    # get all words in narrative
    words = narrative.split()
    num_words = len(words)

    if num_words == 0:  # edge case: empty narrative
        return vector

    # calculate word vectors using list comprehension
    word_vectors = [word2vec[word] for word in words if word in word2vec]

    if word_vectors:
        vector = np.mean(word_vectors, axis=0)

    return vector


# define a function get mean tfidf weighted word2vec vector for a narrative
def get_mean_tfidf_weighted_word2vec(narrative):

    # initialize vector
    vector = np.zeros(300)

    # get all words in narrative
    words = narrative.split()
    num_words = len(words)

    if num_words == 0:  # edge case: empty narrative
        return vector

    # pre-calculate word-to-index mapping for tfidf_features for O(1) lookup
    word_to_index = {word: idx for idx, word in enumerate(tfidf_features)}

    # calculate word vectors using list comprehension
    word_vectors = [
        word2vec[word] * tfidf_weights[word_to_index[word]]
        for word in words if word in word2vec and word in word_to_index
    ]

    if word_vectors:
        vector = np.sum(word_vectors, axis=0) / num_words

    return vector

In [24]:

# compute mean tfidf weighted word2vec vector for each narrative (take a while)
dfm['narrative_tw'] = dfm['narrative_processed'].progress_apply(get_mean_tfidf_weighted_word2vec)

# convert dfm['narrative_tw']  to numpy array tw_array
tw_array = np.array(dfm.narrative_tw.tolist())

# print shape of tw_array
tw_array.shape

100%|██████████| 164034/164034 [54:56<00:00, 49.77it/s]


(164034, 300)

In [25]:
# copy target column to y
y = dfm['Consumer disputed?'].copy()

In [26]:
dfm.columns

Index(['Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'Tags', 'Company response to consumer', 'Consumer disputed?',
       'days', 'narrative_processed', 'narrative_tw'],
      dtype='object')

In [27]:
# drop Consumer complaint narrative, narrative_processed, Consumer disputed? columns

dfm.drop(['Consumer complaint narrative', 'narrative_processed', 'narrative_tw', 'Consumer disputed?'], axis=1, inplace=True)


In [28]:
dfm.nunique()

Product                           12
Sub-product                       47
Issue                             91
Sub-issue                         58
Company public response           11
Company                         3148
State                             63
Tags                               4
Company response to consumer       5
days                             269
dtype: int64

In [29]:
# one hot encode categorical columns

cats_cols = [
    'Product',
    'Sub-product',
    'Issue',
    'Sub-issue',
    'Company public response',
    'Company',
    'State',
    'Tags',
    'Company response to consumer',
]


dfm = pd.get_dummies(dfm, columns=cats_cols)

print(dfm.columns)

print(dfm.shape)

Index(['days', 'Product_Bank account or service', 'Product_Consumer Loan',
       'Product_Credit card', 'Product_Credit reporting',
       'Product_Debt collection', 'Product_Money transfers',
       'Product_Mortgage', 'Product_Other financial service',
       'Product_Payday loan',
       ...
       'State_WY', 'Tags_Older American', 'Tags_Older American, Servicemember',
       'Tags_Servicemember', 'Tags_Unknown',
       'Company response to consumer_Closed',
       'Company response to consumer_Closed with explanation',
       'Company response to consumer_Closed with monetary relief',
       'Company response to consumer_Closed with non-monetary relief',
       'Company response to consumer_Untimely response'],
      dtype='object', length=3440)
(164034, 3440)


In [30]:
# concatenate tw_array with dfm
X = np.concatenate((tw_array, dfm), axis=1)

# check shape of X
X.shape

(164034, 3740)

In [31]:
# smoote to oversample minority class
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# print shape of X_smote, y_smote
print(X_smote.shape, y_smote.shape)

(256454, 3740) (256454,)


In [32]:
# train test split, 20% test, random_state=42, stratify=y_smote
X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, stratify=y_smote, random_state=42, test_size=0.2)

# print shape
print(X_smote_train.shape, X_smote_test.shape, y_smote_train.shape, y_smote_test.shape)

(205163, 3740) (51291, 3740) (205163,) (51291,)


In [33]:
# train with logistic regression, wrap in a function

def train_lr(X_train, y_train, X_test, y_test):

    # instantiate logistic regression model
    lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

    # predict on test set
    y_pred = lr.predict(X_test)

    # print f1 score
    print(f"f1 score: {f1_score(y_test, y_pred)}")

    return lr


# train naive bayes

def train_naive_bayes(X_train, y_train, X_test, y_test):

    # instantiate naive bayes model
    nb = MultinomialNB()

    # fit model
    nb.fit(X_train, y_train)

    # predict on test set
    y_pred = nb.predict(X_test)

    # print f1 score
    print(f"f1 score: {f1_score(y_test, y_pred)}")

    return nb


# train lightgbm

def train_lgbm(X_train, y_train, X_test, y_test):

    # instantiate lightgbm model
    lgbm = LGBMClassifier(random_state=42).fit(X_train, y_train)

    # predict on test set
    y_pred = lgbm.predict(X_test)

    # print f1 score
    print(f"f1 score: {f1_score(y_test, y_pred)}")

    return lgbm


# train with xgboost

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="Starting in XGBoost")

def train_xgb(X_train, y_train, X_test, y_test):

    # instantiate xgboost model
    xgb = XGBClassifier(random_state=42, n_jobs=-1)

    # fit model
    xgb.fit(X_train, y_train)

    # predict on test set
    y_pred = xgb.predict(X_test)

    # print f1 score
    print(f"f1 score: {f1_score(y_test, y_pred)}")

    return xgb


# catboost in a function

def train_catboost(X_train, y_train, X_test, y_test):

    # instantiate catboost model
    catboost = CatBoostClassifier(random_state=42)

    # fit model
    catboost.fit(X_train, y_train)

    # predict on test set
    y_pred = catboost.predict(X_test)

    # print accuracy score
    print(f"accuracy score: {accuracy_score(y_test, y_pred)}")

    # print f1 score
    print(f"f1 score: {f1_score(y_test, y_pred)}")

    return catboost


# train ann

def train_ann_tw_onehot(X_train, y_train, X_test, y_test):

    # instantiate ann model
    ann = Sequential()

    # add input layer
    ann.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))

    # add hidden layers
    ann.add(Dense(64, activation='relu'))
    ann.add(Dense(32, activation='relu'))
    ann.add(Dense(16, activation='relu'))

    # add output layer
    ann.add(Dense(1, activation='sigmoid'))

    # compile model
    ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # fit model
    ann.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

    # predict on test set
    y_pred = ann.predict(X_test)

    # print f1 score
    print(f"f1 score: {f1_score(y_test, y_pred.round())}")

    return ann


In [34]:
# train with logistic regression

model_lr = train_lr(X_smote_train, y_smote_train, X_smote_test, y_smote_test)

f1 score: 0.661993306061733


In [35]:
# train with naive bayes, scale values as nb not working with negative

min_max_scaler = MinMaxScaler()

model_nb = train_naive_bayes(min_max_scaler.fit_transform(X_smote_train), y_smote_train, min_max_scaler.transform(X_smote_test), y_smote_test)

f1 score: 0.6275373134328358


In [36]:
# train lightgbm

model_lgbm = train_lgbm(X_smote_train, y_smote_train, X_smote_test, y_smote_test)

[LightGBM] [Info] Number of positive: 102582, number of negative: 102581
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 138687
[LightGBM] [Info] Number of data points in the train set: 205163, number of used features: 1395
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initscore=0.000010
[LightGBM] [Info] Start training from score 0.000010
f1 score: 0.8021170370052925


In [37]:
# train catboost

model_catboost = train_catboost(X_smote_train, y_smote_train, X_smote_test, y_smote_test)

Learning rate set to 0.100046
0:	learn: 0.6778634	total: 139ms	remaining: 2m 19s
1:	learn: 0.6672408	total: 224ms	remaining: 1m 51s
2:	learn: 0.6608011	total: 310ms	remaining: 1m 43s
3:	learn: 0.6471036	total: 396ms	remaining: 1m 38s
4:	learn: 0.6419977	total: 487ms	remaining: 1m 36s
5:	learn: 0.6374393	total: 569ms	remaining: 1m 34s
6:	learn: 0.6336227	total: 653ms	remaining: 1m 32s
7:	learn: 0.6178641	total: 735ms	remaining: 1m 31s
8:	learn: 0.6149668	total: 814ms	remaining: 1m 29s
9:	learn: 0.6082395	total: 899ms	remaining: 1m 28s
10:	learn: 0.6056475	total: 983ms	remaining: 1m 28s
11:	learn: 0.5951178	total: 1.06s	remaining: 1m 27s
12:	learn: 0.5926962	total: 1.15s	remaining: 1m 27s
13:	learn: 0.5868123	total: 1.24s	remaining: 1m 27s
14:	learn: 0.5848577	total: 1.32s	remaining: 1m 26s
15:	learn: 0.5829106	total: 1.4s	remaining: 1m 25s
16:	learn: 0.5812108	total: 1.47s	remaining: 1m 25s
17:	learn: 0.5772455	total: 1.55s	remaining: 1m 24s
18:	learn: 0.5755182	total: 1.63s	remaining: 

In [38]:
# train xgboost

model_xgb = train_xgb(X_smote_train, y_smote_train, X_smote_test, y_smote_test)

f1 score: 0.8097169377430279


In [39]:
# train ann

model_ann = train_ann_tw_onehot(X_smote_train, y_smote_train, X_smote_test, y_smote_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
f1 score: 0.7903620965719466
