In [2]:
# !pip install catboost

In [1]:
# Built-in libraries
import re
import string
import warnings

# Third-party libraries for data handling and processing
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tqdm import tqdm
import nltk
nltk.download('stopwords')

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import KeyedVectors

# Pre-processing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE

# Model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Machine Learning Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# Deep Learning Libraries
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, LSTM
from keras.callbacks import EarlyStopping

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt


# Miscellaneous
stop_words = set(stopwords.words('english'))
tqdm.pandas()
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# import google drive

# from google.colab import drive
# drive.mount('/content/drive')


[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2023-09-30 19:35:45.643996: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# google: '/content/drive/My Drive/Colab Notebooks/assets/'

file01 = 'complaints.csv'
file02 = 'GoogleNews-vectors-negative300.bin.gz'

# file01 = '/content/drive/My Drive/Colab Notebooks/assets/complaints.csv'
# file02 = '/content/drive/My Drive/Colab Notebooks/assets/GoogleNews-vectors-negative300.bin.gz'

DATA = pd.read_csv(file01)
# EMBEDDING = KeyedVectors.load_word2vec_format(file02, binary=True)


## functions

In [3]:
# preprocess the narrative column

def preprocess_narrative(text):
    # Lowercase
    text = text.lower()
    
    # Remove XXXX like pattern
    text = re.sub(r'x{2,}', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords
    text = " ".join([word for word in text.split() if word not in stop_words])

    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Remove leading and trailing spaces
    text = text.strip()

    return text


# define a function get mean word2vec vector for a narrative

def get_mean_word2vec(word2vec, narrative):

    # initialize vector
    vector = np.zeros(300)

    # get all words in narrative
    words = narrative.split()
    num_words = len(words)

    if num_words == 0:  # edge case: empty narrative
        return vector

    # calculate word vectors using list comprehension
    word_vectors = [word2vec[word] for word in words if word in word2vec]

    if word_vectors:
        vector = np.mean(word_vectors, axis=0)

    return vector


# define a function get mean tfidf weighted word2vec vector for a narrative
def get_mean_tfidf_weighted_word2vec(word2vec, tfidf_features, tfidf_weights, narrative):

    # initialize vector
    vector = np.zeros(300)

    # get all words in narrative
    words = narrative.split()
    num_words = len(words)

    if num_words == 0:  # edge case: empty narrative
        return vector

    # pre-calculate word-to-index mapping for tfidf_features for O(1) lookup
    word_to_index = {word: idx for idx, word in enumerate(tfidf_features)}

    # calculate word vectors using list comprehension
    word_vectors = [
        word2vec[word] * tfidf_weights[word_to_index[word]]
        for word in words if word in word2vec and word in word_to_index
    ]

    if word_vectors:
        vector = np.sum(word_vectors, axis=0) / num_words

    return vector


# define a function to train and evaluate a model

def train_evaluate_model(model, X_train, y_train, X_test, y_test):

    print("Training the model...")
    # print model name
    print(model.__class__.__name__)

    # train the model
    model.fit(X_train, y_train)

    # predict on test data
    y_pred = model.predict(X_test)

    # print f1 score
    print("F1 Score: {:.2f}".format(f1_score(y_test, y_pred)))
    print()

    # print confusion matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print()
    
    # print classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    return model

## get fresh copy to df from DATA


In [4]:
df = DATA.copy()

# drop narrative column and disputed column are null
df.dropna(subset=['Consumer complaint narrative', 'Consumer disputed?'], inplace=True)

df.shape

(164034, 18)

In [5]:
# preprocess the narrative column

narrative_processed = df['Consumer complaint narrative'].progress_apply(preprocess_narrative)

  0%|          | 0/164034 [00:00<?, ?it/s]

100%|██████████| 164034/164034 [00:18<00:00, 8811.81it/s] 


In [6]:
y = df['Consumer disputed?']

test: text only, bow vs tf vs w2v vs tfidfw2v

In [6]:
# bow

bow_vectorizer = CountVectorizer(max_features=5000)

bow = bow_vectorizer.fit_transform(narrative_processed)

bow.shape

# tfidf

tfidf_vectorizer = TfidfVectorizer(max_features=5000)

tfidf = tfidf_vectorizer.fit_transform(narrative_processed)

tfidf.shape


(164034, 5000)

In [None]:

# word2vec

word2vec = EMBEDDING

word2vec_vectors = np.array([
    get_mean_word2vec(word2vec, narrative)
    for narrative in tqdm(narrative_processed)
])

word2vec_vectors.shape

# tfidf weighted word2vec

tfidf_features = tfidf_vectorizer.get_feature_names_out()
tfidf_weights = tfidf_vectorizer.idf_

tfidf_weighted_word2vec_vectors = np.array([

    get_mean_tfidf_weighted_word2vec(word2vec, tfidf_features, tfidf_weights, narrative)
    for narrative in tqdm(narrative_processed)

])

tfidf_weighted_word2vec_vectors.shape


In [None]:

# label encode target variable

le = LabelEncoder()

df['Consumer disputed?'] = le.fit_transform(df['Consumer disputed?'])

y = df['Consumer disputed?']

# split data into train and test sets for each feature set

X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(bow, y, test_size=0.2, random_state=42)

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf, y, test_size=0.2, random_state=42)

X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(word2vec_vectors, y, test_size=0.2, random_state=42)

X_train_tfidf_weighted_word2vec, X_test_tfidf_weighted_word2vec, y_train_tfidf_weighted_word2vec, y_test_tfidf_weighted_word2vec = train_test_split(tfidf_weighted_word2vec_vectors, y, test_size=0.2, random_state=42)

# train and evaluate models for each feature set

# Naive Bayes

print("Naive Bayes")

print()

print("Bag of Words")

nb_bow = train_evaluate_model(MultinomialNB(), X_train_bow, y_train_bow, X_test_bow, y_test_bow)

print()

print("TF-IDF")

nb_tfidf = train_evaluate_model(MultinomialNB(), X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf)

print()

print("Word2Vec")

# Naive Bayes does not support negative values, scale word2vec vectors

scaler = MinMaxScaler()

X_train_word2vec_scaled = scaler.fit_transform(X_train_word2vec)

X_test_word2vec_scaled = scaler.transform(X_test_word2vec)

nb_word2vec = train_evaluate_model(MultinomialNB(), X_train_word2vec_scaled, y_train_word2vec, X_test_word2vec_scaled, y_test_word2vec)

print()

print("TF-IDF Weighted Word2Vec")

# Naive Bayes does not support negative values, scale tfidf weighted word2vec vectors

scaler = MinMaxScaler()

X_train_tfidf_weighted_word2vec_scaled = scaler.fit_transform(X_train_tfidf_weighted_word2vec)

X_test_tfidf_weighted_word2vec_scaled = scaler.transform(X_test_tfidf_weighted_word2vec)

nb_tfidf_weighted_word2vec = train_evaluate_model(MultinomialNB(), X_train_tfidf_weighted_word2vec_scaled, y_train_tfidf_weighted_word2vec, X_test_tfidf_weighted_word2vec_scaled, y_test_tfidf_weighted_word2vec)

print()





In [9]:
# smote to handle class imbalance for each feature set

smote = SMOTE(random_state=42)

X_train_bow_smote, y_train_bow_smote = smote.fit_resample(X_train_bow, y_train_bow)
X_train_tfidf_smote, y_train_tfidf_smote = smote.fit_resample(X_train_tfidf, y_train_tfidf)
X_train_word2vec_smote, y_train_word2vec_smote = smote.fit_resample(X_train_word2vec, y_train_word2vec)
X_train_tfidf_weighted_word2vec_smote, y_train_tfidf_weighted_word2vec_smote = smote.fit_resample(X_train_tfidf_weighted_word2vec, y_train_tfidf_weighted_word2vec)

# train and evaluate models for each feature set with smote

# Naive Bayes

print("Naive Bayes with SMOTE")
print()

print("Bag of Words")
nb_bow_smote = train_evaluate_model(MultinomialNB(), X_train_bow_smote, y_train_bow_smote, X_test_bow, y_test_bow)

print()
print("TF-IDF")
nb_tfidf_smote = train_evaluate_model(MultinomialNB(), X_train_tfidf_smote, y_train_tfidf_smote, X_test_tfidf, y_test_tfidf)

print()
print("Word2Vec")
# Naive Bayes does not support negative values, scale word2vec vectors
scaler = MinMaxScaler()
X_train_word2vec_smote_scaled = scaler.fit_transform(X_train_word2vec_smote)
X_test_word2vec_scaled = scaler.transform(X_test_word2vec)
nb_word2vec_smote = train_evaluate_model(MultinomialNB(), X_train_word2vec_smote_scaled, y_train_word2vec_smote, X_test_word2vec_scaled, y_test_word2vec)

print()
print("TF-IDF Weighted Word2Vec")

# Naive Bayes does not support negative values, scale tfidf weighted word2vec vectors
scaler = MinMaxScaler()
X_train_tfidf_weighted_word2vec_smote_scaled = scaler.fit_transform(X_train_tfidf_weighted_word2vec_smote)
X_test_tfidf_weighted_word2vec_scaled = scaler.transform(X_test_tfidf_weighted_word2vec)
nb_tfidf_weighted_word2vec_smote = train_evaluate_model(MultinomialNB(), X_train_tfidf_weighted_word2vec_smote_scaled, y_train_tfidf_weighted_word2vec_smote, X_test_tfidf_weighted_word2vec_scaled, y_test_tfidf_weighted_word2vec)

print()



Naive Bayes with SMOTE

Bag of Words
F1 Score: 0.33

Confusion Matrix:
[[17051  8618]
 [ 4024  3114]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.66      0.73     25669
           1       0.27      0.44      0.33      7138

    accuracy                           0.61     32807
   macro avg       0.54      0.55      0.53     32807
weighted avg       0.69      0.61      0.64     32807


TF-IDF
F1 Score: 0.37

Confusion Matrix:
[[15516 10153]
 [ 3160  3978]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.60      0.70     25669
           1       0.28      0.56      0.37      7138

    accuracy                           0.59     32807
   macro avg       0.56      0.58      0.54     32807
weighted avg       0.71      0.59      0.63     32807


Word2Vec
F1 Score: 0.35

Confusion Matrix:
[[15164 10505]
 [ 3401  3737]]

Classification Report:
              precision 

mark: straight training, bow vs tf vs w2v vs tfidfw2v

bow 0.32
tfidf 0.02
w2v 0.00
tfidfw2v 0.00

mark: with smote

bow: 0.33
tfidf: 0.37
w2c: 0.35
tfidfw2v: 0.35

In [11]:
# Logistic Regression without SMOTE for each feature set

print("Logistic Regression without SMOTE")

print()

print("Bag of Words")

lr_bow = train_evaluate_model(LogisticRegression(), X_train_bow, y_train_bow, X_test_bow, y_test_bow)

print()

print("TF-IDF")

lr_tfidf = train_evaluate_model(LogisticRegression(), X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf)

print()

print("Word2Vec")

lr_word2vec = train_evaluate_model(LogisticRegression(), X_train_word2vec, y_train_word2vec, X_test_word2vec, y_test_word2vec)

print()

print("TF-IDF Weighted Word2Vec")

lr_tfidf_weighted_word2vec = train_evaluate_model(LogisticRegression(), X_train_tfidf_weighted_word2vec, y_train_tfidf_weighted_word2vec, X_test_tfidf_weighted_word2vec, y_test_tfidf_weighted_word2vec)

print()



Logistic Regression without SMOTE

Bag of Words
F1 Score: 0.19

Confusion Matrix:
[[24496  1173]
 [ 6277   861]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.95      0.87     25669
           1       0.42      0.12      0.19      7138

    accuracy                           0.77     32807
   macro avg       0.61      0.54      0.53     32807
weighted avg       0.71      0.77      0.72     32807


TF-IDF
F1 Score: 0.09

Confusion Matrix:
[[25372   297]
 [ 6791   347]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.99      0.88     25669
           1       0.54      0.05      0.09      7138

    accuracy                           0.78     32807
   macro avg       0.66      0.52      0.48     32807
weighted avg       0.73      0.78      0.71     32807


Word2Vec
F1 Score: 0.01

Confusion Matrix:
[[25629    40]
 [ 7106    32]]

Classification Report:
             

In [10]:
# Logistic Regression with SMOTE for each feature set

print("Logistic Regression with SMOTE")

print()

print("Bag of Words")

lr_bow_smote = train_evaluate_model(LogisticRegression(), X_train_bow_smote, y_train_bow_smote, X_test_bow, y_test_bow)

print()

print("TF-IDF")

lr_tfidf_smote = train_evaluate_model(LogisticRegression(), X_train_tfidf_smote, y_train_tfidf_smote, X_test_tfidf, y_test_tfidf)

print()

print("Word2Vec")

lr_word2vec_smote = train_evaluate_model(LogisticRegression(), X_train_word2vec_smote, y_train_word2vec_smote, X_test_word2vec, y_test_word2vec)

print()

print("TF-IDF Weighted Word2Vec")

lr_tfidf_weighted_word2vec_smote = train_evaluate_model(LogisticRegression(), X_train_tfidf_weighted_word2vec_smote, y_train_tfidf_weighted_word2vec_smote, X_test_tfidf_weighted_word2vec, y_test_tfidf_weighted_word2vec)

print()

Logistic Regression with SMOTE

Bag of Words
F1 Score: 0.30

Confusion Matrix:
[[17877  7792]
 [ 4502  2636]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.70      0.74     25669
           1       0.25      0.37      0.30      7138

    accuracy                           0.63     32807
   macro avg       0.53      0.53      0.52     32807
weighted avg       0.68      0.63      0.65     32807


TF-IDF
F1 Score: 0.37

Confusion Matrix:
[[17324  8345]
 [ 3575  3563]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.67      0.74     25669
           1       0.30      0.50      0.37      7138

    accuracy                           0.64     32807
   macro avg       0.56      0.59      0.56     32807
weighted avg       0.71      0.64      0.66     32807


Word2Vec
F1 Score: 0.36

Confusion Matrix:
[[14920 10749]
 [ 3207  3931]]

Classification Report:
              pr

l r w/ smote

bow 0.30
tfidf 0.37

In [9]:
df.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [7]:
# feature engineering: days between received and sent date, convert to int type

df['Date received'] = pd.to_datetime(df['Date received'])
df['Date sent to company'] = pd.to_datetime(df['Date sent to company'])

df['days_between_received_sent'] = (df['Date sent to company'] - df['Date received']).dt.days
df['days_between_received_sent'] = df['days_between_received_sent'].astype(int)

# drop Date received and Date sent to company columns

df.drop(columns=['Date received', 'Date sent to company'], inplace=True)



In [8]:
# Product, Subproduct, Issue, Subisses, fill na with 'Not Provided'

df['Product'].fillna('Not Provided', inplace=True)
df['Sub-product'].fillna('Not Provided', inplace=True)

df['Issue'].fillna('Not Provided', inplace=True)
df['Sub-issue'].fillna('Not Provided', inplace=True)

# feature engineering: combine Product, Subproduct as product_pubproduct
df['product_subproduct'] = df['Product'] + ' ' + df['Sub-product']

# feature engineering: combine Issue, Subissue as issue_subissue
df['issue_subissue'] = df['Issue'] + ' ' + df['Sub-issue']

# drop Product, Subproduct, Issue, Subissue columns
df.drop(columns=['Product', 'Sub-product', 'Issue', 'Sub-issue'], inplace=True)

In [9]:
df.columns

Index(['Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Company response to consumer', 'Timely response?',
       'Consumer disputed?', 'Complaint ID', 'days_between_received_sent',
       'product_subproduct', 'issue_subissue'],
      dtype='object')

In [10]:
# drop Consumer complaint narrative column, Zip code column, Consumer consent provided? column, Submitted via, Timely responses? column, Complaint ID column

df.drop(columns=['Consumer complaint narrative', 'ZIP code', 'Consumer consent provided?', 'Submitted via', 'Timely response?', 'Complaint ID'], inplace=True)

df.columns

Index(['Company public response', 'Company', 'State', 'Tags',
       'Company response to consumer', 'Consumer disputed?',
       'days_between_received_sent', 'product_subproduct', 'issue_subissue'],
      dtype='object')

In [11]:
# fill na with 'Not Provided'

df.fillna('Not Provided', inplace=True)



In [12]:
df.columns

Index(['Company public response', 'Company', 'State', 'Tags',
       'Company response to consumer', 'Consumer disputed?',
       'days_between_received_sent', 'product_subproduct', 'issue_subissue'],
      dtype='object')

In [13]:
# feature engineering: public response and response to consumer, combine them as public_response_response_to_consumer

df['public_response_response_to_consumer'] = df['Company public response'] + ' ' + df['Company response to consumer']

In [14]:
df.nunique()

Company public response                   11
Company                                 3148
State                                     63
Tags                                       4
Company response to consumer               5
Consumer disputed?                         2
days_between_received_sent               269
product_subproduct                        52
issue_subissue                           134
public_response_response_to_consumer      44
dtype: int64

In [15]:
# drop Company public response and Company response to consumer columns

df.drop(columns=['Company public response', 'Company response to consumer'], inplace=True)

df.columns

Index(['Company', 'State', 'Tags', 'Consumer disputed?',
       'days_between_received_sent', 'product_subproduct', 'issue_subissue',
       'public_response_response_to_consumer'],
      dtype='object')

In [16]:
# label encode disputed column

le = LabelEncoder()

df['Consumer disputed?'] = le.fit_transform(df['Consumer disputed?'])

y = df['Consumer disputed?']



In [17]:
# feature engineering: compute the ratio of complaints disputed to total complaints for each company

df['company_complaints_disputed_ratio'] = df.groupby('Company')['Consumer disputed?'].transform(lambda x: x.sum() / x.count())

In [18]:
# one-hot encode all categorical features

df = pd.get_dummies(df, drop_first=True)

df.columns

Index(['Consumer disputed?', 'days_between_received_sent',
       'company_complaints_disputed_ratio',
       'Company_1ST ALLIANCE LENDING, LLC',
       'Company_1ST PREFERENCE MORTGAGE CORP',
       'Company_1st Capital Mortgage, LLC',
       'Company_1st Franklin Financial Corporation',
       'Company_1st Money Center, Inc., Hurst, TX Branch',
       'Company_21ST MORTGAGE CORP.', 'Company_2233 Paradise Road LLC',
       ...
       'public_response_response_to_consumer_Company disputes the facts presented in the complaint Closed with non-monetary relief',
       'public_response_response_to_consumer_Company has responded to the consumer and the CFPB and chooses not to provide a public response Closed',
       'public_response_response_to_consumer_Company has responded to the consumer and the CFPB and chooses not to provide a public response Closed with explanation',
       'public_response_response_to_consumer_Company has responded to the consumer and the CFPB and chooses not to pr

In [19]:
# X
X = df.drop(columns=['Consumer disputed?'])

In [20]:
# concatenate X and bow features

X_bow = np.concatenate((X.values, bow.toarray()), axis=1)

X_bow.shape

(164034, 8441)

In [21]:
# concatenate X and tfidf features

X_tfidf = np.concatenate((X.values, tfidf.toarray()), axis=1)

X_tfidf.shape

(164034, 8441)

In [22]:
# smote to handle class imbalance of X_bow and X_tfidf

smote = SMOTE(random_state=42)

X_bow_smote, y_bow_smote = smote.fit_resample(X_bow, y)
X_tfidf_smote, y_tfidf_smote = smote.fit_resample(X_tfidf, y)

# print shape of X_bow_smote and X_tfidf_smote
print(X_bow_smote.shape, y_bow_smote.shape)
print(X_tfidf_smote.shape, y_tfidf_smote.shape)

(256454, 8441) (256454,)
(256454, 8441) (256454,)


In [24]:
# train test split for X_bow and X_tfidf smoted

X_train_bow_smote, X_test_bow_smote, y_train_bow_smote, y_test_bow_smote = train_test_split(X_bow_smote, y_bow_smote, test_size=0.2, random_state=42)

X_train_tfidf_smote, X_test_tfidf_smote, y_train_tfidf_smote, y_test_tfidf_smote = train_test_split(X_tfidf_smote, y_tfidf_smote, test_size=0.2, random_state=42)

# print shape of X_train_bow_smote, X_test_bow_smote, y_train_bow_smote, y_test_bow_smote
print(X_train_bow_smote.shape, X_test_bow_smote.shape, y_train_bow_smote.shape, y_test_bow_smote.shape)

# print shape of X_train_tfidf_smote, X_test_tfidf_smote, y_train_tfidf_smote, y_test_tfidf_smote
print(X_train_tfidf_smote.shape, X_test_tfidf_smote.shape, y_train_tfidf_smote.shape, y_test_tfidf_smote.shape)

(205163, 8441) (51291, 8441) (205163,) (51291,)
(205163, 8441) (51291, 8441) (205163,) (51291,)


In [25]:
# lightgbm for bow

lgbm_bow = train_evaluate_model(LGBMClassifier(), X_train_bow_smote, y_train_bow_smote, X_test_bow_smote, y_test_bow_smote)



Training the model...
LGBMClassifier
[LightGBM] [Info] Number of positive: 102570, number of negative: 102593
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.594000 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 843543
[LightGBM] [Info] Number of data points in the train set: 205163, number of used features: 6083
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499944 -> initscore=-0.000224
[LightGBM] [Info] Start training from score -0.000224
F1 Score: 0.85

Confusion Matrix:
[[25346   288]
 [ 6594 19063]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.99      0.88     25634
           1       0.99      0.74      0.85     25657

    accuracy                           0.87     51291
   macro avg       0.89      0.87      0.86     51291
weighted avg       0.89      0.87      0.86     51291



In [26]:
# lightgbm for tfidf

lgbm_tfidf = train_evaluate_model(LGBMClassifier(), X_train_tfidf_smote, y_train_tfidf_smote, X_test_tfidf_smote, y_test_tfidf_smote)



Training the model...
LGBMClassifier
[LightGBM] [Info] Number of positive: 102570, number of negative: 102593
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.157780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1037029
[LightGBM] [Info] Number of data points in the train set: 205163, number of used features: 6064
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499944 -> initscore=-0.000224
[LightGBM] [Info] Start training from score -0.000224
F1 Score: 0.82

Confusion Matrix:
[[23298  2336]
 [ 6225 19432]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.91      0.84     25634
           1       0.89      0.76      0.82     25657

    accuracy                           0.83     51291
   macro avg       0.84      0.83      0.83     51291
weighted avg       0.84      0.83      0.83     51291



In [27]:
# catboost for bow

cat_bow = train_evaluate_model(CatBoostClassifier(verbose=False, random_state=42), X_train_bow_smote, y_train_bow_smote, X_test_bow_smote, y_test_bow_smote)



Training the model...
CatBoostClassifier
F1 Score: 0.85

Confusion Matrix:
[[25101   533]
 [ 6325 19332]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.88     25634
           1       0.97      0.75      0.85     25657

    accuracy                           0.87     51291
   macro avg       0.89      0.87      0.86     51291
weighted avg       0.89      0.87      0.86     51291



In [28]:
# catboost for tfidf

cat_tfidf = train_evaluate_model(CatBoostClassifier(verbose=False, random_state=42), X_train_tfidf_smote, y_train_tfidf_smote, X_test_tfidf_smote, y_test_tfidf_smote)

Training the model...
CatBoostClassifier
F1 Score: 0.84

Confusion Matrix:
[[23893  1741]
 [ 5997 19660]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.93      0.86     25634
           1       0.92      0.77      0.84     25657

    accuracy                           0.85     51291
   macro avg       0.86      0.85      0.85     51291
weighted avg       0.86      0.85      0.85     51291



In [29]:
# xgboost for bow

xgb_bow = train_evaluate_model(XGBClassifier(random_state=42), X_train_bow_smote, y_train_bow_smote, X_test_bow_smote, y_test_bow_smote)

Training the model...
XGBClassifier
F1 Score: 0.85

Confusion Matrix:
[[25109   525]
 [ 6396 19261]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.88     25634
           1       0.97      0.75      0.85     25657

    accuracy                           0.87     51291
   macro avg       0.89      0.87      0.86     51291
weighted avg       0.89      0.87      0.86     51291



In [30]:
# xgboost for tfidf

xgb_tfidf = train_evaluate_model(XGBClassifier(random_state=42), X_train_tfidf_smote, y_train_tfidf_smote, X_test_tfidf_smote, y_test_tfidf_smote)


Training the model...
XGBClassifier
F1 Score: 0.82

Confusion Matrix:
[[23321  2313]
 [ 6075 19582]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.91      0.85     25634
           1       0.89      0.76      0.82     25657

    accuracy                           0.84     51291
   macro avg       0.84      0.84      0.84     51291
weighted avg       0.84      0.84      0.84     51291



bow nb: 0.62<br>
tfidf nb: 0.66<br>

bow lr: 0.68<br>
tfidf lr: 0.67<br>

bow light 0.85<br>
tfidf light 0.82<br>

bow cat: 0.85<br>
tfidf cat: 0.84<br>

bow xgb: 0.85<br>
tfidf xgb: 0.82<br>

## deep learning

In [31]:
# train a fully connected neural network ANN for bow

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train_bow_smote.shape[1]))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(X_train_bow_smote, y_train_bow_smote, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=0)

# plot loss and accuracy

plt.figure(figsize=(12, 8))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.show()

# predict on test data
y_pred = model.predict_classes(X_test_bow_smote)

# print f1 score
print("F1 Score: {:.2f}".format(f1_score(y_test_bow_smote, y_pred)))

# print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_bow_smote, y_pred))



In [None]:
# wrap the model in a function

def train_evaluate_ann(X_train, y_train, X_test, y_test):

    # define model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    
    # fit model
    history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=0)
    
    # plot loss and accuracy
    plt.figure(figsize=(12, 8))
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.legend()
    plt.show()
    
    # predict on test data
    y_pred = model.predict_classes(X_test)
    
    # print f1 score
    print("F1 Score: {:.2f}".format(f1_score(y_test, y_pred)))
    
    # print confusion matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    return model

# train and evaluate ANN for tfidf

ann_tfidf = train_evaluate_ann(X_train_tfidf_smote, y_train_tfidf_smote, X_test_tfidf_smote, y_test_tfidf_smote)