In [22]:
from time import time

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# Text representation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# ML Algo
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics import classification_report

In [2]:
# Location of the training CSV, given relative to the notebook's working directory.
data_path = '../../data/train.csv'

In [3]:
# Load the whole training CSV into memory as a DataFrame.
df = pd.read_csv(data_path)

In [4]:
df.shape

(405984, 5)

In [5]:
df.columns

Index(['Unnamed: 0', 'index', 'Unnamed: 0.1', 'full_text', 'label'], dtype='object')

In [6]:
df.head()

Unnamed: 0.2,Unnamed: 0,index,Unnamed: 0.1,full_text,label
0,0,0,0,transworld systems inc. \nis trying to collect...,Debt collection
1,1,2,2,"Over the past 2 weeks, I have been receiving e...",Debt collection
2,2,3,3,Pioneer has committed several federal violatio...,Debt collection
3,3,4,5,Problem with a credit reporting company's inve...,"Credit reporting, credit repair services, or o..."
4,4,5,7,Incorrect information on your report-Account i...,"Credit reporting, credit repair services, or o..."


In [7]:
# Keep only the text and label columns by dropping the three integer
# bookkeeping columns. Reassigning (instead of inplace=True) together with
# errors='ignore' makes this cell idempotent: running it again after the
# columns are already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'index', 'Unnamed: 0.1'], errors='ignore')

In [8]:
df.columns

Index(['full_text', 'label'], dtype='object')

In [9]:
df.head()

Unnamed: 0,full_text,label
0,transworld systems inc. \nis trying to collect...,Debt collection
1,"Over the past 2 weeks, I have been receiving e...",Debt collection
2,Pioneer has committed several federal violatio...,Debt collection
3,Problem with a credit reporting company's inve...,"Credit reporting, credit repair services, or o..."
4,Incorrect information on your report-Account i...,"Credit reporting, credit repair services, or o..."


In [10]:
# Class distribution: share of all rows carried by each label, in percent
label_counts = df['label'].value_counts()
label_counts / len(df) * 100

Credit reporting, credit repair services, or other personal consumer reports    30.562288
Debt collection                                                                 21.325964
Mortgage                                                                        12.170923
Credit card or prepaid card                                                      7.813608
Credit reporting                                                                 5.164489
Student loan                                                                     4.716196
Checking or savings account                                                      4.611512
Credit card                                                                      3.244217
Bank account or service                                                          2.545913
Money transfer, virtual currency, or money service                               1.977418
Vehicle loan or lease                                                            1.946136
Consumer L

In [11]:
# Feature series (free text) and target series (category label)
X, y = df['full_text'], df['label']

In [12]:
print(X.shape, y.shape)

(405984,) (405984,)


In [13]:
# Hold out 25% of the rows (train_test_split's default, spelled out here)
# for the final evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [14]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(304488,) (304488,)
(101496,) (101496,)


In [16]:
import string
import re

In [17]:
# Matches <br>, <br/>, <br /> and the malformed </br>, in any letter case.
_BR_TAG_RE = re.compile(r"<\s*/?\s*br\s*/?\s*>", re.IGNORECASE)
# A frozenset gives O(1) membership tests in the per-character filter below.
_PUNCTUATION = frozenset(string.punctuation)


def clean(doc, lowercase=True):
    """Normalise one raw document before tokenisation.

    Steps, in order:
      1. Replace every HTML line-break tag variant with a single space, so the
         words on either side of the tag are not glued together.
      2. Delete ASCII punctuation characters and every character for which
         ``str.isdigit()`` is true.
      3. Optionally lowercase the result.

    Lowercasing is done here because a custom ``preprocessor`` passed to a
    scikit-learn vectorizer replaces the vectorizer's own lowercasing step;
    without it, the English stop-word list would not match capitalised words
    and the vocabulary would be split by letter case.

    Parameters
    ----------
    doc : str
        Raw text of a single document.
    lowercase : bool, default True
        Whether to lowercase the cleaned text.

    Returns
    -------
    str
        The cleaned text. Whitespace is left as-is, so removed characters can
        leave consecutive spaces behind.
    """
    doc = _BR_TAG_RE.sub(" ", doc)
    doc = "".join(ch for ch in doc if ch not in _PUNCTUATION and not ch.isdigit())
    return doc.lower() if lowercase else doc

In [23]:
# Naive Bayes pipeline: TF-IDF features (custom cleaning + English stop words)
# feeding a multinomial Naive Bayes classifier. The step names are referenced
# by the '<step>__<param>' keys of the search grid.
tfidf_step = ('TfIdf', TfidfVectorizer(preprocessor=clean, stop_words='english'))
mnb_step = ('MultinomialNB', MultinomialNB())
pipeline_mnb = Pipeline(steps=[tfidf_step, mnb_step])

In [24]:
# Search grid; each key is '<pipeline step name>__<parameter name>'.
# 3 vocabulary sizes x 4 smoothing values = 12 candidates.
param_grid = {
    'TfIdf__max_features': list(range(5000, 7001, 1000)),
    'TfIdf__ngram_range': [(1, 1)],
    'TfIdf__use_idf': [True],
    'MultinomialNB__alpha': [0.01, 0.02, 0.05, 0.1],
}

In [25]:
# 5-fold cross-validated grid search over param_grid, run with 6 parallel jobs
grid_search_mnb = GridSearchCV(
    estimator=pipeline_mnb,
    param_grid=param_grid,
    cv=5,
    n_jobs=6,
    verbose=1,
)

In [None]:
# Run the search on the training split only: 12 candidates x 5 folds = 60 fits.
# This is the expensive cell of the notebook.
grid_search_mnb.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


In [None]:
# Parameter combination selected by the search.
print(grid_search_mnb.best_params_)

In [None]:
# Pipeline configured with the selected parameters.
print(grid_search_mnb.best_estimator_)

In [None]:
# Score on the held-out split. No `scoring` argument was passed to GridSearchCV,
# so this is the estimator's default score (mean accuracy for classifiers).
# NOTE(review): the labels are heavily imbalanced, so accuracy alone can look
# better than the per-class results — check the classification report too.
grid_search_mnb.score(X_test, y_test)

In [None]:
# Predicted labels for the held-out split, produced through the fitted search object.
y_predicted = grid_search_mnb.predict(X_test)

In [None]:
# Text report comparing the held-out labels with the predictions.
# classification_report comes from sklearn.metrics and has to be imported
# for this cell to run.
classification_report_mnb = classification_report(y_test, y_predicted)
print(classification_report_mnb)