# import packages

In [2]:
import os
import pickle
from datetime import datetime
from time import time

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Text representation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# ML Algo
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [15]:
import string
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

In [19]:
from scripts.utils import plotConfusionMatrixHeatmap

# Load Training file

In [3]:
data_path = '../../data/train.csv'

In [4]:
df = pd.read_csv(data_path)

In [5]:
df.shape

(405735, 4)

In [6]:
df.columns

Index(['Unnamed: 0', 'index', 'full_text', 'label'], dtype='object')

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,full_text,label
0,0,1,Incorrect information on your report-Informati...,Credit reporting
1,1,2,"Over the past 2 weeks, I have been receiving e...",Debt collection
2,2,3,Pioneer has committed several federal violatio...,Debt collection
3,3,8,"Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/...",Credit reporting
4,4,11,Closing your account-Company closed your account,Credit card or prepaid card


In [8]:
df.drop(columns=['Unnamed: 0', 'index'], inplace=True)

In [9]:
df.columns

Index(['full_text', 'label'], dtype='object')

In [10]:
df.head()

Unnamed: 0,full_text,label
0,Incorrect information on your report-Informati...,Credit reporting
1,"Over the past 2 weeks, I have been receiving e...",Debt collection
2,Pioneer has committed several federal violatio...,Debt collection
3,"Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/...",Credit reporting
4,Closing your account-Company closed your account,Credit card or prepaid card


In [12]:
df['label'].value_counts()

Credit reporting                 144663
Debt collection                   86653
Mortgage                          49406
Credit card or prepaid card       45740
Loans                             40921
Account service                   29044
Money transfer, VC and Others      9308
Name: label, dtype: int64

In [11]:
# Share of each label in the data set, expressed as a percentage of all rows
df['label'].value_counts() / len(df) * 100

Credit reporting                 35.654553
Debt collection                  21.357043
Mortgage                         12.176914
Credit card or prepaid card      11.273368
Loans                            10.085647
Account service                   7.158367
Money transfer, VC and Others     2.294108
Name: label, dtype: float64

# Pre Processing

In [13]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _clean_text_resources():
    """Build the lookup sets and the lemmatizer once and reuse them for every document."""
    stop_words = frozenset(stopwords.words('english'))
    redacted = frozenset({'xxxx', 'xx', 'xx/xx/xxxx'})
    punctuation = frozenset(string.punctuation)
    return stop_words, redacted, punctuation, WordNetLemmatizer()


def clean_text(doc):
    """
    Normalise one raw document into a space-joined string of lemmatized tokens.

    Steps applied to every token produced by ``word_tokenize``:
      1. Convert to lower case
      2. Drop redaction placeholders ('xx', 'xxxx', 'xx/xx/xxxx')
      3. Drop English stop words
      4. Drop tokens made up only of punctuation characters
      5. Drop tokens made up only of digits
      6. Lemmatize with the WordNet lemmatizer

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces
        (an empty string when nothing survives).
    """
    stop_words, redacted, punctuation, lemmatizer = _clean_text_resources()

    kept = []
    for token in word_tokenize(doc):
        token = token.lower()
        if token in redacted or token in stop_words:
            continue
        # Test character by character: a substring test against string.punctuation
        # lets multi-character tokens such as "..." or "``" slip through.
        if all(char in punctuation for char in token):
            continue
        if token.isdigit():
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)

In [16]:
# Clean every document. Applying to the single column avoids materialising a row object
# per record, which DataFrame.apply(axis=1) does, and yields the same values.
df['text_processed'] = df['full_text'].apply(clean_text)

In [17]:
df.head()

Unnamed: 0,full_text,label,text_processed
0,Incorrect information on your report-Informati...,Credit reporting,incorrect information report-information belon...
1,"Over the past 2 weeks, I have been receiving e...",Debt collection,past week receiving excessive amount telephone...
2,Pioneer has committed several federal violatio...,Debt collection,pioneer committed several federal violation pr...
3,"Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/...",Credit reporting,previously requested experian send copy verifi...
4,Closing your account-Company closed your account,Credit card or prepaid card,closing account-company closed account


# Encoding and Modeling

In [20]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [21]:
df['label_id'] = label_encoder.fit_transform(df['label'])

In [22]:
# Map each encoded label id back to its category name for later reporting.
# Select the 'label' column before converting so that the long text columns
# are not turned into Python dicts as well; the resulting mapping is unchanged.
label_map = df.set_index('label_id')['label'].to_dict()
label_map

{2: 'Credit reporting',
 3: 'Debt collection',
 1: 'Credit card or prepaid card',
 0: 'Account service',
 6: 'Mortgage',
 4: 'Loans',
 5: 'Money transfer, VC and Others'}

In [23]:
df.head()

Unnamed: 0,full_text,label,text_processed,label_id
0,Incorrect information on your report-Informati...,Credit reporting,incorrect information report-information belon...,2
1,"Over the past 2 weeks, I have been receiving e...",Debt collection,past week receiving excessive amount telephone...,3
2,Pioneer has committed several federal violatio...,Debt collection,pioneer committed several federal violation pr...,3
3,"Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/...",Credit reporting,previously requested experian send copy verifi...,2
4,Closing your account-Company closed your account,Credit card or prepaid card,closing account-company closed account,1


In [24]:
X = df.text_processed
y = df.label_id

In [25]:
print(X.shape, y.shape)

(405735,) (405735,)


In [26]:
# Hold out a test set. No test_size is passed; the shapes printed in the next cell
# show a 75% / 25% train/test division. random_state=1 pins the shuffle so the
# split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [27]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(304301,) (304301,)
(101434,) (101434,)


# Multinomial Naive bayes

In [28]:
# Hyper-parameter grid for the TF-IDF + Multinomial Naive Bayes search.
# Each key is written as "<pipeline step name>__<parameter name>".
param_grid = {
    'TfIdf__max_features': [5000, 10000, 20000, 25000],
    'TfIdf__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'TfIdf__use_idf': [True],
    'MultinomialNB__alpha': [0.01, 0.02, 0.05, 0.10],
}

In [29]:
# Naive Bayes pipeline: TF-IDF features feeding a multinomial NB classifier.
# The step names are the prefixes used by the keys of the parameter grid.
pipeline_mnb = Pipeline(
    steps=[
        ('TfIdf', TfidfVectorizer()),
        ('MultinomialNB', MultinomialNB()),
    ]
)

In [30]:
grid_search_mnb = GridSearchCV(pipeline_mnb, param_grid, cv=5, verbose=1, n_jobs=6)

In [None]:
grid_search_mnb.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed: 19.1min


In [None]:
print(grid_search_mnb.best_params_)

In [None]:
print(grid_search_mnb.best_estimator_)

In [None]:
grid_search_mnb.score(X_test, y_test)

In [None]:
y_predicted = grid_search_mnb.predict(X_test)

In [None]:
classification_report_mnb = classification_report(y_test, y_predicted)
print(classification_report_mnb)

In [None]:
# Category names ordered by their encoded id, used to label the confusion-matrix axes
key_to_label_name = [label_map[label_id] for label_id in sorted(label_map)]

In [None]:
conf_matrix_df = pd.DataFrame(data=confusion_matrix(y_test, y_predicted),
                              index=key_to_label_name, columns=key_to_label_name)

plotConfusionMatrixHeatmap(conf_matrix_df, model_name='Multinomial Naive bayes', figsize=(12, 10))

In [None]:
# Attach build metadata to the fitted grid search and persist it with pickle.
best_model = grid_search_mnb
best_model.version = 1.0
best_model.pandas_version = pd.__version__
best_model.numpy_version = np.__version__
best_model.sklearn_version = sklearn.__version__
best_model.build_datetime = datetime.now()

modelpath = '../../data/models'
# makedirs also creates missing parent directories and tolerates an existing one
os.makedirs(modelpath, exist_ok=True)
mnbmodel_path = os.path.join(modelpath, 'Multinomial_naive_bayes_with_7_class.pkl')
# An already saved model is never overwritten; say so instead of skipping silently
if not os.path.exists(mnbmodel_path):
    with open(mnbmodel_path, 'wb') as f:
        pickle.dump(best_model, f)
else:
    print(f"{mnbmodel_path} already exists; model not saved")

# Logistic Regression

In [None]:
# Hyper-parameter grid for the TF-IDF + Logistic Regression search.
# It is bound to `param_grid_lr`, the name the logistic-regression GridSearchCV cell
# reads, and it no longer overwrites the Naive Bayes `param_grid`.
param_grid_lr = {
    'TfIdf__max_features': [5000, 10000, 20000, 25000],
    'TfIdf__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'TfIdf__use_idf': [True],
}

In [None]:
# Logistic Regression pipeline: TF-IDF features feeding a class-weight-balanced classifier
pipeline_lr = Pipeline(
    steps=[
        ('TfIdf', TfidfVectorizer()),
        ('LogisticRegression', LogisticRegression(class_weight="balanced")),
    ]
)

In [None]:
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, verbose=1, n_jobs=-1)

In [None]:
grid_search_lr.fit(X_train, y_train)

In [None]:
print(grid_search_lr.best_params_)

In [None]:
print(grid_search_lr.best_estimator_)

In [None]:
grid_search_lr.score(X_test, y_test)

In [None]:
y_predicted = grid_search_lr.predict(X_test)

In [None]:
classification_report_lr = classification_report(y_test, y_predicted)
print(classification_report_lr)

In [None]:
# Confusion matrix of the logistic-regression predictions, with rows and columns
# labelled by category name.
conf_matrix_df = pd.DataFrame(data=confusion_matrix(y_test, y_predicted),
                              index=key_to_label_name, columns=key_to_label_name)

# Label the plot with the model that produced y_predicted in this section
plotConfusionMatrixHeatmap(conf_matrix_df, model_name='Logistic Regression', figsize=(12, 10))

# Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = TfidfVectorizer(min_df=3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))

In [None]:
# Random-forest pipeline: text vectorizer followed by a default RandomForestClassifier.
# NOTE(review): the first step is named 'countvectorizer', but `vectorizer` is built
# as a TfidfVectorizer in the preceding cell — consider renaming the step.
pipeline_rf = Pipeline(steps = [('countvectorizer', vectorizer),
                              ('clf', RandomForestClassifier())])

In [None]:
model = pipeline_rf.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred_prob = model.predict_proba(X_test)

In [None]:
# Keep only the second column of the predicted-probability matrix.
# NOTE(review): y_pred_prob comes from the random-forest pipeline and the target has
# seven classes, so this is the probability of a single class only; the `lr_` prefix
# looks like a leftover from a logistic-regression example — confirm the intent.
lr_probs = y_pred_prob[:,1]

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
conf_matrix_df = pd.DataFrame(data=confusion_matrix(y_test, y_pred),index=key_to_label_name,
                              columns=key_to_label_name)

In [None]:
classification_rep = classification_report(y_test, y_pred,target_names=key_to_label_name)
print(classification_rep)

In [None]:
plotConfusionMatrixHeatmap(conf_matrix_df, model_name='Random forest', figsize=(12, 10))

# Doc 2 Vec with logistic regression

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Prepare training data in doc2vec format. X_train holds whitespace-joined cleaned text,
# so split each document back into tokens: TaggedDocument expects a list of words, and a
# bare string would be consumed one character at a time.
train_doc2vec = [TaggedDocument(words=d.split(), tags=[str(i)]) for i, d in enumerate(X_train)]

In [None]:
# Train a Doc2Vec model on the tagged training documents and save it to disk.
# NOTE(review): this rebinds `model`, the name the random-forest cells also use.
model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm =1, epochs=100)
# The vocabulary must be built from the corpus before training starts
model.build_vocab(train_doc2vec)
# Reuse the corpus size and epoch count recorded on the model itself
model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)
model.save("../../data/models/d2v.model")
print("Model Saved")

In [None]:
#Infer the feature representation for training and test data using the trained model
model= Doc2Vec.load("../../data/models/d2v.model")

In [None]:
# Infer document vectors for both splits, using 50 inference epochs for a more stable
# representation. The texts are whitespace-joined strings, so split them into token
# lists first (infer_vector expects a list of words). `epochs` is the current name of
# the argument that older gensim releases called `steps`.
train_vectors = [model.infer_vector(text.split(), epochs=50) for text in X_train]
test_vectors = [model.infer_vector(text.split(), epochs=50) for text in X_test]

In [None]:
clf = LogisticRegression(class_weight="balanced")
clf.fit(train_vectors, y_train)