#### Importing packages

In [3]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from scripts.utils import tokenize_sentence, lemmatize_sentence, save_as_pickle, plotConfusionMatrixHeatmap

# Preprocessing & Model Preparation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Performance Evaluation
from sklearn.metrics import confusion_matrix, classification_report

#### Loading data set

In [4]:
# Read the raw support-ticket CSV export into a DataFrame
file_location = '../../data/support_ticket.csv'
tickets_df = pd.read_csv(filepath_or_buffer=file_location)

In [5]:
# Dimensions of the raw tickets DataFrame as (rows, columns)
tickets_df.shape

(579977, 12)

In [7]:
# Column labels present in the raw tickets DataFrame
tickets_df.columns

Index(['Unnamed: 0', 'full_text', 'submitted_via', 'sub_label', 'label',
       'word_count', 'unique_word_count', 'stop_word_count', 'url_count',
       'mean_word_length', 'char_count', 'punctuation_count'],
      dtype='object')

#### Preprocessing the data

In [None]:
# Tokenize the ful_text column
tickets_df['full_text_tokenized'] = tickets_df.apply(lambda row : tokenize_sentence(row['full_text']), axis = 1)

In [None]:
# Lemmatize the tokenized column
tickets_df['full_text_lemmatized'] = tickets_df.apply(lambda row : lemmatize_sentence(row['full_text_tokenized']), axis = 1)

#### Save processed data onto pickle file

In [None]:
# Persist the preprocessed DataFrame (including the tokenized and lemmatized
# columns added above) to a pickle file for later processing.
# NOTE(review): the arguments look like (file name, directory, object) —
# confirm against scripts.utils.save_as_pickle.
save_as_pickle('data_processed.pkl', '../../data/', tickets_df)

#### Load Pickle file

In [None]:
# Loading PKL file as pandas dataframe
df = pd.read_pickle(os.path.join('../../data/', 'data_processed.pkl'))

In [None]:
df.shape

#### Encoding the label

In [None]:
# Encodeing label into numerical category
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [None]:
df['label_id'] = label_encoder.fit_transform(df['label']

In [None]:
# Put the label category into dict for future use
label_map = df.set_index('label_id').to_dict()['label']
label_map

#### Split data into validation/train/test

In [None]:
validation_data = df.sample(frac=0.20)

In [None]:
df.drop(validation_data.index, inplace=True)

In [None]:
print(validation_data.shape)

In [None]:
print(df.shape)

In [None]:
print(df.columns)

In [None]:
# Select X & y data from dataframe
X = df['full_text_lemmatized']
y = df['label_id']

In [None]:
# Split train test data with 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Check how many observation on each categories
y_train.groupby(y_train).count()

#### Visualize the train test data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Plot the observation to visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 8))
sns.barplot(x=sorted(y_train.unique()), y=y_train.groupby(y_train).count(), ax=ax1).set_title('Number of Data - Training Set')
sns.barplot(x=sorted(y_test.unique()), y=y_test.groupby(y_test).count(), ax=ax2).set_title('Number of data - Test Set')
fig.tight_layout()
plt.show()

#### Method 1: Multinomial Naive Bayes Model

In [None]:
# https://towardsdatascience.com/multinomial-naive-bayes-classifier-for-text-analysis-python-8dd6825ece67
# https://www.freecodecamp.org/news/how-to-extract-keywords-from-text-with-tf-idf-and-pythons-scikit-learn-b2a0f3d7e667/
# https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Creating pipeline for Naive Bayes Model
pipeline_mnb = Pipeline(steps = [('TfIdf', TfidfVectorizer()),
                              ('MultinomialNB', MultinomialNB())])

In [None]:
# Parameter values to test
param_grid = {
 'TfIdf__max_features' : [5000, 6000, 7000],
 'TfIdf__ngram_range' : [(1,1)],
 'TfIdf__use_idf' : [True],
 'MultinomialNB__alpha' : [0.01, 0.02, 0.05, 0.10]
}

In [None]:
grid_search_mnb = GridSearchCV(pipeline_mnb, param_grid, cv=5, verbose=1, n_jobs=6)

In [None]:
grid_search_mnb.fit(X_train, y_train)

In [None]:
print(grid_search_mnb.best_params_)

In [None]:
print(grid_search_mnb.best_estimator_)

In [None]:
grid_search_mnb.score(X_test, y_test)

In [None]:
predicted = grid_search_mnb.predict(X)
df['Predicted_Category_MNB'] = predicted

In [None]:
y_predicted = grid_search_mnb.predict(X_test)

In [None]:
key_to_label_name = [x[1] for x in sorted(label_map.items())]

In [None]:
classification_report_mnb = classification_report(y_test, y_predicted, target_names=key_to_label_name)
print(classification_report_mnb)

In [None]:
conf_matrix_df = pd.DataFrame(data=confusion_matrix(y_test, y_predicted),
                              index=key_to_label_name, columns=key_to_label_name)

plotConfusionMatrixHeatmap(conf_matrix_df, model_name='Multinomial Naive Bayes', figsize=(12, 10))

#### Method 2 : Linear support vector machine with SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# https://michael-fuchs-python.netlify.app/2019/11/11/introduction-to-sgd-classifier/
pipeline_lsvm = Pipeline(steps= [('TfIdf', TfidfVectorizer()),
                                 ('SGDC', SGDClassifier(verbose=1, random_state=42))])

In [None]:
# Parameter values to test
param_grid = {
 'TfIdf__max_features' : [None, 200, 300, 400],
 'TfIdf__ngram_range' : [(1,1)],
 'TfIdf__use_idf' : [True],
 'SGDC__loss' : ['hinge'],
 'SGDC__alpha' : [0.001, 0.01, 0.05, 0.1]
}

In [None]:
grid_search_svc = GridSearchCV(pipeline_lsvm, param_grid, cv=10, verbose=1, n_jobs=6)

In [None]:
print(grid_search_svc.best_params_)

In [None]:
# Check the score on the training and test sets
grid_search_svc.score(X_test, y_test)

In [None]:
predicted = grid_search_svc.predict(X)
df['Predicted_Category_LSVM'] = predicted

In [None]:
y_predicted = grid_search_svc.predict(X_test)

In [None]:
conf_matrix_df = pd.DataFrame(data=confusion_matrix(y_test, y_predicted),index=key_to_label_name,
                              columns=key_to_label_name)

In [None]:
classification_rep = classification_report(y_test, y_predicted,target_names=key_to_label_name)
print(classification_rep)

In [None]:
conf_matrix_df = pd.DataFrame(data=confusion_matrix(y_test, y_predicted),
                              index=key_to_label_name, columns=key_to_label_name)

plotConfusionMatrixHeatmap(conf_matrix_df, model_name='Linear SVM', figsize=(12, 10))

#### Optimize Label Categories

In [None]:
df_optimize = pd.read_pickle(os.path.join('../../data/', 'data_processed.pkl'))

In [None]:
df_optimize['label'].value_counts()

In [None]:
# Maps fine-grained labels onto the broader category they are merged into.
# 'Credit card' and 'Prepaid card' must map to the exact same string,
# otherwise they end up as two separate classes after the merge.
label_category_map = {'Credit reporting, credit repair services, or other personal consumer reports': 'Credit reporting',
                      'Credit card': 'Credit card or prepaid card',
                      'Payday loan': 'Payday loan, title loan, or personal loan',
                      'Prepaid card': 'Credit card or prepaid card',
                      'Money transfers': 'Money transfer, virtual currency, or money service',
                      'Virtual currency': 'Money transfer, virtual currency, or money service'}

In [None]:
df_optimize['label'].replace(label_category_map, inplace=True)

In [None]:
df_optimize['label'].value_counts()

In [None]:
validation_data_optimize = df_optimize.sample(frac=0.20)

In [None]:
# Select X & y data from dataframe
X = df_optimize['full_text_lemmatized']
y = df_optimize['label_id']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#### Method 3 : Multinomial Naive Bayes Model with optimized categories

In [None]:
param_grid = {
 'TfIdf__max_features' : [5000, 6000, 7000],
 'TfIdf__ngram_range' : [(1,1)],
 'TfIdf__use_idf' : [True],
 'MultinomialNB__alpha' : [0.01, 0.02, 0.05, 0.10]
}

In [None]:
grid_search_mnb = GridSearchCV(pipeline_mnb, param_grid, cv=5, verbose=1, n_jobs=6)

In [None]:
grid_search_mnb.fit(X_train, y_train)

In [None]:
print(grid_search_mnb.best_params_)

In [None]:
print(grid_search_mnb.best_estimator_)

In [None]:
grid_search_mnb.score(X_test, y_test)

In [None]:
predicted = grid_search_mnb.predict(X)
df_optimize['Predicted_Category_MNB'] = predicted

In [None]:
y_predicted = grid_search_mnb.predict(X_test)

In [None]:
key_to_label_name = [x[1] for x in sorted(label_map.items())]

In [None]:
classification_report_mnb = classification_report(y_test, y_predicted, target_names=key_to_label_name)
print(classification_report_mnb)

In [None]:
conf_matrix_df = pd.DataFrame(data=confusion_matrix(y_test, y_predicted),
                              index=key_to_label_name, columns=key_to_label_name)

plotConfusionMatrixHeatmap(conf_matrix_df, model_name='Multinomial Naive Bayes', figsize=(12, 10))

#### Method 4 : Linear support vector machine with SGDClassifier with optimized categories

In [None]:
param_grid = {
 'TfIdf__max_features' : [None, 200, 300, 400],
 'TfIdf__ngram_range' : [(1,1)],
 'TfIdf__use_idf' : [True],
 'SGDC__loss' : ['hinge'],
 'SGDC__alpha' : [0.001, 0.01, 0.05, 0.1]
}

In [None]:
grid_search_svc = GridSearchCV(pipeline_lsvm, param_grid, cv=10, verbose=1, n_jobs=6)

In [None]:
grid_search_svc.fit(X_train, y_train)

In [None]:
print(grid_search_svc.best_params_)

In [None]:
# Check the score on the training and test sets
grid_search_svc.score(X_test, y_test)

In [None]:
predicted = grid_search_svc.predict(X)
df_optimize['Predicted_Category_LSVM'] = predicted

In [None]:
y_predicted = grid_search_svc.predict(X_test)

In [None]:
conf_matrix_df = pd.DataFrame(data=confusion_matrix(y_test, y_predicted),index=key_to_label_name,
                              columns=key_to_label_name)

In [None]:
classification_rep = classification_report(y_test, y_predicted,target_names=key_to_label_name)
print(classification_rep)

In [None]:
conf_matrix_df = pd.DataFrame(data=confusion_matrix(y_test, y_predicted),
                              index=key_to_label_name, columns=key_to_label_name)

plotConfusionMatrixHeatmap(conf_matrix_df, model_name='Linear SVM', figsize=(12, 10))