# Predictive Modeling - Binary Classification

<i>Author: Jamell Dacon</i>
    
This notebook trains and compares several supervised models for binary classification: Decision Tree (DT), Random Forest (RF), Gradient Boosting (GB), Multi-Layer Perceptron (MLP), Logistic Regression (LGR) and a linear Support Vector Machine (SVM).

In [1]:
# Load the models and packages needed for this notebook
# Importing individual names makes it easy to use them without having to qualify them with the parent package
import math
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Visualization libraries
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

# Silence warnings so that cell output stays readable.
# NOTE(review): filterwarnings('ignore') with no category suppresses every warning,
# not only deprecation warnings; this may also hide warnings raised while fitting
# the models below. Consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

## Binary

### Training Set

In [2]:
# Read the training split from JSON and display its (rows, columns) shape
train = pd.read_json(path_or_buf="data/toxic_train.json")
train.shape

(7944, 2)

In [3]:
# Count how many training rows carry each value of the label column
train_label = train.loc[:, 'label'].value_counts()
train_label

1    5967
0    1977
Name: label, dtype: int64

### Testing Set

In [4]:
# Read the test split from JSON and display its (rows, columns) shape
test = pd.read_json(path_or_buf="data/toxic_test.json")
test.shape

(1986, 2)

In [5]:
# Count how many test rows carry each value of the label column
test_label = test.loc[:, 'label'].value_counts()
test_label

1    1492
0     494
Name: label, dtype: int64

### Train/Test Splits

In [6]:
# Features are the raw 'text' column and targets are the 'label' column,
# taken separately from the already-separated training and test frames.
X_train, y_train = train['text'], train['label']
X_test, y_test = test['text'], test['label']

### Decision Tree

In [7]:
# Decision tree: TF-IDF features feeding a seeded DecisionTreeClassifier
dt = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', DecisionTreeClassifier(random_state=42)),
])
dt_model = dt.fit(X_train, y_train)

# Predict labels for the test texts
dt_preds = dt_model.predict(X_test)

# Per-class precision / recall / F1 of the decision tree model
print(classification_report(y_test, dt_preds))

# Confusion matrix of the decision tree model (rows: true label, columns: predicted label)
print('\nConfusion Matrix:\n', confusion_matrix(y_true=y_test, y_pred=dt_preds))

              precision    recall  f1-score   support

           0       0.47      0.45      0.46       494
           1       0.82      0.83      0.82      1492

    accuracy                           0.74      1986
   macro avg       0.64      0.64      0.64      1986
weighted avg       0.73      0.74      0.73      1986


Confusion Matrix:
 [[ 224  270]
 [ 255 1237]]


### Random Forest Classifier model

In [8]:
# Random forest: TF-IDF features feeding a seeded RandomForestClassifier
rf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42)),
])
rf_model = rf.fit(X_train, y_train)

# Predict labels for the test texts
rf_preds = rf_model.predict(X_test)

# Per-class precision / recall / F1 of the random forest model
print(classification_report(y_test, rf_preds))

# Confusion matrix of the random forest model (rows: true label, columns: predicted label)
print('\nConfusion Matrix:\n', confusion_matrix(y_true=y_test, y_pred=rf_preds))

              precision    recall  f1-score   support

           0       0.61      0.36      0.45       494
           1       0.81      0.92      0.87      1492

    accuracy                           0.78      1986
   macro avg       0.71      0.64      0.66      1986
weighted avg       0.76      0.78      0.76      1986


Confusion Matrix:
 [[ 176  318]
 [ 112 1380]]


### Gradient Boosting Classifier model

In [9]:
# Gradient boosting: TF-IDF features feeding a seeded GradientBoostingClassifier
gb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', GradientBoostingClassifier(random_state=42)),
])
gb_model = gb.fit(X_train, y_train)

# Predict labels for the test texts
gb_preds = gb_model.predict(X_test)

# Per-class precision / recall / F1 of the gradient boosting model
print(classification_report(y_test, gb_preds))

# Confusion matrix of the gradient boosting model (rows: true label, columns: predicted label)
print('\nConfusion Matrix:\n', confusion_matrix(y_true=y_test, y_pred=gb_preds))

              precision    recall  f1-score   support

           0       0.75      0.31      0.44       494
           1       0.81      0.97      0.88      1492

    accuracy                           0.80      1986
   macro avg       0.78      0.64      0.66      1986
weighted avg       0.80      0.80      0.77      1986


Confusion Matrix:
 [[ 155  339]
 [  51 1441]]


### MLP Classifier model

In [10]:
# Multi-layer perceptron: TF-IDF features feeding an MLP with three hidden
# layers of 15 units each, trained for at most 1000 iterations.
# random_state is fixed, as for the other models in this notebook, so that the
# network's random initialisation is reproducible on Restart & Run All.
mlp = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MLPClassifier(hidden_layer_sizes = (15,15,15), max_iter = 1000, random_state = 42)),
])
# fit() returns the fitted pipeline itself; keep a *_model handle like the other cells
mlp_model = mlp.fit(X_train, y_train)

# Model Prediction
mlp_preds = mlp_model.predict(X_test)

# Print the classification report of the MLP model
print(classification_report(y_true = y_test, y_pred = mlp_preds))

# Print the confusion matrix of the MLP model (rows: true label, columns: predicted label)
print('\nConfusion Matrix:\n', confusion_matrix(y_test, mlp_preds))

              precision    recall  f1-score   support

           0       0.63      0.59      0.61       494
           1       0.87      0.89      0.88      1492

    accuracy                           0.81      1986
   macro avg       0.75      0.74      0.75      1986
weighted avg       0.81      0.81      0.81      1986


Confusion Matrix:
 [[ 293  201]
 [ 169 1323]]


### Logistic Regression model

In [11]:
# Logistic regression: TF-IDF features feeding a seeded LogisticRegression
lgr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=42)),
])
lgr_model = lgr.fit(X_train, y_train)

# Predict labels for the test texts (lgr_model is the same fitted pipeline as lgr)
lgr_preds = lgr_model.predict(X_test)

# Per-class precision / recall / F1 of the logistic regression model
print(classification_report(y_test, lgr_preds))

# Confusion matrix of the logistic regression model (rows: true label, columns: predicted label)
print('\nConfusion Matrix:\n', confusion_matrix(y_true=y_test, y_pred=lgr_preds))

              precision    recall  f1-score   support

           0       0.82      0.49      0.61       494
           1       0.85      0.97      0.90      1492

    accuracy                           0.85      1986
   macro avg       0.84      0.73      0.76      1986
weighted avg       0.84      0.85      0.83      1986


Confusion Matrix:
 [[ 240  254]
 [  52 1440]]


### Support Vector Machine

In [12]:
# Linear support vector machine: TF-IDF features feeding a seeded LinearSVC
svm = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])
svm_model = svm.fit(X_train, y_train)

# Predict labels for the test texts (svm_model is the same fitted pipeline as svm)
svm_preds = svm_model.predict(X_test)

# Per-class precision / recall / F1 of the linear SVM model
print(classification_report(y_test, svm_preds))

# Confusion matrix of the linear SVM model (rows: true label, columns: predicted label)
print('\nConfusion Matrix:\n', confusion_matrix(y_true=y_test, y_pred=svm_preds))

              precision    recall  f1-score   support

           0       0.74      0.61      0.67       494
           1       0.88      0.93      0.90      1492

    accuracy                           0.85      1986
   macro avg       0.81      0.77      0.79      1986
weighted avg       0.84      0.85      0.84      1986


Confusion Matrix:
 [[ 303  191]
 [ 109 1383]]
