# Imports


In [210]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from statistics import mean

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Cleaning

We go through a proccess of removing non-words and stopwords, lemmatizing, and filling in missing values.

### Getting Rid Of Missing Values and Filling in Missing Values

In [211]:
df = pd.read_csv('finaldata_label.csv')

In [212]:
df.isnull().sum()

Unnamed: 0           0
Unnamed: 0.1         0
date                 0
user                 0
content              0
id                   0
user_location    10031
keyword              0
username             0
verified         33573
duplicate            0
relevant             0
dtype: int64

In [213]:
df.verified = df.verified.fillna(value=0)

In [214]:
df.isnull().sum()

Unnamed: 0           0
Unnamed: 0.1         0
date                 0
user                 0
content              0
id                   0
user_location    10031
keyword              0
username             0
verified             0
duplicate            0
relevant             0
dtype: int64

In [215]:
df.user_location = df.user_location.fillna(value = 'CA')

### Lemmetizing, Stopwords and Non-Words 

We clean tweets by removing non words, lemmatizing and deleting stopwords. This was done inorder to over all improve model fitting and increase metrics. Removing such words significantly increased model performance.

In [216]:
def remove_non_words(text):
    return re.sub(r'\s+', ' ', re.sub('[^A-Za-z]', ' ', text.strip().lower())).strip()
    # cleaning code from https://www.kaggle.com/shyambhu/sentiment-classification-using-lstm
    
def lemmatize(text):
    text_lst = []
    wordnet_lemmatizer = WordNetLemmatizer()
    sentence_words = nltk.word_tokenize(text)
    punctuation = '?:!.,;'
    for word in sentence_words:
        if word in punctuation:
            sentence_words.remove(word)
    text_lst = [wordnet_lemmatizer.lemmatize(word) for word in sentence_words]
    return " ".join(text_lst)


def delete_stopwords(words):
    sentence_words = nltk.word_tokenize(words)
    stops = stopwords.words('english')
    new_phrase = []
    if len(sentence_words) > 1:
        for x in sentence_words:
            if x.lower() not in stops:
                new_phrase.append(x)
        return " ".join(new_phrase)
    else:
        return words

In [217]:
df.content = df.content.map(remove_non_words)
df.content = df.content.map(lemmatize)
df.content = df.content.map(delete_stopwords)

# Preproccessing and Modeling 

We define target column, $Y$ as the labels 'relavent' and 'non-relavent' and $X$.

In [218]:
X=df['content']

In [219]:
y = df['relevant']

In [220]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state=42)

## Multinomial Naive Bayes: TD-IDF
We use the Multinomial Naive Bayes method:

$$\log p(C_k) + \sum_{i=1}^n x_i \cdot \log p_{k_i}$$

to model our training data.

We create a pipline and dictionary of parameters we want to frid search.

In [221]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [222]:
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring='accuracy')
clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf', MultinomialNB())]),
             param_grid={'clf__alpha': [1, 0.1, 0.01],
                         'tfidf__norm': ('l1', 'l2'),
                         'tfidf__use_idf': (True, False),
                         'vect__ngram_range': [(1, 1), (1, 2), (2, 2)]},
             scoring='accuracy')

Create a classification report and print out best hyperparameters and estimators.

In [223]:
report_test = classification_report(y_test, clf.predict(X_test), digits=4, output_dict=True)

In [224]:
clf.best_score_, clf.best_estimator_, clf.best_params_

(0.934631288766368,
 Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 2))),
                 ('tfidf', TfidfTransformer(use_idf=False)),
                 ('clf', MultinomialNB(alpha=0.1))]),
 {'clf__alpha': 0.1,
  'tfidf__norm': 'l2',
  'tfidf__use_idf': False,
  'vect__ngram_range': (1, 2)})

### Create a dataframe from the classification report and color the table for test and train data.

In [225]:
classification_report_df_test = pd.DataFrame(report_test).T

Train Data: Model performed overall greater than $80$ for all metrics percision, recall and accuracy.

In [226]:
classification_report_df_test['support'] = classification_report_df_test.support.apply(int)

classification_report_df_test.style.background_gradient(cmap='coolwarm',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])


Unnamed: 0,precision,recall,f1-score,support
0,0.949448,0.965952,0.957629,16741
1,0.879569,0.828623,0.853336,5024
accuracy,0.934252,0.934252,0.934252,0
macro avg,0.914509,0.897287,0.905482,21765
weighted avg,0.933318,0.934252,0.933555,21765


Train Data: Model performed overall greater than $90$ for all metrics percision, recall and accuracy. Fitted the data over pretty well.

In [227]:
report_train = classification_report(y_train, clf.predict(X_train), digits=4, output_dict=True)

In [228]:
classification_report_df_train = pd.DataFrame(report_train).T

In [229]:
classification_report_df_train['support'] = classification_report_df_train.support.apply(int)

classification_report_df_train.style.background_gradient(cmap='coolwarm',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])


Unnamed: 0,precision,recall,f1-score,support
0,0.994118,0.992884,0.9935,66892
1,0.976495,0.980514,0.9785,20168
accuracy,0.990018,0.990018,0.990018,0
macro avg,0.985306,0.986699,0.986,87060
weighted avg,0.990035,0.990018,0.990026,87060


In [230]:
clf.best_score_

0.934631288766368

## Random Forest Classifier: TDF-IDF

We chose to do a random forest classifier becuase on classification data, these model yield a high accuracy and pecision score.

In [231]:
text_rfc = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('rfc', RandomForestClassifier())])
tuned_parameters = {
    'vect__ngram_range': [(1, 1)],
    'tfidf__use_idf': [True],
    'tfidf__norm': ['l1'],
    'rfc__n_estimators': [200],
    'rfc__max_features': ['auto']
}

In [232]:
rfc = GridSearchCV(estimator = text_rfc, param_grid = tuned_parameters, cv=2, scoring='accuracy',return_train_score=True)
rfc.fit(X_train, y_train)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('rfc', RandomForestClassifier())]),
             param_grid={'rfc__max_features': ['auto'],
                         'rfc__n_estimators': [200], 'tfidf__norm': ['l1'],
                         'tfidf__use_idf': [True],
                         'vect__ngram_range': [(1, 1)]},
             return_train_score=True, scoring='accuracy')

### Create a dataframe from the classification report and color the table for test and train data.

Test Data: 

Performed overall greater than $90$ on accuracy and percision and less than $90$ with recall.

In [233]:
report_test_rfc = classification_report(y_test, rfc.predict(X_test), digits=4, output_dict=True)

In [234]:
classification_report_rfc_df_test = pd.DataFrame(report_test_rfc).T

In [235]:
classification_report_rfc_df_test['support'] = classification_report_rfc_df_test.support.apply(int)

classification_report_rfc_df_test.style.background_gradient(cmap='coolwarm',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])


Unnamed: 0,precision,recall,f1-score,support
0,0.955103,0.991159,0.972797,16741
1,0.966302,0.844745,0.901444,5024
accuracy,0.957363,0.957363,0.957363,0
macro avg,0.960703,0.917952,0.937121,21765
weighted avg,0.957688,0.957363,0.956327,21765


In [236]:
report_train_rfc = classification_report(y_train, rfc.predict(X_train), digits=4, output_dict=True)

Train Data: 

Fitted data nearly perfectly on all metrics! Score tells us there was slight overfitting but over still modeled new data really well. 

In [237]:
classification_report_rfc_df_train = pd.DataFrame(report_train_rfc).T

In [238]:
classification_report_rfc_df_train['support'] = classification_report_rfc_df_train.support.apply(int)

classification_report_rfc_df_train.style.background_gradient(cmap='coolwarm',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])


Unnamed: 0,precision,recall,f1-score,support
0,0.999955,1.0,0.999978,66892
1,1.0,0.999851,0.999926,20168
accuracy,0.999966,0.999966,0.999966,0
macro avg,0.999978,0.999926,0.999952,87060
weighted avg,0.999966,0.999966,0.999966,87060


## Logistic Regression: TD-IDF

We use a Logistic Regression  method:

$$P= \frac{e^{a+b X}}{1+e^{a+bX}}$$

to model our training data using TD-IDF Vecotizer.

In [239]:
text_lr= Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('lr', LogisticRegression())])
tuned_parameters = {
    'vect__ngram_range': [(1, 1)],
    'tfidf__use_idf': [True],
    'tfidf__norm': ('l1', 'l2'),
    'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'lr__penalty': ['l1','l2'] }

In [240]:
lr = GridSearchCV(text_lr, tuned_parameters, cv=5, scoring='accuracy', return_train_score=True)
lr.fit(X_train, y_train)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-pac

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('lr', LogisticRegression())]),
             param_grid={'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'lr__penalty': ['l1', 'l2'],
                         'tfidf__norm': ('l1', 'l2'), 'tfidf__use_idf': [True],
                         'vect__ngram_range': [(1, 1)]},
             return_train_score=True, scoring='accuracy')

### Create a dataframe from the classification report and color the table for test and train data.

Test Data:

Performed overall greater than $90$ on accuracy and percision and less than $90$ with recall.

In [241]:
report_test_lr = classification_report(y_test, lr.predict(X_test), digits=4, output_dict=True)

In [242]:
classification_report_lr_df_test = pd.DataFrame(report_test_lr).T

In [243]:
classification_report_lr_df_test['support'] = classification_report_lr_df_test.support.apply(int)

classification_report_lr_df_test.style.background_gradient(cmap='coolwarm',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])


Unnamed: 0,precision,recall,f1-score,support
0,0.965772,0.98429,0.974943,16741
1,0.944078,0.883758,0.912923,5024
accuracy,0.961084,0.961084,0.961084,0
macro avg,0.954925,0.934024,0.943933,21765
weighted avg,0.960764,0.961084,0.960627,21765


Train Data Classification:

Model was fitted nearly perfect on training data. There was slight overfitting but still perfomed well on new data.

In [244]:
report_train_lr = classification_report(y_train, lr.predict(X_train), digits=4, output_dict=True)

In [245]:
classification_report_lr_df_train = pd.DataFrame(report_train_lr).T

In [246]:
classification_report_lr_df_train['support'] = classification_report_lr_df_train.support.apply(int)

classification_report_lr_df_train.style.background_gradient(cmap='coolwarm',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])


Unnamed: 0,precision,recall,f1-score,support
0,0.999686,0.999746,0.999716,66892
1,0.999157,0.998959,0.999058,20168
accuracy,0.999564,0.999564,0.999564,0
macro avg,0.999421,0.999352,0.999387,87060
weighted avg,0.999563,0.999564,0.999564,87060


## Random Forest Classifier: Count Vectorizer

In [249]:
text_rfc = Pipeline([('vect', CountVectorizer()),
                     ('rfc', RandomForestClassifier())])
tuned_parameters = {
    'vect__ngram_range': [(1, 1)],
    'rfc__n_estimators': [200],
    'rfc__max_features': ['auto']
}

In [250]:
rfc = GridSearchCV(estimator = text_rfc, param_grid = tuned_parameters, cv=2, scoring='accuracy',return_train_score=True)
rfc.fit(X_train, y_train)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('rfc', RandomForestClassifier())]),
             param_grid={'rfc__max_features': ['auto'],
                         'rfc__n_estimators': [200],
                         'vect__ngram_range': [(1, 1)]},
             return_train_score=True, scoring='accuracy')

### Create a dataframe from the classification report and color the table for test and train data.

Test Data Classification:

Performed overall greater than $90$ on accuracy and percision and less than $90$ with recall.

In [251]:
report_test_rfc = classification_report(y_test, rfc.predict(X_test), digits=4, output_dict=True)

In [252]:
classification_report_rfc_df_test = pd.DataFrame(report_test_rfc).T

In [253]:
classification_report_rfc_df_test['support'] = classification_report_rfc_df_test.support.apply(int)

classification_report_rfc_df_test.style.background_gradient(cmap='coolwarm',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])

Unnamed: 0,precision,recall,f1-score,support
0,0.954344,0.992653,0.973122,16741
1,0.971737,0.84176,0.90209,5024
accuracy,0.957822,0.957822,0.957822,0
macro avg,0.963041,0.917206,0.937606,21765
weighted avg,0.958359,0.957822,0.956726,21765


Train Data Classification:

Model was fitted nearly perfect on training data. There was slight overfitting but still perfomed well on new data.

In [254]:
report_train_rfc = classification_report(y_train, rfc.predict(X_train), digits=4, output_dict=True)

In [255]:
classification_report_rfc_df_train = pd.DataFrame(report_train_rfc).T

In [256]:
classification_report_rfc_df_train['support'] = classification_report_rfc_df_train.support.apply(int)

classification_report_rfc_df_train.style.background_gradient(cmap='coolwarm',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])

Unnamed: 0,precision,recall,f1-score,support
0,0.999955,1.0,0.999978,66892
1,1.0,0.999851,0.999926,20168
accuracy,0.999966,0.999966,0.999966,0
macro avg,0.999978,0.999926,0.999952,87060
weighted avg,0.999966,0.999966,0.999966,87060
