# Project 3: Webscraping, NLP and classification modelling

# Contents:

1) Model 1 - TF-IDF and Random Forest  
2) Model 2 - TF-IDF and Extra Trees   
3) Model 3 - TF-IDF and Support Vector Machine   
4) Model 4 - TF-IDF and ADA Boost  
5) Model 4 - TF-IDF and Gradient Boost  
6) Advanced Model comparison  

In [2]:
# library imports
import requests
import time
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm

# preprocessing imports
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# modeling imports
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
df = pd.read_csv('../datasets/cleaned_for_modelling_30_nov_df.csv')

In [4]:
X = df['cleaned_text']
y = df['belongs_to_sub2']

In [5]:
y.value_counts()

1    901
0    842
Name: belongs_to_sub2, dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [7]:
X_train.shape

(1307,)

In [8]:
X_test.shape

(436,)

In [9]:
y_train.shape

(1307,)

In [10]:
y_test.shape

(436,)

# Model 1: TFIDF and Random Forest Classifer

In [6]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('rf', RandomForestClassifier())
])

params = {
                'tf__max_features' : [5000,10_000,15_000],
                'tf__ngram_range' : [(1,1), (1,2)]
}


gs = GridSearchCV(pipe, param_grid = params, cv = 5)


In [7]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('rf', RandomForestClassifier())]),
             param_grid={'tf__max_features': [5000, 10000, 15000],
                         'tf__ngram_range': [(1, 1), (1, 2)]})

In [8]:
print(gs.best_params_)
gs_model = gs.best_estimator_

gs_train_accuracy = round(gs_model.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs_model.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

{'tf__max_features': 10000, 'tf__ngram_range': (1, 2)}
Train Accuracy: 1.0
Test Accuracy: 0.94


In [9]:
y_pred = gs.predict(X_test)
cm_def = np.array([['True Negative', 'False Positive'],
                ['False Negative', 'True Positive']])
cm_def = pd.DataFrame(cm_def,columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
display(cm_def)
display(cm_df)

Unnamed: 0,pred neg,pred pos
actual neg,True Negative,False Positive
actual pos,False Negative,True Positive


Unnamed: 0,pred neg,pred pos
actual neg,202,9
actual pos,18,207


# Model 2: TFIDF and Extra Trees Classifer

In [10]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('et', ExtraTreesClassifier())
])

params = {
                'tf__max_features' : [5000,10_000,15_000],
                'tf__ngram_range' : [(1,1), (1,2)]
}


gs = GridSearchCV(pipe, param_grid = params, cv = 5)

In [11]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('et', ExtraTreesClassifier())]),
             param_grid={'tf__max_features': [5000, 10000, 15000],
                         'tf__ngram_range': [(1, 1), (1, 2)]})

In [12]:
print(gs.best_params_)
gs_model = gs.best_estimator_

gs_train_accuracy = round(gs_model.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs_model.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

{'tf__max_features': 10000, 'tf__ngram_range': (1, 2)}
Train Accuracy: 1.0
Test Accuracy: 0.94


In [13]:
y_pred = gs.predict(X_test)
cm_def = np.array([['True Negative', 'False Positive'],
                ['False Negative', 'True Positive']])
cm_def = pd.DataFrame(cm_def,columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
display(cm_def)
display(cm_df)

Unnamed: 0,pred neg,pred pos
actual neg,True Negative,False Positive
actual pos,False Negative,True Positive


Unnamed: 0,pred neg,pred pos
actual neg,199,12
actual pos,15,210


# Model 3: TFIDF and Support Vector Machine Classifier

In [14]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('svc', SVC())
])

params = {
                'tf__max_features' : [5000,10_000,15_000],
                'tf__ngram_range' : [(1,1), (1,2)]
}


gs = GridSearchCV(pipe, param_grid = params, cv = 5)

In [15]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('svc', SVC())]),
             param_grid={'tf__max_features': [5000, 10000, 15000],
                         'tf__ngram_range': [(1, 1), (1, 2)]})

In [16]:
print(gs.best_params_)
gs_model = gs.best_estimator_

gs_train_accuracy = round(gs_model.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs_model.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

{'tf__max_features': 10000, 'tf__ngram_range': (1, 1)}
Train Accuracy: 1.0
Test Accuracy: 0.93


In [17]:
y_pred = gs.predict(X_test)
cm_def = np.array([['True Negative', 'False Positive'],
                ['False Negative', 'True Positive']])
cm_def = pd.DataFrame(cm_def,columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
display(cm_def)
display(cm_df)

Unnamed: 0,pred neg,pred pos
actual neg,True Negative,False Positive
actual pos,False Negative,True Positive


Unnamed: 0,pred neg,pred pos
actual neg,195,16
actual pos,15,210


# Model 4: TFIDF and ADA Boost Classifier

In [18]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('ada', AdaBoostClassifier())
])

params = {
                'tf__max_features' : [5000,10_000,15_000],
                'tf__ngram_range' : [(1,1), (1,2)]
}


gs = GridSearchCV(pipe, param_grid = params, cv = 5)

In [19]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('ada', AdaBoostClassifier())]),
             param_grid={'tf__max_features': [5000, 10000, 15000],
                         'tf__ngram_range': [(1, 1), (1, 2)]})

In [20]:
print(gs.best_params_)
gs_model = gs.best_estimator_

gs_train_accuracy = round(gs_model.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs_model.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

{'tf__max_features': 15000, 'tf__ngram_range': (1, 1)}
Train Accuracy: 0.96
Test Accuracy: 0.89


In [21]:
y_pred = gs.predict(X_test)
cm_def = np.array([['True Negative', 'False Positive'],
                ['False Negative', 'True Positive']])
cm_def = pd.DataFrame(cm_def,columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
display(cm_def)
display(cm_df)

Unnamed: 0,pred neg,pred pos
actual neg,True Negative,False Positive
actual pos,False Negative,True Positive


Unnamed: 0,pred neg,pred pos
actual neg,190,21
actual pos,27,198


# Model 5: TFIDF and Gradient Boost Classifier

In [22]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('gr', GradientBoostingClassifier())
])

params = {
                'tf__max_features' : [5000,10_000,15_000],
                'tf__ngram_range' : [(1,1), (1,2)]
}


gs = GridSearchCV(pipe, param_grid = params, cv = 5)

In [23]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('gr', GradientBoostingClassifier())]),
             param_grid={'tf__max_features': [5000, 10000, 15000],
                         'tf__ngram_range': [(1, 1), (1, 2)]})

In [24]:
print(gs.best_params_)
gs_model = gs.best_estimator_

gs_train_accuracy = round(gs_model.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs_model.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

{'tf__max_features': 15000, 'tf__ngram_range': (1, 2)}
Train Accuracy: 1.0
Test Accuracy: 0.92


In [25]:
y_pred = gs.predict(X_test)
cm_def = np.array([['True Negative', 'False Positive'],
                ['False Negative', 'True Positive']])
cm_def = pd.DataFrame(cm_def,columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns = ['pred neg', 'pred pos'], index = ['actual neg','actual pos'])
display(cm_def)
display(cm_df)

Unnamed: 0,pred neg,pred pos
actual neg,True Negative,False Positive
actual pos,False Negative,True Positive


Unnamed: 0,pred neg,pred pos
actual neg,199,12
actual pos,25,200


# From the advanced models-  Random Forest, Extra Trees and SVM are performing the best on accuracy. 

# In the next step, I'll evaluate the following models on 'true' unseen data, with a fresh scrape on Reddit


Transformer: TF-IDF  
Estimator: Naive Bayes (Hyperparameters - 'tf__max_features': 10000, 'tf__ngram_range': (1, 2))  
Accuracy: 94%


Transformer: CountVectorizer  
Estimator: Naive Bayes (Hyperparameters - 'cvec__max_df': 0.9, 'cvec__max_features': 10000, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 2)}  
Accuracy: 95%. 


Transformer: TF-IDF  
Estimator: Random Forest (Hyperparameters - 'tf__max_features': 10000, 'tf__ngram_range': (1, 2))  
Accuracy: 93%


Transformer: TF-IDF
Estimator: Extra Trees (Hyperparameters - 'tf__max_features': 10000, 'tf__ngram_range': (1, 2))  
Accuracy: 93%.

Transformer: TF-IDF
Estimator: SVM (Hyperparameters - 'tf__max_features': 10000, 'tf__ngram_range': (1, 1))  
Accuracy: 93%.