# Project 3: Webscraping, NLP and classification modelling

# Contents:

1) Fresh train test split using new reddit pull  
2) Running the model and answering the business question

In [1]:
# library imports
import requests
import time
import pandas as pd
import numpy as np
import ast
import re
from tqdm import tqdm

# preprocessing imports
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords 

# modeling imports
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:

train_df = pd.read_csv('../datasets/cleaned_for_modelling_30_nov_df.csv')

In [3]:
test_df = pd.read_csv('../datasets/fresh_test_data_cleaned_for_modelling_4dec_df.csv')

In [4]:
train_df.head()

Unnamed: 0,title,post,upvotes,gilded,belongs_to_sub2,title_x_post,token,joined_text,highly_upvoted,word_count,char_count,cleaned_text
0,Just need to vent...,"I get that tempers are shorter these days, but...",230,0,0,Just need to vent... I get that tempers are sh...,"['need', 'vent', 'get', 'tempers', 'shorter', ...",Just need to vent... I get that tempers are sh...,1,48,579,need vent get tempers shorter days hard time l...
1,Reverse call center post,On mobile so I hope I do this right. \n\nI ha...,39,0,0,Reverse call center post On mobile so I hope I...,"['reverse', 'call', 'center', 'post', 'mobile'...",Reverse call center post On mobile so I hope I...,0,84,1017,reverse call center post mobile hope right cal...
2,"""So you're willing to lose a customer for $3 d...",I work for a car rental company as a specialis...,763,0,0,"""So you're willing to lose a customer for $3 d...","['willing', 'lose', 'customer', 'dollars', 'wo...","""So you're willing to lose a customer for $3 d...",1,585,6757,willing lose customer dollars work car rental ...
3,Free Talk Friday - Nov 27,Welcome to Free Talk Friday! We are suspending...,0,0,0,Free Talk Friday - Nov 27 Welcome to Free Talk...,"['free', 'talk', 'friday', 'nov', 'welcome', '...",Free Talk Friday - Nov 27 Welcome to Free Talk...,0,38,368,free talk friday nov welcome free talk friday ...
4,Accidentally Exposed a Family Fraud,I work for a small local ISP. One of the thin...,958,0,0,Accidentally Exposed a Family Fraud I work for...,"['accidentally', 'exposed', 'family', 'fraud',...",Accidentally Exposed a Family Fraud I work for...,1,366,3608,accidentally exposed family fraud work small l...


In [5]:
test_df.head()

Unnamed: 0,title,post,upvotes,gilded,belongs_to_sub2,title_x_post,cleaned_text
0,Your Son is Seven and He's Getting WHAT?,So last night I had a very bizarre call from s...,412,0,0,Your Son is Seven and He's Getting WHAT? So la...,son seven getting last night bizarre call some...
1,No. You don’t get to speak to a manager.,"So I had this call from a third party, which i...",755,0,0,No. You don’t get to speak to a manager. So I ...,get speak manager call third party nothing new...
2,I am losing faith in humanity!,Why people give their SSN and DOB to random pe...,18,0,0,I am losing faith in humanity! Why people give...,losing faith humanity people give ssn dob rand...
3,"Thanks for being Racist, Susan.",I am 50% white but I have an uncommon first na...,8,0,0,"Thanks for being Racist, Susan. I am 50% white...",thanks racist susan white uncommon first name ...
4,Let me vent to you about a dumb call I had today,So I am a team leader. We will call the custom...,5,0,0,Let me vent to you about a dumb call I had tod...,let vent dumb call today team leader call cust...


In [6]:
X_test = test_df['cleaned_text']
X_test.shape

(801,)

In [7]:
y_test = test_df['belongs_to_sub2']
y_test.shape

(801,)

In [8]:
X_train = train_df['cleaned_text']
y_train = train_df['belongs_to_sub2']

In [9]:
X_train.shape

(1743,)

In [10]:
y_train.shape

(1743,)

In [11]:
train_df.shape

(1743, 12)

In [24]:
y_train.value_counts(normalize = True)

1    0.516925
0    0.483075
Name: belongs_to_sub2, dtype: float64

# Models to be tested


Transformer: TF-IDF  
Estimator: Naive Bayes (Hyperparameters - 'tf__max_features': 10000, 'tf__ngram_range': (1, 2))  
Accuracy: 94%


Transformer: CountVectorizer  
Estimator: Naive Bayes (Hyperparameters - 'cvec__max_df': 0.9, 'cvec__max_features': 10000, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 2)}  
Accuracy: 95%. 


Transformer: TF-IDF  
Estimator: Random Forest (Hyperparameters - 'tf__max_features': 10000, 'tf__ngram_range': (1, 2))  
Accuracy: 93%


Transformer: TF-IDF
Estimator: Extra Trees (Hyperparameters - 'tf__max_features': 10000, 'tf__ngram_range': (1, 2))  
Accuracy: 93%.

Transformer: TF-IDF
Estimator: SVM (Hyperparameters - 'tf__max_features': 10000, 'tf__ngram_range': (1, 1))  
Accuracy: 93%.

Transformer: TF-IDF  
Estimator: Log Reg (Hyperparameters - 'tf__max_features': 15000, 'tf__ngram_range': (1, 2))  
Accuracy: 93%


# Model 1: TF-IDF and NaiveBayes

In [13]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('nb', MultinomialNB())
])

params = {
                'tf__max_features' : [10_000,],
                'tf__ngram_range' : [(1,2)]
}


gs1 = GridSearchCV(pipe, param_grid = params, cv = 5)

gs1.fit(X_train, y_train)

gs_train_accuracy = round(gs1.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs1.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

Train Accuracy: 0.97
Test Accuracy: 0.97


In [14]:
tn, fp, fn, tp = confusion_matrix(y_test, gs1.predict(X_test)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))


True Negatives: 383
False Positives: 17
False Negatives: 11
True Positives: 390

Accuracy:  0.9650436953807741
Sensitivity:  0.972568578553616
Specificity:  0.9575
Precision:  0.9582309582309583


# Model 2: CountVectorizer and NaiveBayes

In [15]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])


params = {
    'cvec__max_features' : [10_000],
    'cvec__min_df' : [3],
    'cvec__max_df': [0.9],
    'cvec__ngram_range' : [(1,2)]
}


gs2 = GridSearchCV(pipe, param_grid = params, cv = 5)

gs2.fit(X_train, y_train)

gs_train_accuracy = round(gs2.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs2.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

Train Accuracy: 0.97
Test Accuracy: 0.97


In [16]:
tn, fp, fn, tp = confusion_matrix(y_test, gs2.predict(X_test)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))



True Negatives: 391
False Positives: 9
False Negatives: 13
True Positives: 388

Accuracy:  0.9725343320848939
Sensitivity:  0.9675810473815462
Specificity:  0.9775
Precision:  0.9773299748110831


# Model 3: TF-IDF and Random Forest

In [17]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('rf', RandomForestClassifier())
])

params = {
                'tf__max_features' : [10_000,],
                'tf__ngram_range' : [(1,2)]
}


gs3 = GridSearchCV(pipe, param_grid = params, cv = 5)

gs3.fit(X_train, y_train)

gs_train_accuracy = round(gs3.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs3.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

Train Accuracy: 1.0
Test Accuracy: 1.0


In [18]:
tn, fp, fn, tp = confusion_matrix(y_test, gs3.predict(X_test)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))



True Negatives: 398
False Positives: 2
False Negatives: 0
True Positives: 401

Accuracy:  0.9975031210986267
Sensitivity:  1.0
Specificity:  0.995
Precision:  0.9950372208436724


# Model 4: TF-IDF and Extra Trees

In [19]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('et', ExtraTreesClassifier())
])

params = {
                'tf__max_features' : [10_000,],
                'tf__ngram_range' : [(1,2)]
}


gs4 = GridSearchCV(pipe, param_grid = params, cv = 5)

gs4.fit(X_train, y_train)

gs_train_accuracy = round(gs4.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs4.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

Train Accuracy: 1.0
Test Accuracy: 1.0


In [20]:
tn, fp, fn, tp = confusion_matrix(y_test, gs4.predict(X_test)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))




True Negatives: 397
False Positives: 3
False Negatives: 1
True Positives: 400

Accuracy:  0.9950062421972534
Sensitivity:  0.9975062344139651
Specificity:  0.9925
Precision:  0.9925558312655087


# Model 5: TF-IDF and SVM

In [21]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('svc', SVC())
])

params = {
                'tf__max_features' : [10_000,],
                'tf__ngram_range' : [(1,2)]
}


gs5 = GridSearchCV(pipe, param_grid = params, cv = 5)

gs5.fit(X_train, y_train)

gs_train_accuracy = round(gs5.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs5.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

Train Accuracy: 1.0
Test Accuracy: 1.0


In [22]:
tn, fp, fn, tp = confusion_matrix(y_test, gs5.predict(X_test)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))




True Negatives: 397
False Positives: 3
False Negatives: 1
True Positives: 400

Accuracy:  0.9950062421972534
Sensitivity:  0.9975062344139651
Specificity:  0.9925
Precision:  0.9925558312655087


# Model 6: TF-IDF and Logistic Regression

In [26]:
pipe = Pipeline([
                ('tf', TfidfVectorizer()),
                ('lr', LogisticRegression())
])

params = {
                'tf__max_features' : [15_000,],
                'tf__ngram_range' : [(1,2)]
}


gs5 = GridSearchCV(pipe, param_grid = params, cv = 5)

gs5.fit(X_train, y_train)

gs_train_accuracy = round(gs5.score(X_train, y_train),2)
print(f'Train Accuracy: {gs_train_accuracy}')

gs_test_accuracy = round(gs5.score(X_test, y_test),2)
print(f'Test Accuracy: {gs_test_accuracy}')

Train Accuracy: 0.98
Test Accuracy: 0.98


In [27]:
tn, fp, fn, tp = confusion_matrix(y_test, gs5.predict(X_test)).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

print("\nAccuracy: ", (tn + tp) / (tn + fp + fn + tp))
print("Sensitivity: ", tp / (tp + fn))
print("Specificity: ", tn / (tn + fp))
print("Precision: ", tp / (tp + fp))





True Negatives: 392
False Positives: 8
False Negatives: 8
True Positives: 393

Accuracy:  0.9800249687890137
Sensitivity:  0.9800498753117207
Specificity:  0.98
Precision:  0.9800498753117207


## Reminder of Problem Statement: 


### What are the indicative words that help to effectively target advertising to a niche user group (tech support staff)? 

## Reminder of Data Science Challenge: 

### How can I accurately identify posts that belong to 2 subreddits that are very similar in purpose but show differences of nuance?

### With accuracy score of 99.7% on 'true' unseen data, Random Forest is theoretically the best model to answer the data science challenge. 

### However, Logistic Regression is a better way to answer the business problem, as the model is interpretable and I can get relevant keywords for better classification. With 98% accurary in logistic regression, I am confident to recommend this model for better advertising targeting.


