In [1]:
import pickle
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from utils import create_pipe, run_classifiers

In [2]:
#open data file

path="../data/combined.pickle"

try:
    with open(path,'rb') as handle:
        pickleload=pickle.load(handle)
except FileNotFoundError as e:
    e.strerror = "Pls run 01_scrape_reddit first to pull the data and 02_EDA to merge data."
    raise e

df=pd.DataFrame(pickleload)

df

Unnamed: 0,post,label
0,this is the nail in the coffin for the idea of...,0
1,"i’m closeted, always been, and always will be....",0
2,the fifa world cup in qatar should be a remind...,0
3,we moved from the dc metro area last year to t...,0
4,they spend so much time focusing on arbitrary ...,0
...,...,...
10078,"if jesus died for our sins, what's keeping u f...",1
10079,hello everybody it may seem like a dumb questi...,1
10080,today's readings: 1 corinthian 1:4-8 &gt;i tha...,1
10081,i don't propose this question in the sense of ...,1


### Prepare the dataset

In [3]:
X=df['post']
y=df['label']

## Add Random Forest and XGBoost into the mix

and some parameters to RandomizedSearch for.

In [4]:
classifiers_list=[
    {
        'cls':MultinomialNB(),
        'name':'NaiveBayes',
        'fixed_params':{'min_df': 0.05, 'max_features': 3500, 'max_df': 0.8,'use_idf': False, 'ngram_range': (1, 2)},
    },
    {
        'cls':RandomForestClassifier(),
        'name':'RandomForest',
        'fixed_params':{'min_df': 0.05, 'max_features': 3500, 'max_df': 0.8,'use_idf': False, 'ngram_range': (1, 2)},
        'float_params':{
            'cls__max_depth':[60,80,100,120,140],
            'cls__min_samples_split':[3,4,5,6,7],
            'cls__min_samples_leaf':[2,3,4]
        }
    },
    {
        'cls':XGBClassifier(),
        'name':'XGBoost',
        'fixed_params':{'min_df': 0.05, 'max_features': 3500, 'max_df': 0.8,'use_idf': False, 'ngram_range': (1, 2)},
        'float_params':{
            'cls__eta':[0.08,0.1,0.12],
            'cls__gamma':[1,10,20,50],
            'cls__max_depth':[6,7,8],
            'cls__subsample_bytree':[0.5,0.7,0.9]
        }
    }
]
run_classifiers(classifiers_list,X,y,tfidf=True)

ROC AUC with CV=5: 0.8257169538682693
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters and accuracy
{'cls__min_samples_split': 7, 'cls__min_samples_leaf': 4, 'cls__max_depth': 60}
ROC AUC with CV=5: 0.8351760606719554
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Parameters: { "subsample_bytree" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Best parameters and accuracy
{'cls__subsample_bytree': 0.9, 'cls__max_depth': 6, 'cls__gamma': 1, 'cls__eta': 0.1}
ROC AUC with CV=5: 0.8340581211198932


So here we see,

|Model|Vectorizer|ROC AUC score|
|---|---|---|
|Naive Bayes|CountVectorizer|81.2%|
|Naive Bayes|TfidfVectorizer|82.6%|
|Random Forest|TfidfVectorizer|83.5%|
|XGBoost|TfidfVectorizer|83.4%|

RF and XGBoost gave very close scores.  
We pick XGBoost ahead of RF as XGBoost has significant performance advantage over Random Forest.