In [1]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# from sklearn.metrics import confusion_matrix, classification_report
import xgboost as xgb
from classifiers import Classifiers
from nltk.corpus import stopwords

In [2]:
scrape_path_0="../scrape_file/scrapes_atheism.pickle"
scrape_path_1="../scrape_file/scrapes_christianity.pickle"


def _load_scrape(path):
    """Unpickle one scrape file and return its contents.

    Raises
    ------
    FileNotFoundError
        If `path` does not exist. The errno, the OS error text and the
        filename of the original error are kept, and a hint about running
        01_scrape_reddit is appended to the message.
    """
    try:
        with open(path, 'rb') as handle:
            # NOTE(review): pickle.load can execute arbitrary code while
            # deserialising; only load scrape files that were produced locally.
            return pickle.load(handle)
    except FileNotFoundError as e:
        # Build a new error instead of overwriting e.strerror, so the OS
        # reason is not lost and the original traceback stays chained.
        raise FileNotFoundError(
            e.errno,
            f"{e.strerror}. Pls run 01_scrape_reddit first to pull the data.",
            e.filename,
        ) from e


scrape0 = _load_scrape(scrape_path_0)
scrape1 = _load_scrape(scrape_path_1)

# One row per post; label 0 = atheism scrape, label 1 = christianity scrape.
df0 = pd.DataFrame(scrape0, columns=["post"]).assign(label=0)
df1 = pd.DataFrame(scrape1, columns=["post"]).assign(label=1)

# ignore_index=True renumbers the rows 0..n-1 in one step (no in-place reset needed).
df = pd.concat([df0, df1], ignore_index=True)

### Prepare the dataset

In [3]:
# Features are the raw post text; the target is the 0/1 label column.
X, y = df['post'], df['label']

# Hold out 20% of the posts, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

In [4]:
# Display the posts Series as a quick sanity check of the loaded text.
X

0        It’s actually just sad that I see these pastor...
1        TLDR: I love my Christian mother, but she has ...
2        I literally posted here like an hour or so ago...
3        My wife and I are in the middle of loading the...
4        This is the nail in the coffin for the idea of...
                               ...                        
10076    Jesus lived a perfect life, and died in your p...
10077    Why don't nuns wear the blue coat, red dress a...
10078    I've had a lack of appetite for sometime and n...
10079    Title says it all. Wondering if anyone can giv...
10080    You don't have to be in Tennessee. If you have...
Name: post, Length: 10081, dtype: object

In [5]:
# Fit a vocabulary on the whole corpus to see how many distinct tokens it contains.
# The token pattern keeps runs of letters only (word characters minus digits and "_").
# It is a raw string because "\W" and "\d" are regex classes, not Python string
# escapes; the non-raw form triggers an invalid-escape warning.
cv=CountVectorizer(stop_words=stopwords.words("english"),token_pattern=r"[^\W\d_]+")

# Only the learned vocabulary is used below, so fit() is enough here.
cv.fit(X)

arr_=cv.get_feature_names_out()
#number of features
len(arr_)

36200

## Run Preliminary Classifiers

In [6]:
def create_pipe(cls, max_features=2000):
    """Build a bag-of-words pipeline that ends in the given classifier.

    Parameters
    ----------
    cls : estimator
        Estimator instance used as the final ``'cls'`` step.
    max_features : int or None, default 2000
        Forwarded to ``CountVectorizer(max_features=...)``. The default keeps
        the value that was previously hard-coded.

    Returns
    -------
    Pipeline
        Two steps: ``'cvec'`` (CountVectorizer with English stop words and a
        letters-only token pattern) followed by ``'cls'``.
    """
    vectorizer = CountVectorizer(
        stop_words=stopwords.words("english"),
        # Raw string: "\W" and "\d" are regex classes, not Python string
        # escapes; the non-raw form triggers an invalid-escape warning.
        token_pattern=r"[^\W\d_]+",
        max_features=max_features,
        # Alternative settings left in the original cell:
        # max_df=0.95, min_df=0.2, max_features=10000
    )
    return Pipeline([
        ('cvec', vectorizer),
        ('cls', cls),
    ])

In [7]:
# Build the project-local Classifiers helper from the pipeline factory and the full X / y.
# NOTE(review): it receives the full X / y rather than the X_train / X_test split made
# earlier; presumably Classifiers splits internally — confirm in classifiers.py.
# NOTE(review): 123 is presumably a random seed (same value as the train_test_split
# random_state) — confirm against the Classifiers signature.
class_obj=Classifiers(create_pipe,X,y,123)

In [8]:
# Candidate models for the preliminary comparison, as (display name, estimator) pairs.
# NOTE(review): RandomForestClassifier is created without random_state here; unless
# Classifiers seeds it, its scores may differ between runs — confirm.
_candidate_models = (
    ('Naive Bayes', MultinomialNB()),
    ('Random Forest', RandomForestClassifier(n_jobs=-1)),
    # ('SVC', SVC()),  # left out of the comparison in the original cell
    ('XGBoost', xgb.XGBClassifier(objective="binary:logistic")),
)

# Each entry is a dict with the estimator under 'cls' and its label under 'name'.
classifiers = [
    {'cls': estimator, 'name': label}
    for label, estimator in _candidate_models
]

In [9]:
# Run every candidate through the shared Classifiers helper, in list order.
for spec in classifiers:
    estimator, label = spec['cls'], spec['name']
    class_obj.run_classifier(estimator, label)

--Train--
Accuracy: 0.8077
--Test--
Accuracy: 0.7883
Precision: 0.8678
Recall: 0.6815
Specificity: 0.8956
F1: 0.7634
--Train--
Accuracy: 0.9994
--Test--
Accuracy: 0.8131
Precision: 0.8323
Recall: 0.7854
Specificity: 0.8410
F1: 0.8081
---Other metrics regarding Random Forest---
Average tree depth: 219.25
--Train--
Accuracy: 0.9240
--Test--
Accuracy: 0.8161
Precision: 0.8065
Recall: 0.8328
Specificity: 0.7992
F1: 0.8195


Running the 3 classifiers, we see that train accuracy and test accuracy are quite far apart, most visibly for Random Forest (0.9994 vs 0.8131) and XGBoost (0.9240 vs 0.8161).  
This indicates significant overfitting.  
Furthermore, since all 3 models overfit to some degree, this suggests the problem lies with the training features rather than with model tuning.  
We will deal with this subsequently.