In [1]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import xgboost as xgb
from classifiers import Classifiers
from nltk.corpus import stopwords

In [2]:
# Pickled scrape results, one file per subreddit. The error hint below
# indicates they are produced by the 01_scrape_reddit notebook.
scrape_path_0="../scrape_file/scrapes_atheism.pickle"
scrape_path_1="../scrape_file/scrapes_christianity.pickle"


def _load_scrape(path):
    """Unpickle and return the contents of one scrape file.

    Args:
        path: location of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        FileNotFoundError: if ``path`` does not exist; the error text is
            replaced with a hint to run the scraping notebook first.
    """
    try:
        with open(path,'rb') as handle:
            # SECURITY: pickle.load can execute arbitrary code. Only load
            # files produced locally by the scraper, never untrusted ones.
            return pickle.load(handle)
    except FileNotFoundError as e:
        e.strerror = "Pls run 01_scrape_reddit first to pull the data."
        raise


def _labelled_frame(posts, label):
    """Return a DataFrame with a ``post`` column and a constant ``label`` column."""
    return pd.DataFrame(posts,columns=["post"]).assign(label=label)


scrape0=_load_scrape(scrape_path_0)
scrape1=_load_scrape(scrape_path_1)

# Label 0 = posts from the atheism file, label 1 = posts from the christianity file.
df0=_labelled_frame(scrape0,0)
df1=_labelled_frame(scrape1,1)

# ignore_index=True gives the combined frame a fresh 0..n-1 index in one step,
# so no in-place reset_index is needed afterwards.
df=pd.concat([df0,df1],ignore_index=True)

### Prepare the dataset

In [3]:
# Features are the raw post text; targets are the 0/1 labels attached when
# the two scrape files were combined into `df`.
X, y = df['post'], df['label']

## Modify Pipeline for TF-IDF

In [4]:
def create_pipe(cls):
    """Build a two-step pipeline: TF-IDF vectorisation, then a classifier.

    Args:
        cls: the estimator instance to use as the final ``'cls'`` step.

    Returns:
        A ``Pipeline`` whose first step ``'tfidf'`` is a ``TfidfVectorizer``
        using NLTK's English stop-word list, and whose second step ``'cls'``
        is the estimator passed in.
    """
    english_stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=english_stop_words)
    steps = [
        ('tfidf', vectorizer),
        ('cls', cls),
    ]
    return Pipeline(steps)

In [5]:
# Hand the pipeline factory and the full dataset to the project's Classifiers
# helper. NOTE(review): the final positional argument (123) is presumably a
# random seed — confirm against classifiers.py.
class_obj = Classifiers(
    create_pipe,
    X,
    y,
    123,
)

In [6]:
# Each entry pairs an estimator instance ('cls') with the display name
# ('name') that is passed to class_obj.run_classifier for it.
classifiers = [
    {'cls': model, 'name': label}
    for model, label in (
        (RandomForestClassifier(n_jobs=-1), 'Random Forest'),
        (SVC(), 'SVC'),
        (xgb.XGBClassifier(objective="binary:logistic"), 'XGBoost'),
    )
]

In [7]:
# Run every configured model through the shared Classifiers helper, in list order.
for spec in classifiers:
    model, label = spec['cls'], spec['name']
    class_obj.run_classifier(model, label)

--Train--
Accuracy: 0.9995
--Test--
Accuracy: 0.8071
Precision: 0.8116
Recall: 0.8012
Specificity: 0.8131
F1: 0.8064
---Other metrics regarding Random Forest---
Average tree depth: 283.39
--Train--
Accuracy: 0.9880
--Test--
Accuracy: 0.8270
Precision: 0.8492
Recall: 0.7962
Specificity: 0.8579
F1: 0.8218
--Train--
Accuracy: 0.9444
--Test--
Accuracy: 0.8111
Precision: 0.8094
Recall: 0.8150
Specificity: 0.8072
F1: 0.8122


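Looks like TF-IDF doesn't help much either: test accuracy stays at roughly 0.81–0.83 for all three models while train accuracy is 0.94–1.00, so the models are still overfitting. Let's look at other methods.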