In [106]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import xgboost as xgb

In [2]:
scrape_path_0="../scrape_file/scrapes_atheism.pickle"
scrape_path_1="../scrape_file/scrapes_christianity.pickle"

In [3]:
with open(scrape_path_0,'rb') as handle:
    scrape0=pickle.load(handle)
with open(scrape_path_1,'rb') as handle:
    scrape1=pickle.load(handle)

In [22]:
df0=pd.DataFrame(scrape0,columns=["post"])
df0['label']=0

df1=pd.DataFrame(scrape1,columns=["post"])
df1['label']=1

df=pd.concat([df0,df1])
df.reset_index(inplace=True,drop=True)

In [23]:
df

Unnamed: 0,post,label
0,This is the nail in the coffin for the idea of...,0
1,"I’m closeted, always been, and always will be....",0
2,The FIFA world cup in Qatar should be a remind...,0
3,We moved from the DC metro area last year to t...,0
4,They spend so much time focusing on arbitrary ...,0
...,...,...
10078,"If Jesus died for our sins, what's keeping us ...",1
10079,Hello everybody it may seem like a dumb questi...,1
10080,Today's readings:\n\n1 Corinthians 1:4-8\n\n&g...,1
10081,I don't propose this question in the sense of ...,1


In [24]:
df['label'].mean()

0.5013388872359417

Quite an equal representation (50%) of labels are 1 and 0.

In [28]:
X=df['post']
y=df['label']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=123)

In [29]:
X

0        This is the nail in the coffin for the idea of...
1        I’m closeted, always been, and always will be....
2        The FIFA world cup in Qatar should be a remind...
3        We moved from the DC metro area last year to t...
4        They spend so much time focusing on arbitrary ...
                               ...                        
10078    If Jesus died for our sins, what's keeping us ...
10079    Hello everybody it may seem like a dumb questi...
10080    Today's readings:\n\n1 Corinthians 1:4-8\n\n&g...
10081    I don't propose this question in the sense of ...
10082    I'd like to preface this by saying that I've n...
Name: post, Length: 10083, dtype: object

In [41]:
cv=CountVectorizer(stop_words='english')
cv.fit(X_train)

In [42]:
X_train_trf=cv.transform(X_train)

In [43]:
X_train_trf.shape

(8066, 33821)

In [105]:
cv.get_feature_names()[0:20]

['00',
 '000',
 '00050571',
 '000d3ad24077',
 '000d3ad24a0d',
 '000d3ad24c60',
 '000yrs',
 '001',
 '00120',
 '0055',
 '00729735256',
 '00a',
 '00am',
 '00s',
 '01',
 '013',
 '01660654507',
 '01zgzjcqmse',
 '02',
 '0280270']

In [48]:
X_train_trf.todense()[0]

matrix([[0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [51]:
X_train[0]

'This is the nail in the coffin for the idea of the Christian God, imo. I 100% believed the Bible, believed in God, Jesus, the Holy Ghost, miracles, Angels, Devils, Demons, Heaven, Hell and the soul. I was indoctrinated from BIRTH to believe it and by golly it worked. I wanted to become a preacher, I wanted to tell people about Jesus because I truly believed they were going to hell! So how, after 30 years of head-in-the-sand belief, would God suddenly not be real to me? I honestly got "saved", baptized, and believed in literally everything. And now? Nothing. I believe there is no "supreme being". With literally everything in my life geared toward believing in God and being surrounded by what is said by Christians to be God\'s creation, it should be *impossible* for me to not believe. I shouldn\'t even be able to get here. The fact that I can NOT believe in God after all the conditioning in my life to believe in him, is the biggest personal proof I have that there is no God. \n\nIf thei

## Run Preliminary Classifiers

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import xgboost as xgb

In [108]:
def create_pipe(cls):
    return Pipeline([
        ('cvec',CountVectorizer(stop_words='english')),
        ('cls',cls)
    ])

In [109]:
def run_classifier(cls,name):
    print(f"========= Running classifier: {name} =========")
    pipe=create_pipe(cls)
    pipe.fit(X_train,y_train)
    print(f"Training accuracy: {pipe_rf.score(X_train,y_train):.4f}")
    print(f"Test accuracy: {pipe_rf.score(X_test,y_test):.4f}")
    
    tn, fp, fn, tp = confusion_matrix(y_test, pipe.predict(X_test)).ravel()
    prec=tp/(tp+fp)
    recall=tp/(tp+fn)
    spec=tn/(tn+fp)
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Specificity: {spec:.4f}")
    print(f"F1: {2*(prec*recall)/(prec+recall):.4f}")

In [110]:
classifiers=[
    {
        'cls':RandomForestClassifier(n_jobs=-1),
        'name':'Random Forest'
    },
    {
        'cls':SVC(),
        'name':'SVC'
    },
    {
        'cls':xgb.XGBClassifier(objective="binary:logistic"),
        'name':'XGBoost'
    },
]

In [111]:
for i in classifiers:
    run_classifier(i['cls'],i['name'])

Training accuracy: 0.9995
Test accuracy: 0.7987
Precision: 0.8042
Recall: 0.8368
Specificity: 0.7952
F1: 0.8202
Training accuracy: 0.9995
Test accuracy: 0.7987
Precision: 0.7670
Recall: 0.8724
Specificity: 0.7336
F1: 0.8163
Training accuracy: 0.9995
Test accuracy: 0.7987
Precision: 0.7943
Recall: 0.8477
Specificity: 0.7793
F1: 0.8201


Running the 3 classifiers, we see that train accuracy and test accuracy is quite far apart.  
There is significant overfit.  
We will deal with this subsequently.