In [1]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import xgboost as xgb
from classifiers import Classifiers
from nltk.corpus import stopwords

In [2]:
# Pickled scrape files, expected to be produced by the 01_scrape_reddit step.
scrape_path_0="../scrape_file/scrapes_atheism.pickle"
scrape_path_1="../scrape_file/scrapes_christianity.pickle"


def _load_scrape(path):
    """Unpickle one scrape file and return whatever object it contains.

    Raises
    ------
    FileNotFoundError
        If `path` does not exist. The error carries a hint to run the
        scraping notebook and is chained to the original OS error, so the
        underlying "no such file" message stays visible in the traceback.
    """
    try:
        with open(path, 'rb') as handle:
            # NOTE: pickle.load can execute arbitrary code; only load files
            # that were written by this project's own scraping step.
            return pickle.load(handle)
    except FileNotFoundError as e:
        # Raise a fresh exception instead of overwriting e.strerror in place,
        # keeping errno and filename and chaining the original error.
        raise FileNotFoundError(
            e.errno,
            "Pls run 01_scrape_reddit first to pull the data.",
            e.filename,
        ) from e


def _labelled_frame(posts, label):
    """Return a DataFrame with a 'post' column built from `posts` and a constant 'label' column."""
    frame = pd.DataFrame(posts, columns=["post"])
    frame['label'] = label
    return frame


scrape0 = _load_scrape(scrape_path_0)
scrape1 = _load_scrape(scrape_path_1)

# Label 0 marks rows from the atheism scrape, label 1 rows from the christianity scrape.
df0 = _labelled_frame(scrape0, 0)
df1 = _labelled_frame(scrape1, 1)

# ignore_index=True yields a fresh 0..n-1 index in one step
# (same result as concat followed by reset_index(drop=True)).
df = pd.concat([df0, df1], ignore_index=True)

### Prepare the dataset

In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
# Tokenizer that extracts runs of word characters (\w+) from a string.
regexptokenizer = RegexpTokenizer(r'\w+')

# Single WordNet lemmatizer instance reused by the lemmatization cells.
lemmatizer = WordNetLemmatizer()

In [4]:
def _to_tokens(raw_post):
    """Lower-case a raw post string and split it into word tokens."""
    return regexptokenizer.tokenize(raw_post.lower())


# Replace every post string with its list of lower-cased tokens.
# After this cell the column holds lists, so the cell cannot be re-run
# without first reloading df.
df['post'] = df['post'].apply(_to_tokens)
df['post']

0        [it, s, actually, just, sad, that, i, see, the...
1        [tldr, i, love, my, christian, mother, but, sh...
2        [i, literally, posted, here, like, an, hour, o...
3        [my, wife, and, i, are, in, the, middle, of, l...
4        [this, is, the, nail, in, the, coffin, for, th...
                               ...                        
10076    [jesus, lived, a, perfect, life, and, died, in...
10077    [why, don, t, nuns, wear, the, blue, coat, red...
10078    [i, ve, had, a, lack, of, appetite, for, somet...
10079    [title, says, it, all, wondering, if, anyone, ...
10080    [you, don, t, have, to, be, in, tennessee, if,...
Name: post, Length: 10081, dtype: object

In [5]:
def _lemmatize_as_nouns(tokens):
    """Lemmatize each token in the list, treating it as a noun (pos='n')."""
    return [lemmatizer.lemmatize(token, pos='n') for token in tokens]


# First lemmatization pass: noun forms (e.g. "nuns" -> "nun").
df['post'] = df['post'].apply(_lemmatize_as_nouns)
df['post']

0        [it, s, actually, just, sad, that, i, see, the...
1        [tldr, i, love, my, christian, mother, but, sh...
2        [i, literally, posted, here, like, an, hour, o...
3        [my, wife, and, i, are, in, the, middle, of, l...
4        [this, is, the, nail, in, the, coffin, for, th...
                               ...                        
10076    [jesus, lived, a, perfect, life, and, died, in...
10077    [why, don, t, nun, wear, the, blue, coat, red,...
10078    [i, ve, had, a, lack, of, appetite, for, somet...
10079    [title, say, it, all, wondering, if, anyone, c...
10080    [you, don, t, have, to, be, in, tennessee, if,...
Name: post, Length: 10081, dtype: object

In [6]:
def _lemmatize_as_verbs(tokens):
    """Lemmatize each token in the list, treating it as a verb (pos='v')."""
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens]


# Second lemmatization pass: verb forms (e.g. "lived" -> "live", "are" -> "be").
df['post'] = df['post'].apply(_lemmatize_as_verbs)
df['post']

0        [it, s, actually, just, sad, that, i, see, the...
1        [tldr, i, love, my, christian, mother, but, sh...
2        [i, literally, post, here, like, an, hour, or,...
3        [my, wife, and, i, be, in, the, middle, of, lo...
4        [this, be, the, nail, in, the, coffin, for, th...
                               ...                        
10076    [jesus, live, a, perfect, life, and, die, in, ...
10077    [why, don, t, nun, wear, the, blue, coat, red,...
10078    [i, ve, have, a, lack, of, appetite, for, some...
10079    [title, say, it, all, wonder, if, anyone, can,...
10080    [you, don, t, have, to, be, in, tennessee, if,...
Name: post, Length: 10081, dtype: object

In [7]:
# Glue each token list back into a single space-separated string,
# the form in which the posts are handed to the vectorizers.
df['post'] = df['post'].apply(' '.join)

In [8]:
# Features are the cleaned post text; targets are the 0/1 labels.
X, y = df['post'], df['label']

In [9]:
# Fit a plain bag-of-words vectorizer just to see how large the vocabulary is.
# The token pattern matches runs of word characters excluding digits and
# underscore. It is written as a raw string so that \W and \d reach the regex
# engine untouched instead of being parsed as (invalid) Python string escapes.
cv = CountVectorizer(stop_words=stopwords.words("english"), token_pattern=r"[^\W\d_]+")

# Only the learned vocabulary is used here; the document-term matrix is not needed.
cv.fit(X)

arr_ = cv.get_feature_names_out()
# number of features
len(arr_)

27345

## Modify Pipeline for TF-IDF

In [10]:
def create_pipe(cls):
    """Build a two-step pipeline: TF-IDF vectorization followed by `cls`.

    Parameters
    ----------
    cls : estimator
        Classifier instance placed as the final step, named 'cls'.

    Returns
    -------
    Pipeline
        Step 'tfidf' is a TfidfVectorizer using NLTK's English stop words and
        a token pattern matching runs of word characters without digits or
        underscores; step 'cls' is the given classifier.
    """
    return Pipeline([
        # Raw string: \W and \d are regex escapes, not valid Python string
        # escapes, and a non-raw literal triggers an invalid-escape warning.
        ('tfidf', TfidfVectorizer(stop_words=stopwords.words("english"), token_pattern=r"[^\W\d_]+")),
        ('cls', cls)
    ])

In [11]:
# Bundle the pipeline factory with the features and labels for evaluation.
class_obj = Classifiers(
    create_pipe,
    X,
    y,
    123,  # NOTE(review): presumably a random seed — confirm in classifiers.py
)

In [12]:
# Fixed seed so the estimators' internal randomness is the same on every run.
# NOTE(review): 123 mirrors the value passed to Classifiers(...) — presumably
# the project-wide seed; confirm.
RANDOM_STATE = 123

# Models to compare; each entry pairs an estimator with its display name.
classifiers = [
    {
        'cls': RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
        'name': 'Random Forest'
    },
    # The SVC entry is currently disabled; uncomment to include it in the comparison.
#     {
#         'cls':SVC(),
#         'name':'SVC'
#     },
    {
        'cls': xgb.XGBClassifier(objective="binary:logistic", random_state=RANDOM_STATE),
        'name': 'XGBoost'
    },
]

In [13]:
# Run every configured model through the shared evaluation object.
for spec in classifiers:
    estimator, label = spec['cls'], spec['name']
    class_obj.run_classifier(estimator, label)

--Train--
Accuracy: 0.9995
--Test--
Accuracy: 0.8146
Precision: 0.8095
Recall: 0.8239
Specificity: 0.8052
F1: 0.8167
---Other metrics regarding Random Forest---
Average tree depth: 234.87
--Train--
Accuracy: 0.9499
--Test--
Accuracy: 0.8225
Precision: 0.8179
Recall: 0.8309
Specificity: 0.8141
F1: 0.8243


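Looks like TF-IDF doesn't help much either: test accuracy is still only about 0.81 for Random Forest and 0.82 for XGBoost, and both models score noticeably higher on the training set than on the test set. Let's look at other methods.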