# Project 3

## Part 2: Modeling

Model data for fun and profit.

### 0. Imports and Preliminaries

In [1]:
# imports
import pandas as pd
import numpy as np

# preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB

# metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# cross-validation
from sklearn.model_selection import train_test_split, cross_val_score

# pipelines, gridsearch
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# custom
import ipynb_utils as ipyutils

In [2]:
# load data
df = pd.read_json('../data/scrapes-clean.json', orient='index')

# convert time to datetime object
df['time'] = pd.to_datetime(df['time'], format=ipyutils.DATE_FMT)

In [3]:
# check that all looks good...
df.head()

Unnamed: 0,time,title,body-text,title-cc,title-wc,body-cc,body-wc,media,comments
0,2022-02-01,Saturn Return MEGATHREAD - we've been getting ...,,214,35,0,0,0,330
1,2022-06-01,"MERCURY RX INFOGRAPHIC: Taurus/Gemini, Apr-Jun...",,51,8,0,0,0,22
2,2022-08-30,CHANI app issues?,I just downloaded the CHANI app to try out and...,17,3,221,40,0,4
4,2022-08-30,Is Mercury in Aquarius in the 6th House as pow...,Not new to the deeper parts of astrology but t...,86,17,314,56,0,8
5,2022-08-30,What is the proper orb for a sextile?,What is the proper and respective orb for a se...,37,7,224,42,0,8


In [4]:
# ... and that the right datatypes are showing
df.dtypes

time         datetime64[ns]
title                object
body-text            object
title-cc              int64
title-wc              int64
body-cc               int64
body-wc               int64
media                 int64
comments              int64
dtype: object

### 0.5. Problem Statement

What characteristics of a post on Reddit are most predictive of the overall interaction on a thread (as measured by number of comments)?

Model will predict whether or not a given Reddit post will have above or below the median number of comments.

### 1. Generate Target

In [5]:
# median comments
median = np.median(df['comments'])
median

26.0

In [6]:
# target column
df['comments_gt_median'] = (df['comments'] > median).astype(int)
df['comments_gt_median'].value_counts()

0    993
1    979
Name: comments_gt_median, dtype: int64

#### Baseline
Baseline is just about **50%** (we are using median).

### 1a. Split Time Column

Might want to check by month or day of week

In [7]:
df['day'] = df['time'].apply(ipyutils.get_day_of_week)

In [8]:
df['month'] = df['time'].apply(lambda x: x.month)

In [9]:
df[['day','month']].head()

Unnamed: 0,day,month
0,1,2
1,2,6
2,1,8
4,1,8
5,1,8


In [10]:
df['weekend'] = (df['day'] > 5).astype(int)
df['weekend'].head(10)

0     0
1     0
2     0
4     0
5     0
6     0
7     1
8     1
10    1
11    1
Name: weekend, dtype: int64

In [101]:
# Doing this just to be safe as I've gotten some weird row mismatches
# later on and not sure exactly why
df.reset_index(drop=True, inplace=True)

### 2. Train-Test Split

In [102]:
col_target = 'comments_gt_median'
cols_to_drop = ['time'] # don't need this any more
X = df.drop(columns=[col_target]+cols_to_drop)
y = df[col_target]

# split to train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    test_size=0.2,
                                                    random_state=1)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1577, 11), (395, 11), (1577,), (395,))

### 3. Count Vectorize Text Fields

In [103]:
# get count vectorize tables
cv_min_df = 2
title_cv = CountVectorizer(token_pattern=ipyutils.PAT_TOKEN, min_df=cv_min_df)
body_cv = CountVectorizer(token_pattern=ipyutils.PAT_TOKEN, min_df=cv_min_df)
alltext_cv = CountVectorizer(token_pattern=ipyutils.PAT_TOKEN, min_df=cv_min_df)

# title
train_title_cv = title_cv.fit_transform(X_train['title'])
test_title_cv = title_cv.transform(X_test['title'])

# body
train_body_cv = body_cv.fit_transform(X_train['body-text'])
test_body_cv = body_cv.transform(X_test['body-text'])

# title + body
train_alltext_cv = alltext_cv.fit_transform(X_train['title'] + ' ' + X_train['body-text'])
test_alltext_cv = alltext_cv.transform(X_test['title'] + ' ' + X_test['body-text'])

(train_title_cv.shape, train_body_cv.shape, 
test_title_cv.shape, test_body_cv.shape,
train_alltext_cv.shape, test_alltext_cv.shape)

((1577, 1276),
 (1577, 4416),
 (395, 1276),
 (395, 4416),
 (1577, 4782),
 (395, 4782))

In [104]:
title_cv.get_feature_names_out()

array(['10th', '11', '11th', ..., 'zodiac', 'zodiacal', 'zodiacs'],
      dtype=object)

In [105]:
alltext_cv.get_feature_names_out()[-40:]

array(['writers', 'writing', 'written', 'wrong', 'wrote', 'wrought',
       'www', 'xo', 'xyz', 'yall', 'yang', 'yeah', 'year', 'yearly',
       'years', 'yes', 'yesterday', 'yet', 'yikes', 'yods', 'yoga',
       'york', 'you', 'young', 'younger', 'your', 'yours', 'yourself',
       'youtube', 'yr', 'yt', 'zealot', 'zeitgeist', 'zero', 'zeus',
       'zodiac', 'zodiacal', 'zodiacs', 'zone', 'zoom'], dtype=object)

### 3. Random Forest Classifier

In [106]:
title_rfc = RandomForestClassifier(n_jobs=-1, random_state=1)
gs_params = {
    'n_estimators': [200, 300],
    'min_samples_leaf': [4, 5],
    'min_samples_split': [4, 5],
    'min_impurity_decrease': [0.0001, 0.001]
}

# use gridsearch this time only to check best model params (takes a long time)
gs = GridSearchCV(title_rfc, gs_params, verbose=1, n_jobs=-1)

In [107]:
# model on titles only
gs.fit(train_title_cv, y_train)
print()
print(ipyutils.score_report(gs, 
                            (train_title_cv, y_train), 
                            (test_title_cv, y_test)))

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Model Train Score (best): 0.7285986049461002
Model Test Score (best): 0.6227848101265823
Model Best Estimator: RandomForestClassifier(min_impurity_decrease=0.0001, min_samples_leaf=5,
                       min_samples_split=4, n_estimators=300, n_jobs=-1,
                       random_state=1)



In [108]:
# save best params to use for later models
rfc_params = {
    'min_impurity_decrease': 0.0001,
    'min_samples_leaf': 5,
    'min_samples_split': 4,
    'n_estimators': 300,
    'n_jobs': -1,
    'random_state': 1
}

In [109]:
confusion_matrix(y_test, gs.predict(test_title_cv))

array([[133,  66],
       [ 83, 113]])

In [110]:
# model on body text only - use same best params from gridsearch
body_rfc = RandomForestClassifier(**rfc_params)
body_rfc.fit(train_body_cv, y_train)
print()
print(ipyutils.score_report(body_rfc, 
                            (train_body_cv, y_train), 
                            (test_body_cv, y_test)))


Model Train Score (best): 0.7381103360811667
Model Test Score (best): 0.6354430379746835



In [111]:
# model on all text
alltext_rfc = RandomForestClassifier(**rfc_params)
alltext_rfc.fit(train_alltext_cv, y_train)
print()
print(ipyutils.score_report(alltext_rfc, 
                            (train_alltext_cv, y_train), 
                            (test_alltext_cv, y_test)))


Model Train Score (best): 0.8199112238427394
Model Test Score (best): 0.6556962025316456



#### Analysis of Random Forest Classifier Score

Perhaps unsurprisingly, analyzing on the full text (body plus title) gave better prediction accuracy. However, for purposes of the problem statement, the title and body are possibly best kept separate, as reddit does not diplay the full body text by default, and searches only display titles.

Accuracy is better than baseline, but not by much. Gridsearch does not reveal too much about the possible model parameters - it just tells me that the more specific model scores better.

The model is overfit (which is probably to be expected from a decision-tree-based model).

### 3a. ExtraTrees Classifier

In [112]:
title_etc = ExtraTreesClassifier(n_jobs=-1, random_state=1)
# use same gs_params from random forest
title_etc_gs = GridSearchCV(title_etc, gs_params, verbose=1, n_jobs=-1)
title_etc_gs.fit(train_title_cv, y_train)
print(ipyutils.score_report(title_etc_gs,
                            (train_title_cv, y_train),
                            (test_title_cv, y_test)))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Model Train Score (best): 0.7805960684844642
Model Test Score (best): 0.6354430379746835
Model Best Estimator: ExtraTreesClassifier(min_impurity_decrease=0.0001, min_samples_leaf=4,
                     min_samples_split=4, n_estimators=200, n_jobs=-1,
                     random_state=1)



In [113]:
etc_params = {
    'min_impurity_decrease': 0.0001,
    'min_samples_leaf': 4,
    'min_samples_split': 4,
    'n_estimators': 200,
    'n_jobs': -1,
    'random_state': 1
}

In [114]:
body_etc = ExtraTreesClassifier(**etc_params)
body_etc.fit(train_body_cv, y_train)
print(ipyutils.score_report(body_etc,
                            (train_body_cv, y_train),
                            (test_body_cv, y_test)))

Model Train Score (best): 0.7913760304375397
Model Test Score (best): 0.660759493670886



In [115]:
alltext_etc = ExtraTreesClassifier(**etc_params)
alltext_etc.fit(train_alltext_cv, y_train)
print(ipyutils.score_report(alltext_etc,
                            (train_alltext_cv, y_train),
                            (test_alltext_cv, y_test)))

Model Train Score (best): 0.8883956880152187
Model Test Score (best): 0.6556962025316456



#### Analysis of Extra Trees Classifier Score

ExtraTrees did not fare any better than Random Forest on this data.

### 4. Other Classifiers (Quick Comparisons)

I am testing a number of other classifiers on the alltext set to see how they compare.

In [116]:
# Ada Boost
rfc = RandomForestClassifier(**rfc_params)
ada = AdaBoostClassifier(rfc, random_state=1)
ada.fit(train_alltext_cv, y_train)
ada.score(train_alltext_cv, y_train), ada.score(test_alltext_cv, y_test)

(0.9949270767279645, 0.6632911392405063)

In [117]:
# K Neighbors
knc = KNeighborsClassifier(5)
knc.fit(train_alltext_cv, y_train)
knc.score(train_alltext_cv, y_train), knc.score(test_alltext_cv, y_test)

(0.7076727964489538, 0.5645569620253165)

In [118]:
# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(train_alltext_cv, y_train)
knc.score(train_alltext_cv, y_train), knc.score(test_alltext_cv, y_test)

(0.7076727964489538, 0.5645569620253165)

In [119]:
# Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(train_alltext_cv, y_train)
mnb.score(train_alltext_cv, y_train), mnb.score(test_alltext_cv, y_test)

(0.7856689917564997, 0.6886075949367089)

### 4a. Other Features with Various Classifiers

There are a few other features I'd like to explore (word/character counts, for example).

Date/time features might not be appropriate here due to how Reddit works and the scraping process. Reddit no longer allows search by date, so I cannot get consecutive posts over time, and I am therefore trying to get as many posts I can via searches for words. Therefore, the post distribution over time that I get from my scrapes may not be the same as the actual post distribution over time, and there is no way to verify this with my current scraping process. I have therefore decided to ignore time factors.

In [120]:
df.columns

Index(['time', 'title', 'body-text', 'title-cc', 'title-wc', 'body-cc',
       'body-wc', 'media', 'comments', 'comments_gt_median', 'day', 'month',
       'weekend'],
      dtype='object')

In [121]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression())
])

cols = ['media', 'title-cc', 'body-cc']
pipe.fit(X_train[cols], y_train)
pipe.score(X_train[cols], y_train), pipe.score(X_test[cols], y_test)

(0.6011414077362079, 0.610126582278481)

In [122]:
ada = AdaBoostClassifier()
ada.fit(X_train[cols], y_train)
ada.score(X_train[cols], y_train), pipe.score(X_test[cols], y_test)

(0.6328471781864299, 0.610126582278481)