# Project 3

## Part 2: Modeling

Model data for fun and profit.

### 0. Imports and Preliminaries

In [13]:
# imports
import pandas as pd
import numpy as np

# preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# cross-validation
from sklearn.model_selection import train_test_split, cross_val_score

# pipelines, gridsearch
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# custom
import ipynb_utils as ipyutils

In [2]:
# load data
df = pd.read_json('../data/scrapes-clean.json', orient='index')

# convert time to datetime object
df['time'] = pd.to_datetime(df['time'], format=ipyutils.DATE_FMT)

In [5]:
# check that all looks good...
df.head()

Unnamed: 0,time,title,body-text,title-cc,title-wc,body-cc,body-wc,media,comments
0,2022-02-01,Saturn Return MEGATHREAD - we've been getting ...,,214,36,0,0,0,330
1,2022-06-01,"MERCURY RX INFOGRAPHIC: Taurus/Gemini, Apr-Jun...",,51,8,0,0,0,22
2,2022-08-30,CHANI app issues?,I just downloaded the CHANI app to try out and...,17,3,221,50,0,4
4,2022-08-30,Is Mercury in Aquarius in the 6th House as pow...,Not new to the deeper parts of astrology but t...,86,17,314,59,0,8
5,2022-08-30,What is the proper orb for a sextile?,What is the proper and respective orb for a se...,37,8,224,45,0,8


In [6]:
# ... and that the right datatypes are showing
df.dtypes

time         datetime64[ns]
title                object
body-text            object
title-cc              int64
title-wc              int64
body-cc               int64
body-wc               int64
media                 int64
comments              int64
dtype: object

### 0.5. Problem Statement

What characteristics of a post on Reddit are most predictive of the overall interaction on a thread (as measured by number of comments)?

Model will predict whether or not a given Reddit post will have above or below the median number of comments.

### 1. Generate Target

In [10]:
# median comments
median = np.median(df['comments'])
median

26.0

In [12]:
# target column
df['comments_gt_median'] = (df['comments'] > median).astype(int)
df['comments_gt_median'].value_counts()

0    993
1    979
Name: comments_gt_median, dtype: int64

#### Baseline
Baseline is just about **50%** (we are using median).

### 2. Train-Test Split

In [27]:
col_target = 'comments_gt_median'
cols_to_drop = ['time']
X = df.drop(columns=[col_target]+cols_to_drop)
y = df[col_target]

# split to train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    random_state=1)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1479, 8), (493, 8), (1479,), (493,))

In [33]:
# get count vectorize tables
cv = CountVectorizer(token_pattern=ipyutils.PAT_TOKEN)

# title
train_title_cv = cv.fit_transform(X_train['title'])
test_title_cv = cv.transform(X_test['title'])

# body
train_body_cv = cv.fit_transform(X_train['body-text'])
test_body_cv = cv.transform(X_test['body-text'])

# title + body
alltext
train_alltext_cv = cv.fit_transform(X_train['title'] + ' ' + X_train['body-text'])
test_alltext_cv = cv.transform(X_test['title'] + ' ' + X_train['body-text'])

(train_title_cv.shape, train_body_cv.shape, 
 test_title_cv.shape, test_body_cv.shape,
 train_alltext_cv.shape, test_alltext_cv.shape)

ValueError: np.nan is an invalid document, expected byte or unicode string.

### 3. Random Forest Classifier