# Modeling

In [21]:
import requests
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix 
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

In [22]:
reddit = pd.read_csv('../datasets/reddit_final.csv')
reddit = reddit.drop(columns=['Unnamed: 0']) 

In [23]:
reddit.head(3)

Unnamed: 0,subreddit,selftext,gilded,title,subreddit_name_prefixed,hide_score,upvote_ratio,total_awards_received,is_reddit_media_domain,score,author_premium,edited,author_flair_richtext,is_self,author_flair_type,domain,allow_live_comments,archived,no_follow,is_crosspostable,over_18,awarders,can_gild,locked,treatment_tags,is_robot_indexable,num_comments,send_replies,author_patreon_flair,subreddit_subscribers,created_utc,num_crossposts,retrieved_utc,updated_utc,author_cakeday,subreddit_id_t5_2r0cn,removed_by_automod_filtered,removed_by_deleted,removed_by_reddit,removed_by_nan,thumbnail_nsfw,thumbnail_self,gildings_{},title_word_count,selftext_word_count,subreddit_id_t5_2txi0n,subreddit_id_t5_37roo,subreddit_id_t5_5iegdf,subreddit_id_t5_62obsy,subreddit_id_t5_6anqhn,subreddit_id_t5_6r00uj,subreddit_type_restricted,subreddit_type_user,thumbnail_other,gildings_'gid_2': 0,gildings_'gid_3': 0},gildings_{'gid_1': 0,gildings_{'gid_1': 1},gildings_{'gid_1': 4}
0,1,['none'],0,"['Im', 'f', 'love', 'fiancés', 'm', 'best', 'f...",1.0,1.0,1.0,0,1,1,1.0,1,1,1,1,1,1.0,1,1,1,1,1,1,0,1,1,1,1,1,8871630,1682548523,0,1682548537,1682548538,0,1.0,0.0,0,0.0,0,0,1,1,10,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,['none'],0,"['relationship', 'run', 'course']",1.0,1.0,1.0,0,1,1,1.0,1,1,1,1,1,1.0,1,1,1,1,1,1,0,1,1,1,1,1,8871630,1682548522,0,1682548537,1682548538,0,1.0,0.0,0,0.0,0,0,1,1,6,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,"['TLDR', 'mother', 'want', 'go', 'contact', 'a...",0,"['mom', 'f', 'want', 'm', 'cut', 'contact', 'a...",1.0,1.0,1.0,0,1,1,1.0,1,1,1,1,1,1.0,1,1,0,1,1,1,1,1,0,1,1,1,8871627,1682548515,0,1682548528,1682548529,0,1.0,0.0,0,0.0,1,0,1,1,15,700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
reddit.shape

(5998, 59)

## Train-Test-Split

In [25]:
X = reddit[[col for col in reddit.columns if col != 'subreddit']]
y = reddit[['subreddit']]

In [26]:
X['subreddit'] = reddit['subreddit']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [28]:
X_train.isna().sum()

selftext                       0
gilded                         0
title                          0
subreddit_name_prefixed        1
hide_score                     0
upvote_ratio                   0
total_awards_received          0
is_reddit_media_domain         0
score                          0
author_premium                 0
edited                         0
author_flair_richtext          0
is_self                        0
author_flair_type              0
domain                         0
allow_live_comments            0
archived                       0
no_follow                      0
is_crosspostable               0
over_18                        0
awarders                       0
can_gild                       0
locked                         0
treatment_tags                 0
is_robot_indexable             0
num_comments                   0
send_replies                   0
author_patreon_flair           0
subreddit_subscribers          0
created_utc                    0
num_crossp

In [29]:
null_indexes = X_train['subreddit_name_prefixed'].isnull()
X_train[null_indexes]

Unnamed: 0,selftext,gilded,title,subreddit_name_prefixed,hide_score,upvote_ratio,total_awards_received,is_reddit_media_domain,score,author_premium,edited,author_flair_richtext,is_self,author_flair_type,domain,allow_live_comments,archived,no_follow,is_crosspostable,over_18,awarders,can_gild,locked,treatment_tags,is_robot_indexable,num_comments,send_replies,author_patreon_flair,subreddit_subscribers,created_utc,num_crossposts,retrieved_utc,updated_utc,author_cakeday,subreddit_id_t5_2r0cn,removed_by_automod_filtered,removed_by_deleted,removed_by_reddit,removed_by_nan,thumbnail_nsfw,thumbnail_self,gildings_{},title_word_count,selftext_word_count,subreddit_id_t5_2txi0n,subreddit_id_t5_37roo,subreddit_id_t5_5iegdf,subreddit_id_t5_62obsy,subreddit_id_t5_6anqhn,subreddit_id_t5_6r00uj,subreddit_type_restricted,subreddit_type_user,thumbnail_other,gildings_'gid_2': 0,gildings_'gid_3': 0},gildings_{'gid_1': 0,gildings_{'gid_1': 1},gildings_{'gid_1': 4},subreddit
3028,['none'],0,"['asshole', 'quit', 'job']",,0.0,1.0,0,1,1,0.0,1,0,1,0,0,1.0,0,1,1,1,0,0,1,1,1,0,1,0,0,1594916801,0,1622903836,1679805880,0,0.0,0.0,1,0.0,0,0,0,1,8,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0


I believe this post is from the AITA subreddit, so I will fill the NaN value with 0

In [30]:
X_train['subreddit_name_prefixed'] = X_train['subreddit_name_prefixed'].fillna(0)

## Vectorizing  

In [31]:
tfidf = TfidfVectorizer()

In [32]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

## Preparing for RFE 

In [33]:
X_train_tfidf.shape

(59, 57)

In [34]:
y_train.shape

(4198, 1)

In [35]:
# Subset y_train to match the number of samples in X_train_tfidf
# RFE will not work due to ValueError: Found input variables with inconsistent numbers of samples: [59, 4198]

y_train_subset = y_train[:59]

In [36]:
# Set y_train_subset to array to ravel to get rid of DataConversionWarning: A column-vector 
# y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), 
# for example using ravel().

y_train_np = np.array(y_train_subset).ravel()

In [37]:
X_train_subset = X_train_tfidf[:59]

In [52]:
X_train_tfidf = reddit[reddit.columns.tolist()]

## RFE 

In [50]:
logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=40)
rfe_fit = rfe.fit(X_train_tfidf, y_train_np)

In [54]:
rfe_df = pd.DataFrame(columns=['Column', 'Ranking'])
#rfe_df['Column'] = X_train_tfidf.columns
rfe_df['Ranking'] = rfe_fit.ranking_

In [42]:
rfe_df.sort_values(by='Ranking', ascending=False).head(20)

Unnamed: 0,Column,Ranking
12,,18
14,,17
13,,16
0,,15
1,,14
2,,13
56,,12
30,,11
47,,10
21,,9


In [57]:
X.columns

Index(['selftext', 'gilded', 'title', 'subreddit_name_prefixed', 'hide_score',
       'upvote_ratio', 'total_awards_received', 'is_reddit_media_domain',
       'score', 'author_premium', 'edited', 'author_flair_richtext', 'is_self',
       'author_flair_type', 'domain', 'allow_live_comments', 'archived',
       'no_follow', 'is_crosspostable', 'over_18', 'awarders', 'can_gild',
       'locked', 'treatment_tags', 'is_robot_indexable', 'num_comments',
       'send_replies', 'author_patreon_flair', 'subreddit_subscribers',
       'created_utc', 'num_crossposts', 'retrieved_utc', 'updated_utc',
       'author_cakeday', 'subreddit_id_t5_2r0cn',
       'removed_by_automod_filtered', 'removed_by_deleted',
       'removed_by_reddit', 'removed_by_nan', 'thumbnail_nsfw',
       'thumbnail_self', 'gildings_{}', 'title_word_count',
       'selftext_word_count', 'subreddit_id_t5_2txi0n',
       'subreddit_id_t5_37roo', 'subreddit_id_t5_5iegdf',
       'subreddit_id_t5_62obsy', 'subreddit_id_t5_6anq

In [56]:
reddit.columns

Index(['subreddit', 'selftext', 'gilded', 'title', 'subreddit_name_prefixed',
       'hide_score', 'upvote_ratio', 'total_awards_received',
       'is_reddit_media_domain', 'score', 'author_premium', 'edited',
       'author_flair_richtext', 'is_self', 'author_flair_type', 'domain',
       'allow_live_comments', 'archived', 'no_follow', 'is_crosspostable',
       'over_18', 'awarders', 'can_gild', 'locked', 'treatment_tags',
       'is_robot_indexable', 'num_comments', 'send_replies',
       'author_patreon_flair', 'subreddit_subscribers', 'created_utc',
       'num_crossposts', 'retrieved_utc', 'updated_utc', 'author_cakeday',
       'subreddit_id_t5_2r0cn', 'removed_by_automod_filtered',
       'removed_by_deleted', 'removed_by_reddit', 'removed_by_nan',
       'thumbnail_nsfw', 'thumbnail_self', 'gildings_{}', 'title_word_count',
       'selftext_word_count', 'subreddit_id_t5_2txi0n',
       'subreddit_id_t5_37roo', 'subreddit_id_t5_5iegdf',
       'subreddit_id_t5_62obsy', 'subredd

In [58]:
[i for i in X if i not in reddit] 

[]

In [59]:
[i for i in reddit if i not in X]

[]

## Baseline 

In [63]:
baseline_accuracy = reddit['subreddit'].value_counts(normalize=True).max()
baseline_accuracy

0.5001667222407469

In [65]:
y_train.subreddit.value_counts(normalize=True)

0    0.500238
1    0.499762
Name: subreddit, dtype: float64

The baseline model predicts AITA because it has more posts, and is accurate around 50.01% of the time 