## Imports for Data Scraping

In [1]:
import numpy as np
import pandas as pd

import requests
import time
import datetime as dt

import praw
from psaw import PushshiftAPI
import datetime as dt

from bs4 import BeautifulSoup
import regex as re

## PSAW

https://github.com/dmarx/psaw

In [2]:
#instantiating the PushshiftApi
api = PushshiftAPI()

In [3]:
#telling it when to start looking for posts
start_epoch=int(dt.datetime(2017, 1, 1).timestamp())

#taking posts from
politics = list(api.search_submissions(after=start_epoch,
                            subreddit='politics',
                            filter=['url','author', 'title', 'subreddit'],
                            limit=30000))

scifi = list(api.search_submissions(after=start_epoch,
                            subreddit='scifi',
                            filter=['url','author', 'title', 'subreddit'],
                            limit=30000))

In [4]:
#putting the scrapes into dataframes
politics = pd.DataFrame(politics)
scifi    = pd.DataFrame(scifi)

In [5]:
#saving them to csv
politics.to_csv('./politics_scrape')
scifi.to_csv('./scifi_scrape')

## Saved Data

In [6]:
politics = pd.read_csv('./politics_scrape')

In [7]:
politics.title.head()

0    You’ll Never Believe It, but the Shutdown Is M...
1    Rand Paul headed to Canada for surgery, but wi...
2    Tiffany Trump Not Fazed Dating Someone Who Gre...
3    Why the power elite continues to dominate Amer...
4    Sen. Bernie Sanders says 'we will end' Big Pha...
Name: title, dtype: object

In [8]:
scifi = pd.read_csv('./scifi_scrape')

In [9]:
scifi.title.head()

0                                        New to Sci-Fi
1    Pushing Ice by Al Reynolds and Seveneves by Ne...
2      Any "Top lists" in this sub like in r/fantasy ?
3    You can watch Star Trek discovery with Kingon ...
4    An open-source rocket could reshape society in...
Name: title, dtype: object

## Data Cleaning

In [10]:
#combining them into one dataframe
df = pd.DataFrame.append(politics, scifi)

In [11]:
#removing duplicates
df = df[df.duplicated(subset='title',keep='first')==False]

In [12]:
#dropping unncessary columns
df.drop(columns=['url','author','d_','created_utc','created','Unnamed: 0'],axis=1,inplace=True)

In [13]:
#binarizing the subreddit column so 1 is politics and 0 is science
df['subreddit'] = np.where(df['subreddit'] == 'politics', 1, 0)

In [14]:
def cleaning_titles (title):
    
    # remove non-letters        
    just_letters = re.sub("[^a-zA-Z]", " ", title)

    #make lowercase
    lower_letters = just_letters.lower()
    
    return  lower_letters

In [15]:
clean_titles = []

for i in df['title']:
    clean_titles.append(cleaning_titles(i))

df['title'] = clean_titles

In [16]:
df.head()

Unnamed: 0,subreddit,title
0,1,you ll never believe it but the shutdown is m...
1,1,rand paul headed to canada for surgery but wi...
2,1,tiffany trump not fazed dating someone who gre...
3,1,why the power elite continues to dominate amer...
4,1,sen bernie sanders says we will end big pha...


## Vectorizing Data and Train/Test Split

In [17]:
#imports

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.model_selection import train_test_split

In [18]:
# setting X and y variables
X    = df.drop(columns='subreddit',axis=1)
y    = df.drop(columns='title',axis=1)

In [19]:
#checking shape
print(X.shape)
print(y.shape)

(40964, 1)
(40964, 1)


In [20]:
#train, test split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size    = .3,
                                                    random_state = 42,
                                                    stratify     = y,
                                                    shuffle      = True)

In [21]:
#initializing count vectorizer
vect = CountVectorizer(stop_words='english',
                      ngram_range=(1,10),
                      max_features=5000)

In [22]:
X_train_vect = vect.fit_transform(X_train['title'])
X_test_vect  = vect.transform(X_test['title'])

In [23]:
print(X_train_vect.shape)
print(X_test_vect.shape)
print(y_train.shape)
print(y_test.shape)

(28674, 5000)
(12290, 5000)
(28674, 1)
(12290, 1)


## Fitting a Model

In [24]:
#importing models and metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score

  from numpy.core.umath_tests import inner1d


In [30]:
#fitting random forest
rf = RandomForestClassifier(n_estimators    = 50,
                            max_features    = 500,
                            max_depth       = 30, 
                            n_jobs          = -1)

rf_model = rf.fit(X_train_vect, y_train)

#fitting logistic regression

log = LogisticRegression(random_state = 42)
model = log.fit(X_train_vect, y_train)

print('Logistic Regression w/ Count Vectorizer Train score:',        model.score(X_train_vect, y_train))
print('Logistic Regression w/ Count Vectorizer Test score:' ,        model.score(X_test_vect,  y_test))
print('Number of features:', X_train_vect.shape[1] )
print('')
print('Random Forest w/ Count Vectorizer Train score:',        rf_model.score(X_train_vect, y_train))
print('Random Forest w/ Count Vectorizer Test score:' ,        rf_model.score(X_test_vect,  y_test))
print('Number of features:', X_train_vect.shape[1] )

  import sys
  y = column_or_1d(y, warn=True)


Logistic Regression w/ Count Vectorizer Train score: 0.980923484689963
Logistic Regression w/ Count Vectorizer Test score: 0.9613506916192026
Number of features: 5000

Random Forest w/ Count Vectorizer Train score: 0.8470391295250053
Random Forest w/ Count Vectorizer Test score: 0.8397070789259561
Number of features: 5000


## TF-IDF and Hashing

In [26]:
#imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [27]:
#instantiating TFIDF
vect_tfidf = TfidfVectorizer(stop_words='english',ngram_range=(1,10),max_features=5000)

X_train_vect_tfidf = vect_tfidf.fit_transform(X_train['title'])
X_test_vect_tfidf  = vect_tfidf.transform(X_test['title'])

#fitting to random forest
rf = RandomForestClassifier(n_estimators    = 50,
                            max_features    = 200,
                            max_depth       = 30, 
                            n_jobs          = -1)
rf.fit(X_train_vect_tfidf, y_train)

#fitting to logistic regression
log = LogisticRegression(random_state = 42)
model = log.fit(X_train_vect_tfidf, y_train)

print('Logistic Regression w/ TFIDF Train score:',        model.score(X_train_vect_tfidf, y_train))
print('Logistic Regression w/ TFIDF Test score:' ,        model.score(X_test_vect_tfidf,  y_test))
print('Number of features:', X_train_vect_tfidf.shape[1] )
print('')
print('Random Forest w/ TFIDF Train score:',        rf.score(X_train_vect_tfidf,y_train))
print('Random Forest w/ TFIDF Test score:',         rf.score(X_test_vect_tfidf,y_test))
print('Number of features:', X_train_vect_tfidf.shape[1] )

  if sys.path[0] == '':
  y = column_or_1d(y, warn=True)


Logistic Regression w/ TFIDF Train score: 0.976250261560996
Logistic Regression w/ TFIDF Test score: 0.9624084621643613
Number of features: 5000

Random Forest w/ TFIDF Train score: 0.8592104345400013
Random Forest w/ TFIDF Test score: 0.8523189585028479
Number of features: 5000


In [28]:
#most valuable features from TFIDF
sorted(list(zip(rf.feature_importances_,vect_tfidf.get_feature_names())),reverse=True)[:10]

[(0.21074187305325398, 'trump'),
 (0.053298702449732956, 'sci fi'),
 (0.05243591851048748, 'sci'),
 (0.04753360836923412, 'fi'),
 (0.03410735932958589, 'scifi'),
 (0.03146243585317323, 'sps'),
 (0.03076854205762591, 'shutdown'),
 (0.027191348855218837, 'space'),
 (0.026672902793294314, 'star'),
 (0.018856504899946377, 'fiction')]

In [29]:
#instantiating hashing vectorizer
vect_hash = HashingVectorizer(stop_words='english',ngram_range=(1,10),n_features=5000)

X_train_vect_hash = vect_hash.fit_transform(X_train['title'])
X_test_vect_hash = vect_hash.transform(X_test['title'])

#fitting to random forest
rf = RandomForestClassifier(n_estimators    = 50,
                            max_features    = 200,
                            max_depth       = 30, 
                            n_jobs          = -1)
rf.fit(X_train_vect_hash, y_train)

#fitting to logistic regression
log = LogisticRegression(random_state = 42)
model = log.fit(X_train_vect_hash, y_train)

print('Logistic Regression w/ Hash Vectorizer Train score:',        model.score(X_train_vect_hash, y_train))
print('Logistic Regression w/ Hash Vectorizer Test score:' ,        model.score(X_test_vect_hash,  y_test))
print('Number of features:', X_train_vect_hash.shape[1] )
print('')
print('Random Forest w/ Hash Vectorizer Train score:',        rf.score(X_train_vect_hash,y_train))
print('Random Forest w/ Hash Vectorizer Test score:',         rf.score(X_test_vect_hash,y_test))
print('Number of features:', X_train_vect_hash.shape[1] )

  if sys.path[0] == '':
  y = column_or_1d(y, warn=True)


Logistic Regression w/ Hash Vectorizer Train score: 0.9364581153658367
Logistic Regression w/ Hash Vectorizer Test score: 0.9006509357200977
Number of features: 5000

Random Forest w/ Hash Vectorizer Train score: 0.8587570621468926
Random Forest w/ Hash Vectorizer Test score: 0.8304312449145647
Number of features: 5000
