In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import requests
from bs4 import BeautifulSoup
import regex as re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import time

## Reading

In [2]:
def _submission_search_url(subreddit, size, before=None):
    """Return a Pushshift submission-search URL for one subreddit.

    `before` is an optional relative age such as '2d'; when given it is
    placed between the subreddit and size query parameters.
    """
    age_filter = '' if before is None else f'&before={before}'
    return (
        'https://api.pushshift.io/reddit/search/submission'
        f'?subreddit={subreddit}{age_filter}&size={size}'
    )


# First pull of 500 submissions per subreddit (original note: results sort by date).
fitfirst500 = _submission_search_url('fitness', 500)
fit30first500 = _submission_search_url('fitness30plus', 500)
# Second fitness pull: 500 submissions from before 2 days ago.
fitsecond500 = _submission_search_url('fitness', 500, before='2d')
# Second fitness30plus pull: 500 submissions from before 63 days (about 2 months) ago.
fit30second500 = _submission_search_url('fitness30plus', 500, before='63d')

In [3]:
# Fetch the four result pages. requests applies no timeout unless one is
# passed, so each call gets an explicit limit (in seconds) rather than
# risking a kernel that hangs forever on a stalled connection.
resfit1 = requests.get(fitfirst500, timeout=30)
resfit2 = requests.get(fitsecond500, timeout=30)
resfit301 = requests.get(fit30first500, timeout=30)
resfit302 = requests.get(fit30second500, timeout=30)

# Surface HTTP errors (4xx/5xx) here instead of failing later while parsing JSON.
for _response in (resfit1, resfit2, resfit301, resfit302):
    _response.raise_for_status()

In [4]:
# Decode each response body from JSON, in the same order the requests were made.
resultsfit1, resultsfit2, resultsfit301, resultsfit302 = (
    response.json()
    for response in (resfit1, resfit2, resfit301, resfit302)
)

In [5]:
# Report how many submissions each pull returned: one line per subreddit,
# first pull then second pull.
for first_pull, second_pull in ((resultsfit1, resultsfit2), (resultsfit301, resultsfit302)):
    print(len(first_pull['data']), len(second_pull['data']))

500 500
500 500


In [6]:
# Build one DataFrame per pull from the 'data' entry of its decoded response.
fit1_df, fit2_df, fit301_df, fit302_df = (
    pd.DataFrame(payload['data'])
    for payload in (resultsfit1, resultsfit2, resultsfit301, resultsfit302)
)

In [7]:
# Stack the two pulls for each subreddit into a single frame with a fresh 0..n-1 index.
# sort=True is passed explicitly to avoid a pandas FutureWarning (per the original note).
fit_pulls = [fit1_df, fit2_df]
fit30_pulls = [fit301_df, fit302_df]
fit_df = pd.concat(fit_pulls, sort=True, ignore_index=True)
fit30_df = pd.concat(fit30_pulls, sort=True, ignore_index=True)

### Addressing Duplicates

In [8]:
# Fit: drop rows whose (title, selftext) pair repeats an earlier row.
# NOTE(review): the original note records 77 duplicates dropped; the count
# depends on when the data was pulled -- confirm against the shape displayed.
dedupe_keys = ['title', 'selftext']
fit_df = fit_df.drop_duplicates(subset=dedupe_keys)
fit_df.shape

(997, 65)

In [9]:
# Fit30: drop rows whose (title, selftext) pair repeats an earlier row.
# The original note records 38 duplicates dropped for this pull.
fit30_df = fit30_df.drop_duplicates(
    subset=['title', 'selftext'],
)
fit30_df.shape

(962, 76)

To pull replacement posts for the dropped duplicates, we go back 5 days for r/Fitness and 120 days for r/fitness30plus.

In [10]:
# Replacement pulls: 77 fitness posts from before 5 days ago and
# 38 fitness30plus posts from before 120 days ago.
# NOTE(review): the sizes are hard-coded from the duplicate counts noted
# earlier in the notebook; re-check them whenever the data is re-pulled.
_replacement_template = (
    'https://api.pushshift.io/reddit/search/submission'
    '?subreddit={subreddit}&before={before}&size={size}'
)
fitreplace = _replacement_template.format(subreddit='fitness', before='5d', size=77)
fit30replace = _replacement_template.format(subreddit='fitness30plus', before='120d', size=38)

In [11]:
# Fetch the replacement pages. requests applies no timeout unless one is
# passed, so each call gets an explicit limit (in seconds) rather than
# risking a kernel that hangs forever on a stalled connection.
resfit = requests.get(fitreplace, timeout=30)
resfit30 = requests.get(fit30replace, timeout=30)

# Surface HTTP errors (4xx/5xx) here instead of failing later while parsing JSON.
for _response in (resfit, resfit30):
    _response.raise_for_status()

In [12]:
# Decode both replacement responses, then print how many submissions each
# holds on one space-separated line (fitness first, fitness30plus second).
resultsfit, resultsfit30 = resfit.json(), resfit30.json()
print(*(len(payload['data']) for payload in (resultsfit, resultsfit30)))

77 38


In [13]:
# Build a DataFrame from the 'data' entry of each replacement response.
fitreplace_df, fit30replace_df = (
    pd.DataFrame(payload['data'])
    for payload in (resultsfit, resultsfit30)
)

In [14]:
# Append the replacement rows to each subreddit's frame and renumber the index.
# NOTE(review): no de-duplication is re-run after this step, so a replacement
# row that repeats an existing post would be kept -- confirm this is acceptable.
fit_df = pd.concat((fit_df, fitreplace_df), sort=True, ignore_index=True)
fit30_df = pd.concat((fit30_df, fit30replace_df), sort=True, ignore_index=True)

In [15]:
# Combine both subreddits into the single working frame, fitness rows first.
both_subreddits = [fit_df, fit30_df]
df = pd.concat(both_subreddits, sort=True, ignore_index=True)

## Cleaning

In [16]:
# Interpret created_utc as a count of seconds and convert it to pandas datetimes.
created_seconds = df['created_utc']
df['created_utc'] = pd.to_datetime(created_seconds, unit='s')

In [17]:
# Blank out the '[removed]' placeholder and any NaN bodies so that selftext
# can be concatenated with the title without producing NaN.
# The result is assigned back to the column on purpose: calling
# replace/fillna with inplace=True on `df['selftext']` is a chained in-place
# operation on a column selection, which emits a FutureWarning in pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is enabled.
df['selftext'] = df['selftext'].replace('[removed]', '').fillna('')

In [18]:
# Build the modelling text: the title, a single space, then the post body.
title_text = df['title']
body_text = df['selftext']
df['alltext'] = title_text + ' ' + body_text

In [19]:
# Order rows from newest to oldest by creation time.
df = df.sort_values('created_utc', ascending=False)

In [20]:
# Encode the subreddit label as the numeric target:
#   'Fitness'       -> 0
#   'fitness30plus' -> 1
# Series.map turns any value missing from the mapping into NaN, so re-running
# this cell on the already-encoded column would blank it out.
subreddit_codes = {'Fitness': 0, 'fitness30plus': 1}
df['subreddit'] = df['subreddit'].map(subreddit_codes)

In [21]:
# Raise the display row limit to 2,000, then preview the first five rows of
# the text, label and timestamp columns.
pd.set_option('display.max_rows', 2_000)
cols = [
    'title',
    'selftext',
    'alltext',
    'subreddit',
    'created_utc',
]
df.loc[:, cols].head()

Unnamed: 0,title,selftext,alltext,subreddit,created_utc
0,the best exercises for skinny legs?,,the best exercises for skinny legs?,0,2020-01-28 20:49:17
1,Lifting &amp; Swimming for building mass?,,Lifting &amp; Swimming for building mass?,0,2020-01-28 20:42:41
2,Are arm sleeves a thing in fitness?,,Are arm sleeves a thing in fitness?,0,2020-01-28 20:40:46
3,"Muscle builders of Reddit, what are some recom...",,"Muscle builders of Reddit, what are some recom...",0,2020-01-28 20:38:16
4,What are some really helpful features that can...,,What are some really helpful features that can...,0,2020-01-28 20:33:23


Save the DataFrame to CSV so that modeling uses the same set of posts on every run.

In [22]:
# Persist the cleaned frame for the modelling notebooks. No index argument is
# passed, so to_csv also writes the row index as the first column.
csv_path = './datasets/df.csv'
df.to_csv(csv_path)