# Web Scraping

## Introduction

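In this project, I collect post titles and bodies from the `/r/TwoSentenceHorror/` and `/r/TwoSentenceComedy/` subreddits through the Pushshift API, and combine them into a single labelled dataset for classifying which subreddit a post came from.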

In [3]:
# Basic Data Analysis Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Web-scraping
import requests
from bs4 import BeautifulSoup
import re

In [4]:
# Pushshift submission-search endpoints, one per subreddit being compared.
url_horror, url_comedy = (
    'https://api.pushshift.io/reddit/search/submission?subreddit=' + name
    for name in ('TwoSentenceHorror', 'TwoSentenceComedy')
)

In [5]:
# Preview the horror endpoint with a `size` query parameter appended.
url_horror + '&size=50'

'https://api.pushshift.io/reddit/search/submission?subreddit=TwoSentenceHorror&size=50'

In [94]:
def get_request(subreddit, size = 25):
    """Download submissions for one subreddit from the Pushshift search API.

    One request is sent per 30-day window, for windows starting 30 days ago
    and reaching back to 750 days ago (24 requests in total). The HTTP status
    of every request is printed. Windows that fail (non-200 status or a
    transport error) are reported and skipped instead of aborting the scrape.

    Parameters
    ----------
    subreddit : str
        Subreddit name, interpolated into the `subreddit=` query parameter.
    size : int, default 25
        Value sent as the `size=` query parameter. When left at 25 the
        parameter is omitted from the URL (presumably 25 is the API default -
        TODO confirm).

    Returns
    -------
    pandas.DataFrame
        Columns `subreddit`, `title`, `selftext`, with a fresh integer index.
        A field missing from the API records is filled with NaN. If no window
        yields any rows, an empty frame with those three columns is returned.
    """
    columns = ['subreddit', 'title', 'selftext']
    timeout_s = 30  # never let one stalled request hang the whole scrape

    url = f'https://api.pushshift.io/reddit/search/submission?subreddit={subreddit}'
    if size != 25:
        url += f'&size={size}'
    # NOTE(review): `score>=100` is sent verbatim, so the query string carries
    # no `score=` key and the filter may be ignored by the service. Pushshift's
    # comparison syntax appears to be `score=>100` - confirm before changing,
    # because a working filter would alter the collected dataset.
    url += '&score>=100'

    dfs = []

    for n in range(30, 750, 30):
        # Window covering posts between n+30 and n days old
        loop_url = f'{url}&after={n+30}d&before={n}d'

        try:
            req = requests.get(loop_url, timeout = timeout_s)
        except requests.RequestException as err:
            print(f'Request failed;\n error: {err}\n URL: {loop_url}')
            continue

        req_stat = req.status_code
        print(req_stat)

        if req_stat != 200:
            # Report the URL that was actually requested, including its window
            print(f'Request failed;\n request status: {req_stat}\n Invalid URL: {loop_url}')
            continue

        results_df = pd.DataFrame(req.json().get('data', []))
        if results_df.empty:
            # An empty window has no columns to select from; skip it
            continue

        # reindex (unlike .loc) tolerates records that lack one of the fields
        dfs.append(results_df.reindex(columns = columns))

    if not dfs:
        # pd.concat raises on an empty list; return an empty, well-formed frame
        return pd.DataFrame(columns = columns)

    return pd.concat(dfs, ignore_index = True)

In [95]:
# Scrape each subreddit (passing size=100 to every request) and keep the
# resulting frames in a dict keyed by subreddit name.
subreddits = ['TwoSentenceHorror', 'TwoSentenceComedy']

subreddits_dfs = dict(
    (name, get_request(name, size = 100))
    for name in subreddits
)

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


In [76]:
# Inspect the frame stored under the 'TwoSentenceHorror' key.
subreddits_dfs['TwoSentenceHorror']

Unnamed: 0,subreddit,title,selftext
0,TwoSentenceHorror,I've always hated clowns. As they gather close...,
1,TwoSentenceHorror,"Every day for three years now, I have looked i...",
2,TwoSentenceHorror,"I had out run them, they had to be miles away ...",
3,TwoSentenceHorror,I woke up dreaming. Will I ever sleep?,
4,TwoSentenceHorror,Then it happened. She was pregnant.,
...,...,...,...
2395,TwoSentenceHorror,Shadows help them out. You just notice it some...,
2396,TwoSentenceHorror,I let my daughter sleep in my bed at night.,"I still like to snuggle with her, despite the ..."
2397,TwoSentenceHorror,My best joke.,What's worse than a worm in your apple?\n\nMe.
2398,TwoSentenceHorror,I feared looking out the window,"I feared if I looked out the window, I'd see g..."


In [77]:
# Inspect the frame stored under the 'TwoSentenceComedy' key.
subreddits_dfs['TwoSentenceComedy']

Unnamed: 0,subreddit,title,selftext
0,TwoSentenceComedy,Welcome to TwoSentenceComedy!,We have 5 main rules here so let's get started...
1,TwoSentenceComedy,Shdnt,Test
2,TwoSentenceComedy,Test,Test
3,TwoSentenceComedy,Test,Test
4,TwoSentenceComedy,Just been caught snorting scarlet poster paint.,Boy is my face red!
...,...,...,...
2106,TwoSentenceComedy,I just heard about a tribe of people who worsh...,"I had to ask myself, is nothing sacred?"
2107,TwoSentenceComedy,The school was on a lockdown and the killer wa...,Somebody farted.
2108,TwoSentenceComedy,A Spanish magician is going to dissapear. He s...,and dissapears without a tres.
2109,TwoSentenceComedy,"Guys, the sub is called TWO sentence comedy.","Try to keep your posts 2 sentences, not one lo..."


In [96]:
# Stack the per-subreddit frames into one frame with a fresh 0..n-1 index.
per_subreddit_frames = list(subreddits_dfs.values())
dfs = pd.concat(per_subreddit_frames, ignore_index = True)

In [97]:
# Peek at the first rows of the combined frame.
dfs.head()

Unnamed: 0,subreddit,title,selftext
0,TwoSentenceHorror,"I was watching a movie with my 5 year old son,...",
1,TwoSentenceHorror,"“You know, I’ve never bungee jumped before– ma...","The noise of the wind, I hoped, was loud enoug..."
2,TwoSentenceHorror,I gently put down my baby in his crib before I...,"They had put out an amber alert for my child, ..."
3,TwoSentenceHorror,My cat has a very annoying habit of shoving hi...,"I never understood it until today, when he man..."
4,TwoSentenceHorror,I shuddered as I heard the screams coming from...,Why do my parents insist on killing them at th...


In [98]:
# Keep only the first occurrence of each title (index labels are preserved).
# The explicit copy makes df_drop an independent frame, so adding columns to
# it later does not touch `dfs`.
df_drop = dfs.drop_duplicates(subset = 'title').copy()

In [99]:
# (rows, columns) remaining after duplicate titles were dropped.
df_drop.shape

(3606, 3)

In [102]:
# Build a single `text` field from title + body.
# A missing title or body (NaN) would make the whole concatenation NaN and
# silently discard the other half of the post, so treat missing values as ''.
df_drop.loc[:,'text'] = (
    df_drop.loc[:,'title'].fillna('') + ' ' + df_drop.loc[:,'selftext'].fillna('')
)
df_drop.head()

Unnamed: 0,subreddit,title,selftext,text
0,TwoSentenceHorror,"I was watching a movie with my 5 year old son,...",,"I was watching a movie with my 5 year old son,..."
1,TwoSentenceHorror,"“You know, I’ve never bungee jumped before– ma...","The noise of the wind, I hoped, was loud enoug...","“You know, I’ve never bungee jumped before– ma..."
2,TwoSentenceHorror,I gently put down my baby in his crib before I...,"They had put out an amber alert for my child, ...",I gently put down my baby in his crib before I...
3,TwoSentenceHorror,My cat has a very annoying habit of shoving hi...,"I never understood it until today, when he man...",My cat has a very annoying habit of shoving hi...
4,TwoSentenceHorror,I shuddered as I heard the screams coming from...,Why do my parents insist on killing them at th...,I shuddered as I heard the screams coming from...


In [103]:
# Persist only the label and the combined text; the row index is not written.
labelled_text = df_drop[['subreddit', 'text']]
labelled_text.to_csv('./datasets/data_1.csv', index = False)

In [93]:
# Share of rows belonging to each subreddit (class balance).
df_drop['subreddit'].value_counts(normalize = True)

TwoSentenceHorror    0.510204
TwoSentenceComedy    0.489796
Name: subreddit, dtype: float64

In [16]:
# NOTE(review): this cell repeats the earlier `text` construction and carries
# an older execution count (In [16]); it looks like a leftover from a previous
# session - consider removing it.
# Missing title/body values are treated as '' so a NaN body cannot blank out
# the whole text.
df_drop.loc[:,'text'] = (
    df_drop.loc[:,'title'].fillna('') + ' ' + df_drop.loc[:,'selftext'].fillna('')
)

In [18]:
# Show just the label and the combined text columns.
df_drop[['subreddit', 'text']]

Unnamed: 0,subreddit,text
0,TwoSentenceHorror,I've always heard cross-country road trips wer...
1,TwoSentenceHorror,"""Who's there?"" came the sweet voice of my gran..."
2,TwoSentenceHorror,I took the elevator with some random strangers...
3,TwoSentenceHorror,"""Gotta stop and check the herd real quick,"" sa..."
4,TwoSentenceHorror,He refused to give me his number or his name a...
...,...,...
495,TwoSentenceComedy,"I, for one, never pay attention to “fake inter..."
496,TwoSentenceComedy,I charged out with my mighty sword and attacke...
497,TwoSentenceComedy,"“You should hit me up sometime,” my crush said..."
498,TwoSentenceComedy,As the Lady in white slowly approached me I wa...
