# API and Data Gathering

In [1]:
import requests
import pandas as pd
import numpy as np

In [2]:
# Function to pull data for whatever subreddit I want
def pull(subreddit, pages=2, size=500):
    """Fetch submissions for one subreddit from the Pushshift search API.

    Requests are made one page at a time. After each page, the
    ``created_utc`` of the last post returned is passed as ``before`` on the
    next request, so every page asks for posts older than the previous one.
    The HTTP status code and the number of posts are printed per request.

    Parameters
    ----------
    subreddit : str
        Subreddit name, e.g. ``'Python'``.
    pages : int, default 2
        Maximum number of requests to make.
    size : int, default 500
        Value sent as the ``size`` query parameter on every request.

    Returns
    -------
    pandas.DataFrame
        Columns ``subreddit``, ``selftext`` and ``title``. Each page keeps its
        own 0-based row labels, so index values repeat across pages. If the
        first page has no posts, an empty frame with those columns is returned.

    Raises
    ------
    requests.HTTPError
        If a response has a 4xx/5xx status.
    """
    url = 'https://api.pushshift.io/reddit/search/submission'    # set url as pushapi
    columns = ['subreddit', 'selftext', 'title']

    # set params, chose minimum number of comments to weed out some weak posts
    # NOTE(review): Pushshift documents comparison filters as '>N' / '<N';
    # confirm that '=>10' is interpreted as intended.
    params = {
        'subreddit': subreddit,
        'size': size,
        'num_comments': '=>10'
    }

    frames = []
    for _ in range(pages):
        # use request to pull data from reddit
        res = requests.get(url, params=params)
        print(res.status_code)
        res.raise_for_status()    # fail loudly instead of on a confusing KeyError below

        # set posts as the data portion of the pull
        posts = res.json()['data']
        print(len(posts))

        # nothing (more) to fetch; also protects posts[-1] below
        if not posts:
            break

        frames.append(pd.DataFrame(posts))

        # next request: only posts older than the last post of this page
        params['before'] = int(posts[-1]['created_utc'])

    if not frames:
        return pd.DataFrame(columns=columns)

    # DataFrame.append was removed in pandas 2.0; concat is the supported way
    # to stack the pages. Keep only the columns we want.
    return pd.concat(frames)[columns].copy()
    
    

In [3]:
# Pull r/Python submissions into a dataframe; pull() prints the HTTP status
# code and the number of posts returned for each request it makes
df_python = pull('Python')

200
500
200
500


In [4]:
# Preview the first rows of the r/Python pull
df_python.head()

Unnamed: 0,subreddit,selftext,title
0,Python,"I want to open multiple SSH session, one to my...",Paramiko Multiple SSH session
1,Python,,"[100% OFF] Decision Trees, Random Forests, Ada..."
2,Python,I have to write a script for 10 children in cl...,Need some help from the smart people .
3,Python,,I made Tinder Bot written in Python Selenium
4,Python,Streamlit: [https://www.streamlit.io/](https:/...,Is it possible to host a streamlit app on Vercel?


In [5]:
# Count repeated titles: True marks a row whose title already appeared in an earlier row
df_python.duplicated(subset='title').value_counts()

False    959
True      41
dtype: int64

In [6]:
# Check column dtypes and non-null counts (selftext may be missing for some posts)
df_python.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 499
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   subreddit  1000 non-null   object
 1   selftext   996 non-null    object
 2   title      1000 non-null   object
dtypes: object(3)
memory usage: 31.2+ KB


# For r/golang

In [7]:
# Same pull as for r/Python, this time for r/golang
df_go = pull('golang')

200
500
200
500


In [8]:
# Preview the first rows of the r/golang pull
df_go.head()

Unnamed: 0,subreddit,selftext,title
0,golang,I'm trying to download a set of images to my s...,Speeding up image download to file using gorou...
1,golang,My team has written a wrapper package for the ...,Mocking mongo-go-driver Find()
2,golang,,Sorting in Go - Don't Reinvent This Wheel
3,golang,"Hello everyone! I'm trying to learn go, and to...",How to print nicely a nested struct
4,golang,,SDNS v1.0.0 🎉released. Performance improvement...


In [9]:
# Count repeated titles: True marks a row whose title already appeared in an earlier row
df_go.duplicated(subset='title').value_counts()

False    984
True      16
dtype: int64

# Append Python and golang dataframes and save to .csv


In [14]:
# Stack the two subreddit pulls, lowercase the text columns, and turn the
# subreddit name into a binary target column named 'python'
# (1 = r/Python, 0 = r/golang).
# DataFrame.append was removed in pandas 2.0, so the frames are combined with
# pd.concat; like append, it keeps each frame's original row labels.
df = (
    pd.concat([df_python, df_go])
    .assign(
        title=lambda frame: frame['title'].str.lower(),
        selftext=lambda frame: frame['selftext'].str.lower(),
    )
    .rename(columns={'subreddit': 'python'})
    .assign(python=lambda frame: frame['python'].map({'Python': 1, 'golang': 0}))
)

df.head()

Unnamed: 0,python,selftext,title
0,1,"i want to open multiple ssh session, one to my...",paramiko multiple ssh session
1,1,,"[100% off] decision trees, random forests, ada..."
2,1,i have to write a script for 10 children in cl...,need some help from the smart people .
3,1,,i made tinder bot written in python selenium
4,1,streamlit: [https://www.streamlit.io/](https:/...,is it possible to host a streamlit app on vercel?


In [15]:
# Replace missing selftext values with empty strings
missing_body = df['selftext'].isnull()
df['selftext'] = np.where(missing_body, '', df['selftext'])

# Confirm that no nulls remain in selftext
df['selftext'].isnull().value_counts()

False    2000
Name: selftext, dtype: int64

In [18]:
# df.to_csv('./data/python.csv', index=False)