### 1. Data Preparation
* Imports
* Scrape the most recent posts (about 4,000 per subreddit in the run below) from the r/AWS and r/AZURE subreddits
* Create a function to scrape data
* Save data into a csv file

### 1.1 Using pushshift.io

In [1]:
import requests
import pandas as pd
from datetime import datetime
import time

In [2]:
def get_posts(params, baseurl='https://api.pushshift.io/reddit/search/submission', timeout=30):
    """
    Fetch one page of submissions from the Pushshift search endpoint.

    params: dict of query-string parameters e.g. {'subreddit': 'aws', 'size': 1000}
    baseurl: endpoint that is queried
    timeout: seconds to wait for the server before giving up

    Returns a DataFrame built from the 'data' list of the JSON response.
    Raises requests.HTTPError when the response status is not 200.
    """
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    res = requests.get(baseurl, params=params, timeout=timeout)
    if res.status_code != 200:
        # Raise instead of returning a message: callers use the result as a
        # DataFrame, so a string would only fail later with an unrelated error.
        raise requests.HTTPError(f'Error code: {res.status_code}', response=res)
    return pd.DataFrame(res.json()['data'])

In [26]:

def scrape_reddit(subreddit, iterations):
    """
    Scrape a subreddit in batches of up to 1000 posts.

    subreddit: name of subreddit e.g. "aws"
    iterations: maximum number of API calls e.g. 5 for roughly 5000 posts

    The first call has no 'before' filter. Every later call passes the
    'created_utc' of the last row of the previous batch as 'before'.
    Paging stops early as soon as a batch comes back empty.

    Returns a DataFrame with all collected batches stacked; the index is reset
    when more than one batch is combined. An empty first batch is returned as is.
    Raises RuntimeError if get_posts returns something other than a DataFrame.
    """
    params = {
        'subreddit': subreddit,
        'size': 1000
    }
    batches = []

    # At least one request is always made, even when iterations is below 1.
    for _ in range(max(iterations, 1)):
        batch = get_posts(params)
        # Defensive: anything other than a DataFrame (e.g. an error message)
        # is treated as a failed request instead of breaking later on indexing.
        if not isinstance(batch, pd.DataFrame):
            raise RuntimeError(f'Request for r/{subreddit} failed: {batch}')
        if batch.empty:
            # Nothing more to page through; asking again would repeat the same query.
            break
        batches.append(batch)
        params = {
            'subreddit': subreddit,
            'size': 1000,
            'before': batch['created_utc'].iloc[-1],
        }

    if not batches:
        return batch
    if len(batches) == 1:
        return batches[0]
    # Concatenate once at the end instead of re-copying the frame on every iteration.
    return pd.concat(batches, axis=0, ignore_index=True, sort=True)

# Four API calls against r/aws; the trailing expression displays (rows, columns).
df_aws = scrape_reddit(subreddit='aws', iterations=4)
df_aws.shape

(3998, 133)

In [27]:
# Four API calls against r/azure; the trailing expression displays (rows, columns).
df_az = scrape_reddit(subreddit='azure', iterations=4)
df_az.shape

(3998, 101)

In [24]:
# Subset of columns shown when previewing the scraped posts.
selected_columns = [
    'subreddit',
    'selftext',
    'author',
    'title',
    'score',
    'num_comments',
    'utc_datetime_str',
    'removed_by',
]
df_aws.loc[:, selected_columns].head(3)

Unnamed: 0,subreddit,selftext,author,title,score,num_comments,utc_datetime_str,removed_by
0,aws,[removed],santamaps,Our Lambda / RDS app is a maintenance nightmar...,1,0,2023-03-17 13:38:29,
1,aws,[removed],YeNerdLifeChoseMe,Case Studies for actual high-availability in t...,1,0,2023-03-17 13:20:32,
2,aws,I have a custom lambda authorizer to authorize...,redditor_tx,How to authenticate WebSocket connections,1,0,2023-03-17 13:19:21,


In [30]:
# Persist the r/aws scrape; index=False leaves the row index out of the file.
df_aws.to_csv(path_or_buf='../data/aws_subreddit.csv', index=False)

In [32]:
# Persist the r/azure scrape; index=False leaves the row index out of the file.
df_az.to_csv(path_or_buf='../data/az_subreddit.csv', index=False)