### 1. Data Preparation
* Imports
* Scraping the ~5000 most recent posts from each of the r/AWS and r/AZURE subreddits
* Create a function to scrape data
* Save data into a csv file

### 1.1 Using pushshift.io

In [1]:
import requests
import pandas as pd
from datetime import datetime
import time

In [2]:
def get_posts(params, baseurl='https://api.pushshift.io/reddit/search/submission', timeout=30):
    """
    Fetch one page of submissions from the search endpoint as a DataFrame.

    params: dict of query-string parameters sent with the request,
            e.g. {'subreddit': 'aws', 'size': 1000}
    baseurl: endpoint to query
    timeout: seconds to wait on the server before requests raises a timeout
             error; pass None to wait indefinitely

    Returns a DataFrame with one row per entry in the response's 'data' list,
    or the string 'Error code: <status>' when the HTTP status is not 200.
    """
    # Without an explicit timeout a stalled connection would block the
    # notebook cell forever.
    res = requests.get(baseurl, params=params, timeout=timeout)
    if res.status_code != 200:
        return f'Error code: {res.status_code}'
    return pd.DataFrame(res.json()['data'])

In [8]:

def scrape_reddit(subreddit, iterations, size=1000):
    """
    Scrape a subreddit page by page, walking backwards through created_utc.

    subreddit: name of subreddit e.g. "aws"
    iterations: Number of times the API will be called e.g. 5 for 5000 posts
                (at least one call is always made)
    size: number of posts requested per call

    Returns a single DataFrame holding every page fetched. When more than one
    page was collected the rows are re-indexed from 0 and the columns are
    sorted by name. Paging stops early once a page comes back empty, and an
    empty DataFrame is returned if the very first page is empty.

    Raises RuntimeError if get_posts reports a failed request.
    """
    params = {'subreddit': subreddit, 'size': size}
    pages = []

    for _ in range(max(iterations, 1)):
        batch = get_posts(params)

        # get_posts signals a non-200 response by returning a message string.
        if isinstance(batch, str):
            raise RuntimeError(
                f'Request for r/{subreddit} failed after {len(pages)} page(s): {batch}'
            )

        # Nothing older is left; asking again would only repeat the same query.
        if batch.empty:
            break

        pages.append(batch)
        # The next request asks for posts older than the last row of this page.
        params = {
            'subreddit': subreddit,
            'size': size,
            'before': batch['created_utc'].iloc[-1],
        }

    if not pages:
        return batch
    if len(pages) == 1:
        return pages[0]
    # Concatenate once at the end rather than re-copying the frame each loop.
    return pd.concat(pages, axis=0, ignore_index=True, sort=True)

# Five API calls against the "aws" subreddit; the cell displays (rows, columns).
df_aws = scrape_reddit(subreddit='aws', iterations=5)
df_aws.shape

(4998, 135)

In [9]:
# Five API calls against the "azure" subreddit; the cell displays (rows, columns).
df_az = scrape_reddit(subreddit='azure', iterations=5)
df_az.shape

(4996, 106)

In [10]:
# Subset of columns to preview; the full frame is too wide to display.
selected_columns = [
    'subreddit',
    'selftext',
    'author',
    'title',
    'score',
    'num_comments',
    'utc_datetime_str',
    'removed_by',
]
df_aws.loc[:, selected_columns].head(3)

Unnamed: 0,subreddit,selftext,author,title,score,num_comments,utc_datetime_str,removed_by
0,aws,[removed],BrianPRegan,AWS Pricing Add-on for Google Sheets,1,0,2023-03-10 16:28:11,
1,aws,[removed],Winter_Sucks_7868,Kansas AWS,1,0,2023-03-10 15:52:40,
2,aws,We have a site to site VPN between our AWS and...,silicondt,VPN - dynamic - can we put one static also?,1,0,2023-03-10 15:20:34,


In [11]:
# Write the scraped AWS posts to disk, leaving the row index out of the file.
df_aws.to_csv(path_or_buf='../data/aws_subreddit.csv', index=False)

In [12]:
# Write the scraped Azure posts to disk, leaving the row index out of the file.
df_az.to_csv(path_or_buf='../data/az_subreddit.csv', index=False)