### Data Collection

In [1]:
# Imports
import pandas as pd
import requests
import time

In [2]:
# Five subreddits of interest; each name is later sent to the API as the
# `subreddit` query parameter (see images_200)
subreddit_1 = 'burgers'
subreddit_2 = 'hotdogs'
subreddit_3 = 'Pizza'
subreddit_4 = 'pasta'
subreddit_5 = 'sushi'

In [3]:
# Base url: Pushshift submission-search endpoint used for every API request
base_url = 'https://api.pushshift.io/reddit/search/submission'

In [4]:
# Present utc, as Unix epoch seconds: 2021-03-21 01:46:42 UTC
# (i.e. 3/20/21 at 9:46 PM in a UTC-4 local time zone)
present_utc = 1616291202

In [5]:
# Define a function that returns a dataframe with up to 200 posts from a subreddit
def images_200(subreddit, before):
    """Fetch up to 200 of the most recent submissions older than `before`.

    Two pages of 100 submissions each are requested from the endpoint in
    `base_url`. The second page starts from the `created_utc` of the last
    post on the first page.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    before : int
        Unix epoch timestamp; only submissions created before it are returned.

    Returns
    -------
    pandas.DataFrame
        Columns 'subreddit', 'title' and 'url'. Each page keeps its own
        0-based index, so index labels repeat across pages. The frame is
        empty (but has the three columns) when the API returns no posts.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['subreddit', 'title', 'url']
    page_size = 100
    n_pages = 2

    frames = []
    cursor = before  # honour the caller's timestamp rather than a global
    for _ in range(n_pages):
        params = {
            'subreddit': subreddit,
            'size': page_size,
            'before': cursor
        }
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        posts = response.json()['data']

        # Pause after every request to stay polite to the API
        time.sleep(1)

        # Nothing older is available: stop paging instead of indexing posts[-1]
        if not posts:
            break

        frames.append(pd.DataFrame(posts)[columns])
        # The oldest post on this page becomes the upper bound for the next page
        cursor = posts[-1]['created_utc']

    if not frames:
        return pd.DataFrame(columns=columns)

    # Return a dataframe (single concat instead of growing it page by page)
    return pd.concat(frames)

In [6]:
# Define a function that returns a dataframe with 200 images for a list of 5 subreddits
subreddit_list = [subreddit_1, subreddit_2, subreddit_3, subreddit_4, subreddit_5]

def subreddit_images(my_list, before):
    """Collect the posts of several subreddits into one dataframe.

    Calls `images_200` once per subreddit name, in list order, and stacks
    the results. The list may start with any subreddit and may be empty.

    Parameters
    ----------
    my_list : list of str
        Subreddit names to query.
    before : int
        Unix epoch timestamp forwarded to `images_200`.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-subreddit frames (their indexes
        are kept as-is). For an empty `my_list`, an empty frame with the
        columns 'subreddit', 'title' and 'url'.
    """
    frames = [images_200(subreddit_name, before) for subreddit_name in my_list]

    # pd.concat raises on an empty list, so handle that case explicitly
    if not frames:
        return pd.DataFrame(columns=['subreddit', 'title', 'url'])

    return pd.concat(frames)

In [7]:
# Run the collection for every subreddit in subreddit_list
# (issues the API requests through images_200)
df = subreddit_images(subreddit_list, present_utc)

In [8]:
# View dataframe
df

Unnamed: 0,subreddit,title,url
0,burgers,Pimento cheese falafel burger for St. Paddies ...,https://i.redd.it/2fbas0e9gmn61.jpg
1,burgers,Bob Belcher,https://i.redd.it/t9wvrc1u6mn61.png
2,burgers,Airfried FROZEN BURGER PATTIES🍔. Tips and tricks,https://youtu.be/q0gfAKxNex4
3,burgers,Double Smashburger on Homemade Buns,https://i.redd.it/vctmwmk78ln61.jpg
4,burgers,"Double Cheeseburger, Bacon, Pickles, Grilled &...",https://i.redd.it/o4fjy09zokn61.jpg
...,...,...,...
95,sushi,Today's dinner,https://i.redd.it/aqy98t1k5wk61.jpg
96,sushi,Great Sushi Dragon,https://i.redd.it/h5s5j7q2vvk61.png
97,sushi,Fried sushi roll (hosomaki),https://i.redd.it/dqxyj9mztvk61.jpg
98,sushi,How To Make Sushi At Home Without A Rice Cooke...,https://youtu.be/Sk9klgB571M


In [9]:
# Check value counts for the 'subreddit' column
df['subreddit'].value_counts()

burgers    200
Pizza      200
pasta      200
sushi      200
hotdogs    200
Name: subreddit, dtype: int64

In [10]:
# Keep only the rows whose url mentions 'jpg' or 'png'.
# Note: this is a substring match anywhere in the url, not a check of the
# file extension.
url_column = df['url']
mentions_jpg = url_column.str.contains('jpg')
mentions_png = url_column.str.contains('png')
clean_df = df.loc[mentions_jpg | mentions_png]

In [11]:
# Check value counts again
clean_df['subreddit'].value_counts()

burgers    143
hotdogs    125
sushi      123
Pizza       99
pasta       94
Name: subreddit, dtype: int64

In [12]:
# Drop duplicate rows from dataframe (rows whose subreddit, title and url
# all match an earlier row; the first occurrence is kept)
clean_df = clean_df.drop_duplicates()

In [13]:
# Check value counts again
clean_df['subreddit'].value_counts()

burgers    143
hotdogs    125
sushi      123
Pizza       99
pasta       94
Name: subreddit, dtype: int64

**Check:** After filtering and de-duplication, each class has between 94 and 143 images available — well short of 1000 per class.

### Load Image Data

Referenced: https://stackoverflow.com/questions/8286352/how-to-save-an-image-locally-using-python-whose-url-address-i-already-know

Referenced: https://www.kite.com/python/answers/how-to-catch-an-httperror-in-python

In [14]:
# Imports
import urllib.request
import sys

In [15]:
# Create a list of urls
url_list = list(clean_df['url'])
url_list[:5]

['https://i.redd.it/2fbas0e9gmn61.jpg',
 'https://i.redd.it/t9wvrc1u6mn61.png',
 'https://i.redd.it/vctmwmk78ln61.jpg',
 'https://i.redd.it/o4fjy09zokn61.jpg',
 'https://i.redd.it/xepjccl7hjn61.jpg']

In [16]:
# Save each url as either a .jpg or .png file.
# Files are numbered separately per extension (image_1.jpg, image_1.png, ...).
# A counter only advances after a successful download, so the numbering of
# the saved files has no gaps.
jpg_counter = 1
png_counter = 1
failed_urls = []  # (url, HTTP status code) for every download that was skipped
for url in url_list:
    # Substring test on the url; 'jpg' takes precedence over 'png'
    if 'jpg' in url:
        target = f'../images/image_{jpg_counter}.jpg'
    elif 'png' in url:
        target = f'../images/image_{png_counter}.png'
    else:
        continue

    try:
        urllib.request.urlretrieve(url, target)
    except urllib.error.HTTPError as err:
        # Best-effort download: skip the url, but keep a record of the failure
        # instead of discarding it silently
        failed_urls.append((url, err.code))
        continue

    if target.endswith('.jpg'):
        jpg_counter += 1
    else:
        png_counter += 1

In [17]:
# Count the urls that were attempted. Downloads skipped on HTTPError are
# still included, so this is an upper bound on the number of images stored.
len(url_list)

584

**Progress Update:** All the image data has been successfully collected and stored as .png and .jpg files! Incorrect images still need to be removed.

### Removing Incorrectly Classified Images from the Dataset
Given the source of the images, not all images were classified properly in the dataset. The following images have been removed manually due to blatant misclassification errors:
- image_1.png
- image_57.jpg
- image_2.png
- image_104.jpg
- image_107.jpg
- image_122.jpg
- image_3.png
- image_4.png
- image_147.jpg
- image_150.jpg
- image_153.jpg
- image_154.jpg
- image_165.jpg
- image_5.png
- image_186.jpg
- image_187.jpg
- image_194.jpg
- image_220.jpg
- image_224.jpg
- image_331.jpg
- image_10.png
- image_347.jpg
- image_350.jpg
- image_362.jpg
- image_396.jpg
- image_449.jpg
- image_13.png
- image_466.jpg