### Data Collection

In [1]:
# Imports
import pandas as pd
import requests
import time

In [2]:
# The five food subreddits whose posts are collected
(
    subreddit_1,
    subreddit_2,
    subreddit_3,
    subreddit_4,
    subreddit_5,
) = ('burgers', 'hotdogs', 'Pizza', 'pasta', 'sushi')

In [3]:
# Base url: Pushshift endpoint used to search Reddit submissions
base_url = 'https://api.pushshift.io/reddit/search/submission'

In [4]:
# Present time as a Unix timestamp: 2021-03-21 01:46:42 UTC
# (i.e. 3/20/21 at 9:46 PM in a UTC-4 timezone)
present_utc = 1616291202

In [5]:
# Define a function that returns a dataframe with up to 600 posts from a subreddit
def images_200(subreddit, before):
    """Collect up to 600 submissions from one subreddit, paging backwards in time.

    Up to six requests of 100 submissions each are sent to ``base_url``. The
    first request is anchored on ``before``; every later request is anchored on
    the ``created_utc`` of the last post of the previous page. (The function
    name is historical and kept so existing callers keep working.)

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    before : int
        Unix timestamp; only submissions created before it are requested.

    Returns
    -------
    pandas.DataFrame
        Columns 'subreddit', 'title' and 'url'. Every page keeps its own
        0-based index, so index labels repeat across pages. The frame is
        empty (with those three columns) when the API returns no posts.
    """
    columns = ['subreddit', 'title', 'url']
    n_pages = 6  # 1 initial page + 5 follow-up pages

    pages = []
    # Start from the caller's timestamp (previously the global present_utc
    # was used here and the ``before`` argument was silently ignored).
    cursor = before

    for page_number in range(n_pages):
        if page_number > 0:
            # Pause between consecutive requests to stay polite to the API
            time.sleep(2)

        params = {
            'subreddit': subreddit,
            'size': 100,
            'before': cursor
        }
        posts = requests.get(base_url, params).json()['data']

        # An empty page means there is nothing older to fetch; stop instead
        # of failing on posts[-1].
        if not posts:
            break

        pages.append(pd.DataFrame(posts)[columns])
        cursor = posts[-1]['created_utc']

    if not pages:
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of re-copying the growing frame on every page
    return pd.concat(pages)

In [6]:
# The five subreddits, in collection order, that are passed to subreddit_images (defined below)
subreddit_list = [subreddit_1, subreddit_2, subreddit_3, subreddit_4, subreddit_5]

def subreddit_images(my_list, before):
    """Collect posts for every subreddit in ``my_list`` and stack them.

    Parameters
    ----------
    my_list : list of str
        Subreddit names; each one is passed to ``images_200``.
    before : int
        Unix timestamp forwarded unchanged to ``images_200``.

    Returns
    -------
    pandas.DataFrame
        The per-subreddit dataframes concatenated in list order (original
        index labels are kept). For an empty ``my_list`` an empty frame with
        the columns 'subreddit', 'title' and 'url' is returned.
    """
    # Collect first, concatenate once. The previous version only initialised
    # its accumulator when the name equalled the global subreddit_1, so any
    # list that did not start with it failed with UnboundLocalError.
    frames = [images_200(subreddit_name, before) for subreddit_name in my_list]

    if not frames:
        return pd.DataFrame(columns=['subreddit', 'title', 'url'])
    return pd.concat(frames)

In [7]:
# Run the full collection for every subreddit in the list
df = subreddit_images(my_list=subreddit_list, before=present_utc)

In [8]:
# View dataframe (only the first and last rows are rendered for a long frame)
df

Unnamed: 0,subreddit,title,url
0,burgers,Pimento cheese falafel burger for St. Paddies ...,https://i.redd.it/2fbas0e9gmn61.jpg
1,burgers,Bob Belcher,https://i.redd.it/t9wvrc1u6mn61.png
2,burgers,Airfried FROZEN BURGER PATTIES🍔. Tips and tricks,https://youtu.be/q0gfAKxNex4
3,burgers,Double Smashburger on Homemade Buns,https://i.redd.it/vctmwmk78ln61.jpg
4,burgers,"Double Cheeseburger, Bacon, Pickles, Grilled &...",https://i.redd.it/o4fjy09zokn61.jpg
...,...,...,...
95,sushi,🔥 Is flaming sushi a thing where you’re at? 🔥 ...,https://www.reddit.com/gallery/lcc9ko
96,sushi,These Soy Sauce Dishes Bring an Image to Life ...,https://puloh.com/blog/these-soy-sauce-dishes-...
97,sushi,Anyone craving for sushi today?,https://i.redd.it/ip4y94ckeef61.jpg
98,sushi,Does anyone know how to cure ikura (or any fis...,https://www.reddit.com/r/sushi/comments/lc7olk...


In [9]:
# Number of collected posts per subreddit
df.loc[:, 'subreddit'].value_counts()

burgers    600
hotdogs    600
Pizza      600
pasta      600
sushi      600
Name: subreddit, dtype: int64

In [10]:
# Keep only rows whose url contains 'jpg' or 'png'
# (a substring match anywhere in the url, not a strict file-extension test)
has_jpg = df['url'].str.contains('jpg')
has_png = df['url'].str.contains('png')
clean_df = df.loc[has_jpg | has_png]

In [11]:
# Posts per subreddit that remain after the jpg/png filter
clean_df.loc[:, 'subreddit'].value_counts()

burgers    468
hotdogs    380
sushi      351
Pizza      312
pasta      272
Name: subreddit, dtype: int64

In [12]:
# Keep the first occurrence of every row and discard exact repeats
clean_df = clean_df[~clean_df.duplicated()]

In [13]:
# Posts per subreddit that remain after dropping duplicate rows
clean_df.loc[:, 'subreddit'].value_counts()

burgers    468
hotdogs    380
sushi      351
Pizza      311
pasta      272
Name: subreddit, dtype: int64

**Check:** After filtering and de-duplication, each class has between 272 and 468 candidate image URLs available.

### Load Image Data

Referenced: https://stackoverflow.com/questions/8286352/how-to-save-an-image-locally-using-python-whose-url-address-i-already-know

Referenced: https://www.kite.com/python/answers/how-to-catch-an-httperror-in-python

In [14]:
# Imports
import urllib.request
import sys

In [15]:
# Build one list of urls per subreddit; the labels are spelled exactly as they
# appear in clean_df['subreddit'] (note the capital P in 'Pizza')
food_url = [
    clean_df.loc[clean_df['subreddit'] == label, 'url'].tolist()
    for label in ('burgers', 'hotdogs', 'Pizza', 'pasta', 'sushi')
]

# Give each per-class url list its own name as well
burgers_url, hotdogs_url, pizza_url, pasta_url, sushi_url = food_url

# Class names in the same order as food_url; they become the folder and file names
food_list = ['burgers', 'hotdogs', 'pizza', 'pasta', 'sushi']

In [16]:
# Define a function that saves each url as either a .jpg or .png file
def url_saver(url_list, food_name):
    """Download the images behind ``url_list`` into ``../images/<food_name>/``.

    A url is saved as ``<food_name>_<n>.jpg`` when it contains 'jpg', otherwise
    as ``<food_name>_<n>.png`` when it contains 'png'; urls containing neither
    are ignored. The two extensions are numbered independently, starting at 1,
    and a number is only consumed by a successful download, so the saved files
    are numbered without gaps.

    Parameters
    ----------
    url_list : list of str
        Urls to download.
    food_name : str
        Class name; used both as the sub-folder and as the file-name prefix.

    Returns
    -------
    None
        The total number of saved images is printed.
    """
    import os  # local import so the function does not depend on another cell

    target_dir = f'../images/{food_name}'
    # urlretrieve does not create missing folders, so make sure the target
    # folder exists before the first download.
    os.makedirs(target_dir, exist_ok=True)

    next_number = {'jpg': 1, 'png': 1}
    total_count = 0

    for url in url_list:
        # Substring test in the same priority order as before: 'jpg' wins
        # when a url happens to contain both.
        extension = next((ext for ext in ('jpg', 'png') if ext in url), None)
        if extension is None:
            continue

        destination = f'{target_dir}/{food_name}_{next_number[extension]}.{extension}'
        try:
            urllib.request.urlretrieve(url, destination)
        except urllib.error.HTTPError:
            # Dead or forbidden link: skip it without consuming a file number
            continue

        next_number[extension] += 1
        total_count += 1

    print(f'The total number of images for {food_name} is: {total_count}')

In [17]:
# Download every class's images, pairing each url list with its class name
for food, food_name in zip(food_url, food_list):
    url_saver(food, food_name)

The total number of images for burgers is: 370
The total number of images for hotdogs is: 332
The total number of images for pizza is: 278
The total number of images for pasta is: 231
The total number of images for sushi is: 291


**Progress Update:** All the image data has been successfully collected and stored as .png and .jpg files! Incorrect images still need to be removed.

### Removing Incorrectly Classified Images from the Dataset
Given the source of the images, not all images were classified properly in the dataset. The following images have been removed manually due to blatant misclassification errors:
- Burgers: burgers_1.png, burgers_2.png, burgers_104.jpg, burgers_107.jpg, burgers_122.jpg
- Hot dogs: hotdogs_27.jpg, hotdogs_30.jpg, hotdogs_31.jpg, hotdogs_42.jpg, hotdogs_66.jpg, hotdogs_71.jpg, hotdogs_88.jpg
- Pizza: pizza_1.jpg
- Pasta: pasta_3.png, pasta_23.jpg, pasta_39.jpg, pasta_54.jpg
- Sushi: sushi_14.jpg, sushi_3.png, sushi_84.jpg