# Data Collection & Cleaning

### Data Collection

In [1]:
# Imports for data collection
import pandas as pd
import requests
import time

# Imports for loading image data
import urllib.request
import sys
import os

In [2]:
# Subreddits of interest
subreddit_1 = 'burgers'
subreddit_2 = 'hotdogs'
subreddit_3 = 'Pizza'
subreddit_4 = 'tacos'
subreddit_5 = 'sushi'

In [3]:
# Base url
base_url = 'https://api.pushshift.io/reddit/search/submission'

In [4]:
# Present utc: 4/10/21 at 12:04 PM local time. The epoch value below is
# 2021-04-10 16:04:08 UTC, so that local time is UTC-4.
# Epoch seconds; used as the 'before' cutoff for the API requests.
present_utc = 1618070648

In [5]:
# Define a function that returns a dataframe with n images from a subreddit
def n_images(subreddit, before, extra_pages=5):
    """Collect submissions from one subreddit, paging backwards in time.

    Sends up to ``1 + extra_pages`` requests to ``base_url``, each asking
    for at most 100 submissions. The first request asks for posts created
    before ``before``; every later request asks for posts created before
    the last post of the previous page.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    before : int
        Cutoff timestamp, in the same units as the posts' 'created_utc'.
    extra_pages : int, default 5
        Number of follow-up requests after the first one. The default
        keeps the original total of 6 requests (up to 600 posts).

    Returns
    -------
    pandas.DataFrame
        Columns 'subreddit', 'title' and 'url', one row per post. Pages are
        stacked as-is, so index labels restart at 0 for every page. Empty
        (with those three columns) when the API returns no posts at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['subreddit', 'title', 'url']
    pages = []
    # Start from the caller's cutoff rather than the global present_utc
    cursor = before

    for _ in range(1 + extra_pages):
        params = {
            'subreddit': subreddit,
            'size': 100,
            'before': cursor
        }
        # A timeout keeps a stalled connection from hanging the notebook
        res = requests.get(base_url, params, timeout=30)
        res.raise_for_status()
        posts = res.json()['data']

        # An empty page means there is nothing older to fetch; stop here
        # instead of failing on posts[-1]
        if not posts:
            break

        pages.append(pd.DataFrame(posts)[columns])
        cursor = posts[-1]['created_utc']

        # Pause after every request so consecutive calls stay rate-limited
        time.sleep(1)

    if not pages:
        return pd.DataFrame(columns=columns)
    return pd.concat(pages)

In [6]:
# List of subreddits
subreddit_list = [subreddit_1, subreddit_2, subreddit_3, subreddit_4, subreddit_5]

# Define a function that returns a dataframe with n images for a list of subreddits
def subreddit_images(my_list, before):
    """Collect posts for every subreddit in my_list into one dataframe.

    Parameters
    ----------
    my_list : iterable of str
        Subreddit names; each one is passed to n_images.
    before : int
        Cutoff timestamp forwarded unchanged to n_images.

    Returns
    -------
    pandas.DataFrame
        The per-subreddit dataframes stacked in the order of my_list
        (index labels are not reset). Empty, with the columns 'subreddit',
        'title' and 'url', when my_list is empty.
    """
    # Gather first and concatenate once, so the result no longer depends on
    # which subreddit happens to come first in the list
    frames = [n_images(subreddit_name, before) for subreddit_name in my_list]
    if not frames:
        return pd.DataFrame(columns=['subreddit', 'title', 'url'])
    return pd.concat(frames)

In [7]:
# Create dataframe
df = subreddit_images(subreddit_list, present_utc)

# Check code execution
df

Unnamed: 0,subreddit,title,url
0,burgers,Am I missing something? It happened to me today.,https://i.redd.it/6xduxhnen6s61.jpg
1,burgers,Didn’t have all the ingredients that I would l...,https://i.redd.it/2e1v90z436s61.jpg
2,burgers,"Turkey burgers for mom and toddler, beef for dad",https://i.redd.it/f9iedd5pn1s61.jpg
3,burgers,Smash burger with cheese and onions.,https://i.redd.it/1e3psut8m1s61.jpg
4,burgers,Homemade Shake Shack Double SmokeShack,https://i.imgur.com/tGLbFjS.jpg
...,...,...,...
95,sushi,"Cowboy roll- Shrimp tempura, spicy crab, tuna ...",https://i.redd.it/99ptxel8ipa61.jpg
96,sushi,Tiger Roll wants to tell you something! [Art],https://i.imgur.com/goWiZhY.png
97,sushi,"Hamachi, Toro, chutoro and otoro👌🏼",https://www.reddit.com/gallery/kuu1qw
98,sushi,"Discount sushi, plus a discount NFL Team 👍",https://i.redd.it/sfc476nz2ma61.jpg


In [8]:
# Check value counts for the 'subreddit' column
df['subreddit'].value_counts()

burgers    600
sushi      600
hotdogs    600
Pizza      600
tacos      600
Name: subreddit, dtype: int64

In [9]:
# Keep only the rows whose url contains 'jpg' or 'png'
has_jpg = df['url'].str.contains('jpg')
has_png = df['url'].str.contains('png')
clean_df = df.loc[has_jpg | has_png]

In [10]:
# Drop duplicate rows from dataframe
# (with no arguments, drop_duplicates compares every column, so a row is
# removed only when its subreddit, title and url all match an earlier row)
clean_df = clean_df.drop_duplicates()

In [11]:
# Check value counts again
clean_df['subreddit'].value_counts()

burgers    445
hotdogs    386
sushi      355
tacos      349
Pizza      324
Name: subreddit, dtype: int64

### Loading Image Data

In [12]:
# Create a list of urls for each subreddit
subreddit_col = clean_df['subreddit']
burgers_url = clean_df.loc[subreddit_col == 'burgers', 'url'].tolist()
hotdogs_url = clean_df.loc[subreddit_col == 'hotdogs', 'url'].tolist()
pizza_url = clean_df.loc[subreddit_col == 'Pizza', 'url'].tolist()
tacos_url = clean_df.loc[subreddit_col == 'tacos', 'url'].tolist()
sushi_url = clean_df.loc[subreddit_col == 'sushi', 'url'].tolist()

# Names of the food classes (used for directory and file names)
food_list = ['burgers', 'hotdogs', 'pizza', 'tacos', 'sushi']

# Url lists in the same order as food_list
food_url = [burgers_url, hotdogs_url, pizza_url, tacos_url, sushi_url]

In [13]:
# Referenced: https://stackoverflow.com/questions/1274405/how-to-create-new-folder
# Define a function that makes a new directory for each food class
def new_directory(food_class):
    """Create the directory '../images/<name>' for every name in food_class.

    Paths that already exist are left untouched. Returns nothing.
    """
    for class_name in food_class:
        target_dir = f'../images/{class_name}'
        # Nothing to do when the path is already there
        if os.path.exists(target_dir):
            continue
        os.makedirs(target_dir)

In [14]:
# Create a new directory for each food class
new_directory(food_list)

In [15]:
# Referenced: https://stackoverflow.com/questions/8286352/how-to-save-an-image-locally-using-python-whose-url-address-i-already-know
# Referenced: https://www.kite.com/python/answers/how-to-catch-an-httperror-in-python
# Define a function that saves each url as either a .jpg or .png file
def url_saver(url_list, food_name):
    """Download the jpg/png urls in url_list into '../images/<food_name>/'.

    A url is treated as a jpg when it contains 'jpg', otherwise as a png
    when it contains 'png'; any other url is ignored. Files are named
    '<food_name>_<k>.<ext>', with a separate running number k for each
    extension. Urls that cannot be downloaded are skipped, and the number
    of saved images is printed at the end.

    Parameters
    ----------
    url_list : iterable of str
        Urls to download.
    food_name : str
        Food class; used as the sub-directory name and the file-name
        prefix. The directory has to exist already.
    """
    next_number = {'jpg': 1, 'png': 1}
    total_count = 0
    for url in url_list:
        # 'jpg' is tested before 'png', so a url containing both is saved
        # as a .jpg
        extension = next((ext for ext in ('jpg', 'png') if ext in url), None)
        if extension is None:
            continue
        target = f'../images/{food_name}/{food_name}_{next_number[extension]}.{extension}'
        try:
            urllib.request.urlretrieve(url, target)
        except urllib.error.URLError:
            # Best effort: dead links are expected. URLError is the parent
            # of HTTPError and ContentTooShortError, and is also what an
            # unreachable host raises, so one bad url cannot stop the run.
            continue
        next_number[extension] += 1
        total_count += 1
    print(f'The total number of images for {food_name} is: {total_count}')

In [16]:
# Save all images: food_url and food_list are parallel lists, so walk them
# together instead of keeping a separate index counter
for food, food_name in zip(food_url, food_list):
    url_saver(food, food_name)

The total number of images for burgers is: 367
The total number of images for hotdogs is: 341
The total number of images for pizza is: 290
The total number of images for tacos is: 301
The total number of images for sushi is: 297


**Progress Update:** All the image data has been successfully collected and stored as .png and .jpg files! Incorrect images need to be removed manually to guarantee quality.

### Data Cleaning

#### Removing Incorrect Food Class Images
Because the images come from user-submitted Reddit posts, not every image actually shows the food its subreddit is named after. To verify that each image belonged to its food class, all images were manually inspected, and images that did not show the expected food were manually removed.

### Limitations

Two major limitations in executing this project are explained below.

The first limitation involves data collection. Originally, I had planned on utilizing a web API to scrape image data from Google Images. Due to the uncertainty of the legality of such an approach, I opted to use a less efficient method for collecting image data (i.e. via the pushshift.io Reddit API). Legality aside, I also quickly ran into issues with funding and with limits on the number of API calls per month, since most Google Image search APIs and proxies charge monthly fees that vary with the plan [(source)](https://www.scraperapi.com/blog/best-google-image-search-apis-and-proxies/).

The second limitation deals with the sample size. My initial goal was to build a CNN utilizing a minimum of 1,000 images per class. The problem with this is that the color images took up so much storage on my local device that my processing speed slowed down, and verifying that so many images were correctly classified proved too difficult. Since the subreddits frequently include images that are not actual images of food and therefore require manual verification and removal, I reduced my sample size to at least 200 images per class, which is less than ideal.