#### Imports

In [31]:
# Import necessary packages.
from PIL import Image
import pandas as pd
import os
import urllib.request
import math

#### Create DataFrame for Outputs

In [32]:
# Column layout of the output file: the entry date followed by one 0/1 flag
# per possible size rating.
header_names = [
    "date",
    "small",
    "large",
    "very_large",
    "historic",
]

# Start from an empty frame that carries only those headers; rows are added
# to it later.
df = pd.DataFrame(columns=header_names)

In [33]:
# Seasons to process, kept as two-digit strings: "17", "18", "19".
years = tuple(str(season) for season in range(17, 20))

In [34]:
for year in years:
    # Each season's scraped ratings are read from their own file,
    # e.g. "sizes_17.csv".
    season_table = pd.read_csv(f"sizes_{year}.csv")

    # One-hot layout of the four flags stored after the date:
    #   small      -> 1, 0, 0, 0
    #   large      -> 0, 1, 0, 0
    #   very large -> 0, 0, 1, 0
    #   historic   -> 0, 0, 0, 1
    # NOTE: none of the columns read below maps to "historic", so that flag
    # is always written as 0 by this loop.
    for _, record in season_table.iterrows():
        # Keep everything after the first 40 characters of the raw "date"
        # field. Presumably this strips a fixed-length prefix; confirm
        # against the scraped data.
        date = record["date"][40:]

        # Column names are as defined by the data source. A non-missing value
        # in a column means the entry was given that rating.
        has_small = pd.notna(record["size_10"])
        has_large_1 = pd.notna(record["size_12"])
        has_large_2 = pd.notna(record["size_20"])
        has_very_large = pd.notna(record["size_23"])

        # The largest rating present wins; "large" is backed by two columns.
        if has_very_large:
            flags = [0, 0, 1, 0]
        elif has_large_1 or has_large_2:
            flags = [0, 1, 0, 0]
        elif has_small:
            flags = [1, 0, 0, 0]
        else:
            # Nothing was scraped for this entry, so leave it out entirely.
            continue

        # Append the encoded entry as the next row of the output frame.
        df.loc[len(df)] = [date, *flags]


#### Sort DataFrame by Date

In [35]:
# Parse the date strings into datetimes so that the sort below is
# chronological rather than lexicographic. `infer_datetime_format` is
# deliberately not passed: pandas 2.0 deprecated it (format inference is now
# the default behaviour), so passing it only triggers a warning.
df["date"] = pd.to_datetime(df["date"])
df.sort_values(by="date", ascending=True, inplace=True)

# Write the date-ordered table to "sizes.csv" without the index column.
df.to_csv("sizes.csv", index=False)