In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path

In [4]:
# Load the raw scrape (a pickle file) and turn it into a dataframe.
cwd = Path.cwd()
raw_path = cwd / '..' / 'raw_data' / 'moonGen_scrape_2016_final'
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# The `with` block guarantees the file handle is closed after loading.
with open(raw_path, 'rb') as raw_file:
    raw = pickle.load(raw_file)
# Transpose so that each entry of the raw data becomes one row.
df = pd.DataFrame(raw).T
# Drop the columns that are not needed for the rest of the notebook.
df = df.drop(columns=['url', 'problem_type', 'is_master', 'setter', 'user_grade'])
df.head()

Unnamed: 0,start,mid,end,grade,is_benchmark,repeats
367894,"[[5, 4], [6, 1]]","[[7, 9], [1, 10], [4, 14], [4, 7]]","[[3, 17]]",6C+,False,0
367892,"[[0, 4], [3, 5]]","[[4, 8], [0, 8], [2, 12], [3, 14]]","[[0, 17]]",6C,False,1
367889,"[[6, 3]]","[[7, 7], [6, 9], [10, 11], [6, 14]]","[[10, 17]]",6C,False,0
367885,"[[0, 4]]","[[1, 7], [3, 11], [5, 13]]","[[8, 17]]",7A,False,1
367880,"[[5, 4], [8, 3]]","[[7, 7], [7, 9], [7, 12], [4, 14]]","[[6, 17]]",6B+,False,1


In [25]:
# The start, mid and end lists must eventually become one column per hold.
# First collect every distinct [x, y] hold that appears anywhere in the data.
hold_type = ["start", "mid", "end"]
unique_holds = []
# Tuple keys of the holds already collected: gives O(1) membership checks
# instead of scanning the growing `unique_holds` list for every hold.
seen_holds = set()
for _, problem in df[hold_type].iterrows():
    for hold in hold_type:
        for position in problem[hold]:
            position_key = tuple(position)
            if position_key not in seen_holds:
                seen_holds.add(position_key)
                unique_holds.append(position)
# Order the holds by their first coordinate, then by their second.
unique_holds.sort(key=lambda x: (x[0], x[1]))

In [26]:
# Show a heading followed by the sorted list of unique holds on its own line.
print("Sorted unique holds:", unique_holds, sep="\n")

Sorted unique holds:
[[0, 4], [0, 5], [0, 8], [0, 9], [0, 10], [0, 11], [0, 12], [0, 13], [0, 14], [0, 15], [0, 17], [1, 2], [1, 3], [1, 5], [1, 6], [1, 7], [1, 8], [1, 9], [1, 10], [1, 11], [1, 12], [1, 14], [1, 15], [1, 17], [2, 4], [2, 5], [2, 6], [2, 7], [2, 8], [2, 9], [2, 10], [2, 11], [2, 12], [2, 13], [2, 14], [2, 15], [2, 17], [3, 2], [3, 4], [3, 5], [3, 6], [3, 7], [3, 8], [3, 9], [3, 10], [3, 11], [3, 12], [3, 13], [3, 14], [3, 15], [3, 16], [3, 17], [4, 5], [4, 6], [4, 7], [4, 8], [4, 9], [4, 10], [4, 11], [4, 12], [4, 13], [4, 14], [4, 15], [4, 17], [5, 4], [5, 5], [5, 6], [5, 7], [5, 8], [5, 9], [5, 10], [5, 11], [5, 12], [5, 13], [5, 14], [5, 15], [6, 1], [6, 3], [6, 5], [6, 6], [6, 7], [6, 8], [6, 9], [6, 10], [6, 11], [6, 12], [6, 13], [6, 14], [6, 15], [6, 16], [6, 17], [7, 4], [7, 6], [7, 7], [7, 8], [7, 9], [7, 10], [7, 11], [7, 12], [7, 13], [7, 14], [7, 15], [7, 17], [8, 3], [8, 4], [8, 5], [8, 6], [8, 7], [8, 8], [8, 9], [8, 10], [8, 11], [8, 12], [8, 13], [8, 14

In [27]:
# Lists cannot be used as column labels, so encode each [x, y] hold as the
# string "x_y"; these strings become the one-hot column names.
hold_cols = [f"{position[0]}_{position[1]}" for position in unique_holds]

In [28]:
# Build a one-hot encoded dataframe: one column per unique hold plus the
# metadata columns, one row per problem (indexed like `df`).
# Rows are collected as plain dicts and the dataframe is constructed once;
# growing a dataframe cell by cell with .loc re-allocates on every new row.
meta_cols = ["grade", "is_benchmark", "repeats"]
records = []
for _, problem in df.iterrows():
    record = {}
    for hold in hold_type:
        for position in problem[hold]:
            # Same "x_y" encoding that is used for the names in hold_cols.
            record[str(position[0]) + "_" + str(position[1])] = 1
    for col in meta_cols:
        record[col] = problem[col]
    records.append(record)
df_processed = pd.DataFrame(records, index=df.index, columns=hold_cols + meta_cols)
# Holds a problem does not use are missing from its record -> NaN; replace with 0.
df_processed = df_processed.fillna(0)
# Store the one-hot flags as integers rather than floats/objects.
df_processed = df_processed.astype({col: int for col in hold_cols})
    


In [33]:
# Shuffle the rows. A fixed seed makes the shuffle reproducible, so re-running
# the notebook writes the same file every time.
SHUFFLE_SEED = 42
# reset_index(drop=True) discards the original index and renumbers rows from 0.
df_save = df_processed.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
# Save the shuffled dataframe as a pickle file next to the raw data.
df_save.to_pickle(cwd / '..' / 'raw_data' / 'moonGen_scrape_2016_final_df')