# **Analyzing WCA Competitors' Relative Performances in Big Cube Events**

Chanoe Andrew Park, Nate Ellis

***

## **Introduction**

TODO
- WCA?
- Big cubes?

***

## **Data collection/curation + parsing**

In [1]:
import os
import requests
import shutil
import zipfile

data_dir = './WCA_data'
data_zip = f'{data_dir}.zip'

# Start from a clean slate so files from an earlier export never mix with the
# fresh download.
if os.path.isdir(data_dir):
    # Delete the existing data
    shutil.rmtree(data_dir)

if os.path.exists(data_zip):
    # Delete the zipped data
    os.remove(data_zip)

# Get data from WCA
url = 'https://www.worldcubeassociation.org/results/misc/WCA_export.tsv.zip'

# Stream the archive to disk chunk by chunk instead of buffering the whole
# response in memory. raise_for_status() aborts on an HTTP error so that an
# error page is never saved (and later mis-read) as a zip file, and the
# timeout keeps the cell from hanging forever on a stalled connection.
with requests.get(url, stream=True, timeout=60) as r:
    r.raise_for_status()
    with open(data_zip, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1 << 20):
            f.write(chunk)

# Extract zipped data
with zipfile.ZipFile(data_zip, 'r') as zf:
    zf.extractall(data_dir)

In [34]:
import numpy as np
import pandas as pd

# Load the average-rank table. eventId is read as text so that ids such as
# '555' are always compared as strings.
df_avg = pd.read_csv(
    f'{data_dir}/WCA_export_RanksAverage.tsv',
    sep='\t',
    dtype={'eventId': str},
    low_memory=False,
)

# Big cube events and the column suffix used for each one in the wide table.
big_cube_events = (('555', '_5'), ('666', '_6'), ('777', '_7'))

# Drop all events that aren't the big cube events. An explicit membership test
# is used because a string range check ('555' <= id <= '777') would also let
# through any other id that happens to sort in between (e.g. '555bf').
df_avg = df_avg[df_avg['eventId'].isin([event_id for event_id, _ in big_cube_events])]

# Only keep competitors with averages in all big cube events: build one frame
# per event (columns suffixed by event, personId left as the join key) and
# inner-join them. Inner merges keep the row order of the left frame, i.e. the
# order of the 5x5 rows in the file.
df_grouped = None
for event_id, suffix in big_cube_events:
    per_event = df_avg.loc[df_avg['eventId'] == event_id].drop(columns='eventId')
    per_event = per_event.rename(
        columns=lambda col, suffix=suffix: col if col == 'personId' else f'{col}{suffix}'
    )
    if df_grouped is None:
        df_grouped = per_event
    else:
        df_grouped = df_grouped.merge(per_event, on='personId')
df_grouped.reset_index(drop=True, inplace=True)

# Fix time scale: the stored values are divided by 100
# (presumably centiseconds -> seconds; confirm against the WCA export docs).
for best_col in ('best_5', 'best_6', 'best_7'):
    df_grouped[best_col] = df_grouped[best_col] / 100.0

df_grouped

Unnamed: 0,personId,best_5,worldRank_5,continentRank_5,countryRank_5,best_6,worldRank_6,continentRank_6,countryRank_6,best_7,worldRank_7,continentRank_7,countryRank_7
0,2012PARK03,38.45,1,1,1,75.63,1,1,1,106.57,1,1,1
1,2016KOLA02,39.79,2,1,1,80.36,3,2,1,122.46,5,2,1
2,2009ZEMD01,42.09,3,1,1,81.90,5,1,1,120.63,4,1,1
3,2012BEAH01,42.95,4,2,1,81.63,4,3,1,119.92,3,1,1
4,2013NAHM01,44.78,5,1,1,86.96,7,1,1,125.08,6,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4261,2013HERM02,231.48,14091,4126,26,634.00,6117,1895,15,738.00,4837,1488,12
4262,2005GUST02,235.04,14117,4135,162,626.00,6116,1894,81,1031.00,4851,1496,69
4263,2011CHEC01,250.74,14254,4177,536,500.91,6090,1886,231,730.00,4835,1486,150
4264,2017DAOU01,267.65,14336,3614,404,463.11,6070,1579,193,722.00,4831,1233,137


***
## **Data management/representation**

***
## **Exploratory data analysis**

***
## **Hypothesis testing**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the average-rank table from the directory the export is extracted into
# (data_dir, set in the download cell) instead of a separate hard-coded
# "./WCAData" folder that nothing in this notebook creates.
df_results = pd.read_csv(f'{data_dir}/WCA_export_RanksAverage.tsv', sep='\t', low_memory=False)
df_results.head(5)

In [None]:
# Events to compare (ids as they appear in the eventId column).
event_ids = ['333', '444', '555', '666', '777']
# Number of rows kept per event, counted from the top of each event's rows in
# file order (presumably the best-ranked competitors; confirm the export is
# sorted by rank).
top_n = 3000

# For every event: keep its first top_n rows, then order them by personId.
# The frames are collected in a list and concatenated once, which avoids
# growing a DataFrame inside the loop and avoids in-place edits on a filtered
# slice of df_results.
per_event_frames = []
for item in event_ids:
    temp_df = (
        df_results[df_results.eventId == item]
        .head(top_n)
        .sort_values(by='personId')
    )
    per_event_frames.append(temp_df)
df_trimmed = pd.concat(per_event_frames)
print(len(df_trimmed))
df_trimmed.head()

In [None]:
# Keep only competitors that have exactly five rows in df_trimmed (one per
# selected event). The row count per person is computed in a single grouped
# pass and used as a mask, instead of re-scanning the whole frame once for
# every unique personId.
rows_per_person = df_trimmed.groupby('personId')['personId'].transform('size')
df_trimmed = df_trimmed[rows_per_person == 5]
print(len(df_trimmed))

In [None]:
# One frame per event, sliced out of the trimmed table.
x3_df = df_trimmed.loc[df_trimmed['eventId'] == '333']
x4_df = df_trimmed.loc[df_trimmed['eventId'] == '444']
x5_df = df_trimmed.loc[df_trimmed['eventId'] == '555']
x6_df = df_trimmed.loc[df_trimmed['eventId'] == '666']
x7_df = df_trimmed.loc[df_trimmed['eventId'] == '777']

# Put each competitor's 4x4 and 5x5 'best' values side by side. pandas aligns
# Series on their index when building or assigning columns, and the per-event
# slices carry different row labels, so the values are re-indexed by personId
# first; otherwise the second column would not line up with the first and
# would come out as NaN.
large_cube_df = pd.DataFrame({
    '444': x4_df.set_index('personId')['best'],
    '555': x5_df.set_index('personId')['best'],
})
print(large_cube_df)
print(x4_df)
print(x5_df)
print(len(df_trimmed))

***
## **Conclusion (Communication of insights attained)**