# World Cube Association (WCA) Data Analysis

<center>
    <img 
         src='https://upload.wikimedia.org/wikipedia/commons/thumb/8/85/World_Cube_Association.svg/800px-World_Cube_Association.svg.png'
         style="width: 20%;"
         ></center>

The data is available from the [WCA results export page](https://www.worldcubeassociation.org/results/misc/export.html).

## Imports and Pre-Settings 

In [19]:
import glob
import collections

import numpy as np
import pandas as pd

## File reading 

In [2]:
# glob.glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the load order (and the per-table printout that iterates over it) is reproducible.
files = sorted(glob.glob("data/*.tsv"))

In [3]:
# Read every TSV file into a DataFrame, keyed by a short lowercase table name.
dfs = {}
for file in files:
    # Key = text before the first '.tsv', reduced to the part after the last '_'.
    stem = file.partition('.tsv')[0]
    file_name = stem.rpartition('_')[2].lower()
    dfs[file_name] = pd.read_csv(file, low_memory=False, sep='\t')

In [4]:
# Bind each loaded table to its own module-level name. The targets on the left
# and the keys on the right are listed in the same order.
(
    formats,
    championships,
    scrambles,
    competitions,
    championship,
    countries,
    roundtypes,
    events,
    results,
    continents,
    ranksaverage,
    persons,
    rankssingle,
    rounds,
) = (
    dfs[table_key]
    for table_key in (
        'formats',
        'championships',
        'scrambles',
        'competitions',
        'championship',
        'countries',
        'roundtypes',
        'events',
        'results',
        'continents',
        'ranksaverage',
        'persons',
        'rankssingle',
        'rounds',
    )
)

## Checking out dataframes 

In [5]:
# Print a short summary (title-cased key, shape, column names) for every table,
# with a 100-character rule between entries.
for table_name, table in dfs.items():
    heading = f'{table_name.title()}:'
    separator = '-' * 100
    print(heading)
    print(' Shape:', table.shape)
    print(' Columns', list(table.columns))
    print(separator)

Formats:
 Shape: (5, 7)
 Columns ['id', 'name', 'sort_by', 'sort_by_second', 'expected_solve_count', 'trim_fastest_n', 'trim_slowest_n']
----------------------------------------------------------------------------------------------------
Championships:
 Shape: (453, 3)
 Columns ['id', 'competition_id', 'championship_type']
----------------------------------------------------------------------------------------------------
Scrambles:
 Shape: (1100059, 8)
 Columns ['scrambleId', 'competitionId', 'eventId', 'roundTypeId', 'groupId', 'isExtra', 'scrambleNum', 'scramble']
----------------------------------------------------------------------------------------------------
Competitions:
 Shape: (6696, 20)
 Columns ['id', 'name', 'cityName', 'countryId', 'information', 'year', 'month', 'day', 'endMonth', 'endDay', 'eventSpecs', 'wcaDelegate', 'organiser', 'venue', 'venueAddress', 'venueDetails', 'external_website', 'cellName', 'latitude', 'longitude']
------------------------------------------

#### Merging Person Info

In [6]:
# Start from the raw tables held in `dfs` on every run. The previous version
# renamed the rank columns in place and merged into the already-merged `persons`,
# so running the cell a second time produced '*_single_single' columns and failed
# on the merge key. Building new frames here makes the cell safe to re-run and
# leaves the tables stored in `dfs` untouched.
rankssingle = dfs['rankssingle'].add_suffix('_single')
ranksaverage = dfs['ranksaverage'].add_suffix('_average')

# NOTE(review): the second merge uses the default how='inner', so any row the
# outer merge kept without a matching average-rank row is dropped again.
# Also, if the rank tables hold one row per person *and* event (TODO confirm),
# joining on the person id alone pairs every single-rank row with every
# average-rank row of that person. Confirm the intended join before relying on
# row counts.
persons = (
    dfs['persons']
    .merge(rankssingle, left_on='id', right_on='personId_single', how='outer')
    .merge(ranksaverage, left_on='id', right_on='personId_average')
    .drop(columns=['personId_single', 'personId_average'])
)

In [50]:
# Wrap each scramble string in '(start)' / '(end)' markers, split it on single
# spaces and store the resulting string array in `scramble_sequence`.
scrambles.loc[:, 'scramble_sequence'] = scrambles['scramble'].apply(
    lambda moves: np.array(
        ('(start) ' + moves + ' (end)').split(' '),
        dtype=str,
    )
)

In [51]:
# Preview the first five rows, including the new `scramble_sequence` column.
scrambles.head(n=5)

Unnamed: 0,scrambleId,competitionId,eventId,roundTypeId,groupId,isExtra,scrambleNum,scramble,scramble_sequence
0,1,GaleriesDorianOpen2014,pyram,1,A,0,1,U R' L' B U B' R' B' L' U L' u' r' b',"[(start), U, R', L', B, U, B', R', B', L', U, ..."
1,2,GaleriesDorianOpen2014,pyram,1,A,0,2,B' L' U' B U' L U' R B' R' L' u r,"[(start), B', L', U', B, U', L, U', R, B', R',..."
2,3,GaleriesDorianOpen2014,pyram,1,A,0,3,R' U R' L' B' U L' B' R L' U l',"[(start), R', U, R', L', B', U, L', B', R, L',..."
3,4,GaleriesDorianOpen2014,pyram,1,A,0,4,L R' L U B R L R B R L' u' l,"[(start), L, R', L, U, B, R, L, R, B, R, L', u..."
4,5,GaleriesDorianOpen2014,pyram,1,A,0,5,B' U R L' R B L' U' B' R' U' l' r',"[(start), B', U, R, L', R, B, L', U', B', R', ..."


In [89]:
# For the row with index label 1061780, report the position of the first '2'
# in its scramble string (str.find returns -1 when there is none).
scrambles.loc[scrambles.index == 1061780, 'scramble'].apply(
    lambda scramble_text: scramble_text.find('2')
)

1061780    4
Name: scramble, dtype: int64

In [8]:
def group_states(x):
    """Join every element of ``x`` with its successor as ``'current>next'``.

    ``x`` is any sliceable sequence of strings. The result is a list with one
    entry per adjacent pair, so it is empty when ``x`` has fewer than two
    elements.
    """
    return [current + '>' + following for current, following in zip(x[:-1], x[1:])]

# Element-wise wrapper around group_states. It is not used in the cells shown
# here; kept in case a later cell relies on it.
vec_group_states = np.vectorize(group_states, otypes=[list])

# Derive the transition pairs from the raw `scramble` text instead of from the
# current contents of `scramble_sequence`. Overwriting the column with a function
# of itself meant that running this cell twice paired the pairs again
# ('a>b>b>c'); computing from `scramble` yields the same column on every run.
# The '(start)' / '(end)' markers and the single-space split match the
# tokenisation used when `scramble_sequence` is first created.
scrambles.loc[:, 'scramble_sequence'] = scrambles.scramble.apply(
    lambda moves: group_states(('(start) ' + moves + ' (end)').split(' '))
)

In [48]:
# Flatten the per-scramble transition lists into one long list.
transitions = [
    pair
    for pair_list in scrambles.scramble_sequence
    for pair in pair_list
]

# Count each distinct transition and convert the counts to relative frequencies.
transition_counts = collections.Counter(transitions)
total_transitions = sum(transition_counts.values())

# Sort ascending by rate, then reverse in place so the most frequent come first
# (ties therefore end up in reverse first-seen order).
transition_rates = sorted(
    ((pair, count / total_transitions) for pair, count in transition_counts.items()),
    key=lambda kv: kv[1],
)
transition_rates.reverse()

In [49]:
# Show only the most frequent transitions instead of dumping every entry;
# the list is already sorted from highest to lowest rate.
transition_rates[:30]

[('R-->D--', 0.0118665353255244),
 ('R++>D++', 0.011837115801532919),
 ('R-->D++', 0.011828940270655286),
 ('R++>D--', 0.011810731133700557),
 ('D-->R--', 0.009475842870136555),
 ('D++>R--', 0.009472219623497603),
 ('D-->R++', 0.009468998959818535),
 ('D++>R++', 0.009459677615901234),
 ("R>U'", 0.0049987487411927625),
 ('D++>U', 0.004734158832789367),
 ("D-->U'", 0.004732393661349877),
 ("U'>R", 0.004491834858859524),
 ("U'>R'", 0.004488707098940429),
 ("R'>U'", 0.004398652387606502),
 ('U2>R2', 0.004149515471278632),
 ('R2>U2', 0.004099192601293202),
 ("U>R'", 0.004028957358753536),
 ("U'>R2", 0.003950701424936193),
 ("R2>U'", 0.003914066375586799),
 ('R>U', 0.0037445479815558787),
 ('R>U2', 0.0037140136124447186),
 ("U2>R'", 0.0037121245693252656),
 ('U>R', 0.0035045466016253636),
 ("R'>U", 0.0034429204408432056),
 ("R'>U2", 0.0034268480903678587),
 ('U2>R', 0.0033950130686170757),
 ('F2>R2', 0.0032957608850458114),
 ('R2>F2', 0.003274485924011971),
 ('U2>F', 0.0032557503324173952),


In [7]:
# Preview the first five rows of the championships table.
championships.head(n=5)

Unnamed: 0,id,competition_id,championship_type
0,328,ACARO2016,HK
1,170,AFSwedishCubeOpen2016,SE
2,326,AlbanianOpen2018,AL
3,414,AlbanianOpen2019,AL
4,296,Andorra2017,AD


In [12]:
# Preview a single row of the competitions table to inspect its columns.
competitions.head(n=1)

Unnamed: 0,id,name,cityName,countryId,information,year,month,day,endMonth,endDay,eventSpecs,wcaDelegate,organiser,venue,venueAddress,venueDetails,external_website,cellName,latitude,longitude
0,100Merito2018,100º Mérito 2018,"Santarém, Pará",Brazil,Qualquer pessoa que resolva o Cubo Mágico em m...,2018,4,14,4,14,222 333 333bf 333ft 444 pyram skewb,[{Rafael de Andrade Cinoto}{mailto:rcinoto@wor...,[{Davi de Andrade Iácono}{mailto:cuber.paragom...,[Mérito Pré-Vestibulares](https://www.facebook...,"Avenida Mendonça Furtado, 1120 - CEP: 68040050",Próximo ao Colégio São Francisco,,100º Mérito 2018,-2422498,-54712597


In [8]:
# Preview a single row of the round-types table to inspect its columns.
roundtypes.head(n=1)

Unnamed: 0,id,rank,name,cellName,final
0,0,19,Qualification round,Qualification,0


In [9]:
# Preview a single row of the events table to inspect its columns.
events.head(n=1)

Unnamed: 0,id,name,rank,format,cellName
0,222,2x2x2 Cube,20,time,2x2x2 Cube


In [10]:
# Preview a single row of the results table to inspect its columns.
results.head(n=1)

Unnamed: 0,competitionId,eventId,roundTypeId,pos,best,average,personName,personId,personCountryId,formatId,value1,value2,value3,value4,value5,regionalSingleRecord,regionalAverageRecord
0,LyonOpen2007,333,1,15,1968,2128,Etienne Amany,2007AMAN01,Cote d_Ivoire,a,1968,2203,2138,2139,2108,AfR,AfR


In [11]:
# Preview a single row of the continents table to inspect its columns.
continents.head(n=1)

Unnamed: 0,id,name,recordName,latitude,longitude,zoom
0,_Africa,Africa,AfR,213671,16984850,3
