# Clean Tour de France stage data from 2012 to 2016  
This notebook imports stage data scraped from [letour.com](http://www.letour.com) for the years 2012 to 2016, cleans it, and prepares it to be joined with data from 2008 to 2011.

In [1]:
import pickle
import pandas as pd
import pprint as pp
import numpy as np

### Cleaning Module ###

In [2]:
def clean_stage(raw_dict):
    """Return the stage number stored under ``raw_dict['stage']`` as an int.

    Surrounding whitespace is trimmed first; the label ``'prologue'`` is
    reported as stage 0.
    """
    label = raw_dict['stage'].strip()

    return 0 if label == 'prologue' else int(label)


def clean_year(raw_dict):
    """Return ``raw_dict['year']`` as an int, ignoring surrounding whitespace."""
    return int(raw_dict['year'].strip())


def clean_distance(raw_dict):
    """Return ``raw_dict['distance']`` as a float.

    Whitespace is trimmed, then any leading/trailing ``k``/``m`` characters
    (e.g. a ``km`` unit suffix) are removed before conversion.
    """
    trimmed = raw_dict['distance'].strip()

    return float(trimmed.strip('km'))


def clean_start_finish(raw_dict):
    """Split ``raw_dict['start_finish']`` on ``/`` into ``(start, finish)``.

    Only the first two pieces are used, each with surrounding whitespace
    removed. Raises ``IndexError`` when the text contains no ``/``.
    """
    pieces = raw_dict['start_finish'].split('/')

    return (pieces[0].strip(), pieces[1].strip())


def clean_date(raw_dict):
    """Extract ``(week_day, month, day)`` strings from ``raw_dict['date_stage']``.

    Only the text before the first ``-`` is used. It is split on whitespace
    into weekday, month and day tokens; a comma is trimmed from the weekday
    and the ordinal letters (ND/ST/TH/RD) are trimmed from the day. The day
    is returned as a string, not an int.
    """
    tokens = raw_dict['date_stage'].split('-')[0].strip().split()

    week_day = tokens[0].strip(',')
    month = tokens[1]
    day = tokens[2]
    # same order as the ordinal suffixes were originally stripped
    for ordinal_letters in ('ND', 'ST', 'TH', 'RD'):
        day = day.strip(ordinal_letters)

    return (week_day, month, day)


def clean_winning_time(raw_dict):
    """Convert ``raw_dict['winning_time']`` into a duration in minutes.

    Accepts whitespace-separated text such as ``"3h 29' 03''"`` (hours,
    minutes, seconds) or ``"14' 56''"`` (minutes, seconds).

    Returns:
        float: total minutes, or ``np.nan`` when the key is missing or the
        text cannot be parsed.
    """
    try:
        parts = raw_dict['winning_time'].split()

        if 'h' in parts[0]:
            hours = int(parts[0].strip('h'))
            minute_part, second_part = parts[1], parts[2]
        else:
            hours = 0
            minute_part, second_part = parts[0], parts[1]

        minutes = int(minute_part.strip("'"))
        seconds = int(second_part.strip("'"))
    # Only the failures a missing/malformed value can cause are treated as
    # "no time available"; KeyboardInterrupt and friends now propagate.
    except (KeyError, AttributeError, IndexError, ValueError, TypeError):
        return np.nan

    return hours*60.0 + minutes + seconds/60.0


def clean_climbs(raw_dict):
    """Parse ``raw_dict['climbs']`` into ``{index: climb_dict}``.

    The text is split on newlines and the first 8 lines are discarded
    (presumably page boilerplate from the scrape -- TODO confirm). The
    remaining lines are read as alternating pairs:

    * even lines: text whose part before the first ``-`` holds the climb's
      start position (``Km`` characters trimmed), stored as ``'start'``;
    * odd lines: a detail line split on ``-`` giving ``'length'`` (float),
      ``'gradient'`` (float, ``%`` removed) and ``'category'`` (string).

    Returns:
        dict: ``{0: {'start': ..., 'length': ..., 'gradient': ...,
        'category': ...}, 1: {...}, ...}``. A climb whose detail line could
        not be parsed only has ``'start'``.

    Raises:
        ValueError: if a start position cannot be converted to float.
    """
    lines = raw_dict['climbs'].split('\n')[8:]

    cleaned_climbs = {}

    for index, header in enumerate(lines[::2]):
        start_text = header.split('-')[0].strip('Km').strip()
        cleaned_climbs[index] = {'start': float(start_text)}

    index = 0

    for details in lines[1::2]:
        fields = details.split('-')

        try:
            length = float(fields[0].split()[0])
            gradient = float(fields[1].split()[-1].strip('%'))
            category = fields[2].split()[-1]
            climb = cleaned_climbs[index]
        except (IndexError, ValueError, KeyError):
            # Best-effort: skip a malformed detail line.
            # NOTE(review): index is deliberately not advanced here (as
            # before), so the next parseable detail line is attached to this
            # climb -- confirm that matches the layout of the scraped pages.
            continue

        climb['length'] = length
        climb['gradient'] = gradient
        climb['category'] = category
        index += 1

    return cleaned_climbs


def clean_sprints(raw_dict):
    """Parse ``raw_dict['sprints']`` into ``{index: {'start': float}}``.

    The text is split on newlines and the first 8 lines are dropped; each
    remaining line contributes one sprint whose start position is the part
    before the first ``-`` with ``Km`` characters and whitespace trimmed.
    """
    sprint_lines = raw_dict['sprints'].split('\n')[8:]

    return {
        position: {'start': float(line.split('-')[0].strip('Km').strip())}
        for position, line in enumerate(sprint_lines)
    }


def build_clean_dict(raw_dict):
    """Run every ``clean_*`` helper on one raw stage record.

    Returns a dict with the keys ``year``, ``stage``, ``week_day``,
    ``month``, ``day``, ``start``, ``finish``, ``distance``,
    ``winning_time``, ``climbs`` and ``sprints`` (in that order).
    """
    clean_dict = {}
    clean_dict['year'] = clean_year(raw_dict)
    clean_dict['stage'] = clean_stage(raw_dict)

    # parse the date and the start/finish text once each, not once per field
    week_day, month, day = clean_date(raw_dict)
    clean_dict['week_day'] = week_day
    clean_dict['month'] = month
    clean_dict['day'] = day

    start, finish = clean_start_finish(raw_dict)
    clean_dict['start'] = start
    clean_dict['finish'] = finish

    clean_dict['distance'] = clean_distance(raw_dict)
    clean_dict['winning_time'] = clean_winning_time(raw_dict)
    clean_dict['climbs'] = clean_climbs(raw_dict)
    clean_dict['sprints'] = clean_sprints(raw_dict)

    return clean_dict


def build_year_df(raw_dict):
    """Clean every stage record in ``raw_dict`` and return them as a DataFrame.

    Each key of ``raw_dict`` becomes a row label; the columns are the keys
    produced by ``build_clean_dict``.
    """
    cleaned_by_key = {}
    for key, stage_raw in raw_dict.items():
        cleaned_by_key[key] = build_clean_dict(stage_raw)

    return pd.DataFrame.from_dict(cleaned_by_key, orient='index')


def _winning_time_minutes(text):
    """Parse one ``"3h 29' 03''"`` / ``"14' 56''"`` string into minutes (NaN if unparseable)."""
    try:
        parts = text.split()

        if 'h' in parts[0]:
            hours = int(parts[0].strip('h'))
            minute_part, second_part = parts[1], parts[2]
        else:
            hours = 0
            minute_part, second_part = parts[0], parts[1]

        minutes = int(minute_part.strip("'"))
        seconds = int(second_part.strip("'"))
    # only parse failures mean "no time"; anything else should surface
    except (AttributeError, IndexError, ValueError, TypeError):
        return np.nan

    return hours*60.0 + minutes + seconds/60.0


def add_winning_times(in_times, df):
    """Overwrite ``df['winning_time']`` with times parsed from ``in_times``.

    Args:
        in_times: iterable of time strings such as ``"3h 29' 03''"`` or
            ``"14' 56''"``; entries that cannot be parsed become ``np.nan``.
        df: DataFrame modified in place.

    The parsed values are wrapped in a Series with the default 0..n-1 index,
    so pandas aligns them to ``df`` by index label: the k-th time lands on
    the row labelled ``k`` and rows without a matching label become NaN.
    Returns ``None``.
    """
    out_times = [_winning_time_minutes(time) for time in in_times]

    # item assignment always targets the column (attribute assignment only
    # does so when the column already exists)
    df['winning_time'] = pd.Series(out_times, dtype=float)
    
    
def break_out_climbs(df):
    """Expand the per-row ``climbs`` dicts into ``climb_<k>_<field>`` columns.

    For every row and every ``k: climb`` pair in that row's ``climbs`` dict,
    the columns ``climb_<k>_start``, ``climb_<k>_category``,
    ``climb_<k>_gradient`` and ``climb_<k>_length`` are written; a field
    missing from the climb dict is stored as ``None``.

    ``df`` is modified in place and also returned.
    """
    fields = ('start', 'category', 'gradient', 'length')

    # Iterate (label, value) pairs so each value is written back to the row it
    # came from, even when the index is not 0..n-1 in order.
    for row_label, climbs in df['climbs'].items():

        for key, climb in climbs.items():
            for field in fields:
                col_name = 'climb_{}_{}'.format(key, field)
                # .loc creates the column on first use (DataFrame.set_value
                # was removed in pandas 1.0)
                df.loc[row_label, col_name] = climb.get(field)

    return df


def break_out_sprints(df):
    """Expand the per-row ``sprints`` dicts into ``sprint_<k>_start`` columns.

    For every row and every ``k: sprint`` pair in that row's ``sprints``
    dict, ``sprint_<k>_start`` is written (``None`` when the sprint dict has
    no ``'start'``).

    ``df`` is modified in place and also returned.
    """
    # Iterate (label, value) pairs so each value is written back to the row it
    # came from, even when the index is not 0..n-1 in order.
    for row_label, sprints in df['sprints'].items():

        for key, sprint in sprints.items():
            col_name = 'sprint_{}_start'.format(key)
            # .loc creates the column on first use (DataFrame.set_value was
            # removed in pandas 1.0)
            df.loc[row_label, col_name] = sprint.get('start')

    return df


def break_out_df(df):
    """Expand both the ``climbs`` and ``sprints`` dict columns of ``df``.

    Calls ``break_out_climbs`` then ``break_out_sprints`` on ``df`` and
    returns ``df``.
    """
    for expand in (break_out_climbs, break_out_sprints):
        expand(df)

    return df

### Load in raw data, clean, and pickle ###

In [3]:
# load in raw data, closing each file as soon as it has been read
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project's own scraper.
with open("raw_2016_scrape.p", "rb") as f:
    raw_2016_scrape = pickle.load(f)
with open("raw_2015_scrape.p", "rb") as f:
    raw_2015_scrape = pickle.load(f)
with open("raw_2014_scrape.p", "rb") as f:
    raw_2014_scrape = pickle.load(f)
with open("raw_2013_scrape.p", "rb") as f:
    raw_2013_scrape = pickle.load(f)
with open("raw_2012_scrape.p", "rb") as f:
    raw_2012_scrape = pickle.load(f)

In [4]:
# pickle first-pass cleaned data; the with-block closes (and flushes) each
# file before it is read back
for year_label, raw_scrape in (('2016', raw_2016_scrape),
                               ('2015', raw_2015_scrape),
                               ('2014', raw_2014_scrape),
                               ('2013', raw_2013_scrape),
                               ('2012', raw_2012_scrape)):
    with open('clean_tdf{}.p'.format(year_label), 'wb') as f:
        pickle.dump(build_year_df(raw_scrape), f)

In [5]:
# reload cleaned data to continue working with it (files are closed after reading)
with open("clean_tdf2016.p", "rb") as f:
    tdf2016 = pickle.load(f)
with open("clean_tdf2015.p", "rb") as f:
    tdf2015 = pickle.load(f)
with open("clean_tdf2014.p", "rb") as f:
    tdf2014 = pickle.load(f)
with open("clean_tdf2013.p", "rb") as f:
    tdf2013 = pickle.load(f)
with open("clean_tdf2012.p", "rb") as f:
    tdf2012 = pickle.load(f)

In [6]:
# manually add missing winning times for 2015
# (one entry per row of tdf2015, matched by index label 0, 1, 2, ...)
in_times_2015 = ["14' 56''",
               "3h 29' 03''",
               "3h 26' 54''",
               "5h 28' 58''",
               "4h 39' 00''",
               "4h 53' 46''",
               "4h 27' 25''",
               "4h 20' 55''",
               "32' 15''",
               "4h 22' 07''",  # was "4h 22 '07''" (misplaced quote); same parsed value
               "5h 02' 01''",
               "5h 40' 14''",
               "4h 43' 42''",
               "4h 23' 43''",
               "3h 56' 35''",
               "4h 30' 10''",
               "4h 12' 17''",
               "5h 03' 40''",
               "4h 22' 53''",
               "3h 17' 21''",
               "2h 49' 41''"]

add_winning_times(in_times_2015, tdf2015)

In [7]:
# winning times missing from the 2014 scrape, entered by hand
# (one entry per row of tdf2014, matched by index label 0, 1, 2, ...)
in_times_2014 = [
    "4h 44' 07''",
    "5h 08' 36''",
    "3h 38' 30''",
    "3h 36' 39''",
    "3h 18' 35''",
    "4h 11' 39''",
    "5h 18' 39''",
    "3h 49' 28''",
    "4h 09' 34''",
    "4h 27' 26''",
    "4h 25' 45''",
    "4h 32' 11''",
    "5h 12' 29''",
    "5h 08' 27''",
    "4h 56' 43''",
    "6h 07' 10''",
    "3h 35' 23''",
    "4h 04' 17''",
    "4h 43' 41''",
    "1h 06' 21''",
    "3h 20' 50''",
]

add_winning_times(in_times_2014, tdf2014)

In [8]:
# manually add missing winning times for 2013 (values in minutes, keyed by
# row label); .at replaces DataFrame.set_value, which was removed in pandas 1.0
tdf2013.at[14, 'winning_time'] = 348.75
tdf2013.at[19, 'winning_time'] = 219.066666667
tdf2013.at[3, 'winning_time'] = 221.4

In [9]:
# break out the climb and sprint dictionaries into separate columns, and pickle new dataframes
# (the with-block closes each file before it is read back)
for year_label, year_df in (('2016', tdf2016),
                            ('2015', tdf2015),
                            ('2014', tdf2014),
                            ('2013', tdf2013),
                            ('2012', tdf2012)):
    with open('clean_tdf{}_full.p'.format(year_label), 'wb') as f:
        pickle.dump(break_out_df(year_df), f)

In [10]:
# reload pickled dataframes to continue working with them (files are closed after reading)
with open("clean_tdf2016_full.p", "rb") as f:
    tdf2016 = pickle.load(f)
with open("clean_tdf2015_full.p", "rb") as f:
    tdf2015 = pickle.load(f)
with open("clean_tdf2014_full.p", "rb") as f:
    tdf2014 = pickle.load(f)
with open("clean_tdf2013_full.p", "rb") as f:
    tdf2013 = pickle.load(f)
with open("clean_tdf2012_full.p", "rb") as f:
    tdf2012 = pickle.load(f)

In [11]:
# join separate dataframes into single dataframe for 2012 to 2016 data
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index=True
# already yields a fresh 0..n-1 index, so no reset_index/drop round trip is needed
tdf1216 = pd.concat([tdf2016, tdf2015, tdf2014, tdf2013, tdf2012], ignore_index=True)

In [12]:
# preview the first rows of the combined dataframe; kept as a bare expression
# so the notebook renders it as the cell output
tdf1216.head()

Unnamed: 0,climb_0_category,climb_0_gradient,climb_0_length,climb_0_start,climb_1_category,climb_1_gradient,climb_1_length,climb_1_start,climb_2_category,climb_2_gradient,...,finish,month,sprint_0_start,sprint_1_start,sprints,stage,start,week_day,winning_time,year
0,4,5.7,1.2,20.5,4.0,4.8,1.3,39.0,,,...,Utah Beach Sainte-Marie-du-Mont,JULY,118.5,,{0: {'start': 118.5}},1,Mont-Saint-Michel,SATURDAY,254.083333,2016
1,4,5.7,1.4,10.0,4.0,5.0,1.9,23.0,4.0,5.9,...,Cherbourg-en-Cotentin,JULY,107.5,,{0: {'start': 107.5}},2,Saint-Lô,SUNDAY,260.85,2016
2,4,4.4,1.5,25.5,,,,,,,...,Angers,JULY,171.0,,{0: {'start': 171.0}},3,Granville,MONDAY,359.9,2016
3,4,5.6,1.2,182.0,,,,,,,...,Limoges,JULY,170.0,,{0: {'start': 170.0}},4,Saumur,TUESDAY,328.5,2016
4,4,5.2,1.7,16.5,3.0,3.9,6.8,142.5,3.0,3.0,...,Le Lioran,JULY,144.5,,{0: {'start': 144.5}},5,Limoges,WEDNESDAY,331.6,2016


In [13]:
# replace NaNs with zeros in every broken-out climb (slots 0-8) and sprint column
cols = ['climb_{}_{}'.format(slot, field)
        for slot in range(9)
        for field in ('category', 'gradient', 'length', 'start')]
cols += ['sprint_0_start', 'sprint_1_start']

for col in cols:
    tdf1216[col] = tdf1216[col].replace(np.nan, 0)

In [14]:
# number of climbs / sprints per stage = size of that row's climbs / sprints dict
tdf1216['num_climbs'] = tdf1216['climbs'].apply(len)
tdf1216['num_sprints'] = tdf1216['sprints'].apply(len)

In [15]:
# recode categories so that H (Hors catégorie) climbs are 5 and the rest are inverted so that a larger number
# represents a more difficult climb; any other value (including the zero placeholders) becomes 0
cols = ['climb_{}_category'.format(slot) for slot in range(9)]
category_codes = {'H': 5, '1': 4, '2': 3, '3': 2, '4': 1}
climbs_per_stage = tdf1216['climbs'].apply(len)

for position, col in enumerate(cols):
    recoded = tdf1216[col].map(lambda cat: category_codes.get(cat, 0))
    # a stage only has climbs in slots 0 .. num_climbs-1, so every later slot is
    # forced to 0 (the old loop recoded one slot too many and then zeroed it again)
    tdf1216[col] = recoded.where(climbs_per_stage > position, 0).astype(int)

In [16]:
# calculate averages for each stage's climb data and store as new columns
def climb_averages(df, features=('category', 'length', 'start', 'gradient')):
    """Add one ``<feature>_avg`` column per feature to ``df``.

    For each row, the values of ``climb_0_<feature>`` .. ``climb_8_<feature>``
    are summed and divided by that row's ``num_climbs``; rows whose
    ``num_climbs`` is not positive get ``0.0``.

    Args:
        df: DataFrame holding the ``climb_<k>_<feature>`` columns (k = 0..8)
            and a ``num_climbs`` column. Modified in place.
        features: feature names to average (any iterable of strings).

    Returns:
        The same ``df`` object.
    """
    climb_labels = ['climb_{}_'.format(slot) for slot in range(9)]

    for feature in features:
        feature_cols = [label + feature for label in climb_labels]
        averages = []

        for i in df.index:
            values = [df.at[i, col] for col in feature_cols]
            num_climbs = df.at[i, 'num_climbs']

            if num_climbs > 0:
                averages.append(sum(values)/float(num_climbs))
            else:
                averages.append(0.0)

        # assign the whole column at once (DataFrame.set_value was removed in
        # pandas 1.0); the list is in df.index order
        df[feature + '_avg'] = averages

    return df

# add the category/length/start/gradient *_avg columns to the combined dataframe
tdf1216 = climb_averages(tdf1216)

In [17]:
# average starting point for each stage's sprints and store as new column
cols = ['sprint_0_start',
       'sprint_1_start']

sprint_start_avgs = []

for i in tdf1216.index:
    starts = [tdf1216.at[i, col] for col in cols]
    num_starts = tdf1216.at[i, 'num_sprints']
    if num_starts > 0:
        sprint_start_avgs.append(sum(starts)/float(num_starts))
    else:
        sprint_start_avgs.append(0.0)

# assign the whole column at once (DataFrame.set_value was removed in pandas 1.0);
# the list is in tdf1216.index order
tdf1216['sprint_start_avg'] = sprint_start_avgs

In [18]:
# drop the raw dict columns and every per-climb / per-sprint column now that
# the per-stage counts and averages have been computed
tdf1216.drop(
    ['climbs', 'sprints']
    + ['climb_{}_{}'.format(slot, field)
       for slot in range(9)
       for field in ('start', 'category', 'gradient', 'length')]
    + ['sprint_0_start', 'sprint_1_start'],
    axis=1,
    inplace=True
)

In [19]:
# reformat column datatypes: year/stage become strings, the counts become floats
for column, caster in (('year', str),
                       ('stage', str),
                       ('num_sprints', float),
                       ('num_climbs', float)):
    tdf1216[column] = tdf1216[column].apply(caster)

In [20]:
# preview the first rows of the reduced dataframe; kept as a bare expression
# so the notebook renders it as the cell output
tdf1216.head()

Unnamed: 0,day,distance,finish,month,stage,start,week_day,winning_time,year,num_climbs,num_sprints,category_avg,length_avg,start_avg,gradient_avg,sprint_start_avg
0,2,188.0,Utah Beach Sainte-Marie-du-Mont,JULY,1,Mont-Saint-Michel,SATURDAY,254.083333,2016,2.0,1.0,1.0,1.25,29.75,5.25,118.5
1,3,183.0,Cherbourg-en-Cotentin,JULY,2,Saint-Lô,SUNDAY,260.85,2016,4.0,1.0,1.25,1.6,66.625,5.775,107.5
2,4,223.5,Angers,JULY,3,Granville,MONDAY,359.9,2016,1.0,1.0,1.0,1.5,25.5,4.4,171.0
3,5,237.5,Limoges,JULY,4,Saumur,TUESDAY,328.5,2016,1.0,1.0,1.0,1.2,182.0,5.6,170.0
4,6,216.0,Le Lioran,JULY,5,Limoges,WEDNESDAY,331.6,2016,6.0,1.0,2.166667,4.783333,155.416667,5.65,144.5


In [21]:
# pickle for later (the with-block closes and flushes the file)
with open('tdf_1216_geo.p', 'wb') as f:
    pickle.dump(tdf1216, f)