In [47]:
import pandas as pd
import numpy as np
from collections import Counter
# Load the raw race results; the cells below filter and extend this frame.
data = pd.read_csv('data/data.csv')
# Older loader kept for reference (disabled):
#data = pd.DataFrame.from_csv('data/data.csv')

## Remove 2013 Half Marathon Times from Training Set

In [48]:
# Index labels of every 2013 row (the half-marathon year removed from training).
is2013 = (data['Year'] == 2013).values
halfMarathonId = data.index[is2013].tolist()

In [49]:
# Drop every 2013 row in a single call; dropping one label at a time
# rebuilds the whole frame on each iteration.
data = data.drop(halfMarathonId)

## Removing Unknown and Private participants

In [50]:
# Runner Ids whose rows are removed as unknown / private participants.
err0 = [
    3327,
    5197,
    23714,
    26270,
]

In [51]:
# Index labels of every row whose runner Id is listed in err0, found with a
# single membership test instead of one full-frame filter per Id.
unknown_id = data.index[data['Id'].isin(err0).values].tolist()

print(len(unknown_id))

# Remove them in one call; dropping label by label copies the frame each time.
data = data.drop(unknown_id)

310


There were 310 rows where participants were unknown or private. These are removed from the training data.

## Get Id's where Age is 0 

In [52]:
# Index labels of the rows whose recorded age is 0.
ageIsZero = (data['Age'] == 0).values
zeroAge = data.index[ageIsZero].tolist()

In [53]:
# Runner Id of each zero-age row, read with one label/column lookup.
# Indexing row by row (data.loc[x]['Id']) builds a full row Series per lookup
# and can upcast the Id when the row mixes dtypes.
runnerId = data.loc[zeroAge, 'Id'].tolist()

In [54]:
# Runner Ids (among the zero-age ones) that own exactly one row in the data.
onceZero = [rid for rid in runnerId if (data['Id'] == rid).sum() == 1]

The following runner Ids are cases where the age is unknown and the Id appears only once, so the age cannot be corrected from another year: onceZero = [16673,16954,17902,18202]. For these, the unknown age is set to the average age of all runners who ranked within +/- 50 places of the runner in that particular year.

In [55]:
def correctSingleInstanceAge(runnerId, data, window=50):
    """Fill in the unknown (zero) age of a runner that cannot be fixed from another year.

    The runner's Age is overwritten in place with the rounded mean age of the
    runners of the same Year whose Rank lies strictly within ``window`` places
    of the runner's own Rank. Rows whose Age is 0 (unknown), including the
    runner's own row, are excluded so they do not drag the mean down.

    Parameters
    ----------
    runnerId : value of the 'Id' column; expected to match exactly one row.
    data : DataFrame with 'Id', 'Rank', 'Year' and 'Age' columns (modified in place).
    window : half-width of the rank neighbourhood (default 50).

    Returns
    -------
    None. If no neighbour has a known age, the Age is left unchanged.
    """
    runnerRows = data[data['Id'] == runnerId]
    rank = int(runnerRows['Rank'].iloc[0])
    year = int(runnerRows['Year'].iloc[0])

    inWindow = (data['Rank'] > rank - window) & (data['Rank'] < rank + window)
    neighbours = data[inWindow & (data['Year'] == year)]
    knownAges = neighbours.loc[neighbours['Age'] != 0, 'Age']
    if knownAges.size == 0:
        # Nothing to average over; leave the age untouched.
        return

    avgAge = int(round(knownAges.mean()))
    # .loc assignment replaces DataFrame.set_value, which newer pandas removed.
    data.loc[runnerRows.index, 'Age'] = avgAge
    return


In [56]:
# Runners seen only once: take them off the list still to be fixed and
# fill in their age from their rank neighbours.
for loneId in onceZero:
    runnerId.remove(loneId)
    correctSingleInstanceAge(loneId, data)

In [57]:
# Pair every zero-age row with a row of the same runner whose age is recorded.
# Each pair is (index label with a recorded age, index label with age 0).
i_pairs = []
seenIds = set()
for rid in runnerId:
    # A runner with several zero-age rows is listed several times in runnerId;
    # handle each runner once and pair *all* of their zero-age rows.
    if rid in seenIds:
        continue
    seenIds.add(rid)

    runnerRows = data[data['Id'] == rid]
    recordedRows = runnerRows[runnerRows['Age'] != 0].index.tolist()
    zeroRows = runnerRows[runnerRows['Age'] == 0].index.tolist()
    if not recordedRows:
        # No year with a known age to offset from; report instead of crashing.
        print('No recorded age for runner id %s; rows left uncorrected' % rid)
        continue
    for zeroYear in zeroRows:
        i_pairs.append((recordedRows[0], zeroYear))

For the remaining runner Ids, the participant ran in at least one other year where the age information is recorded. By taking the difference in the years, this offset is added to the recorded age.

In [58]:
def offsetAge(idRecorded, idError, data):
    """Rewrite the age of one row from the age recorded in another row.

    The age at index label ``idError`` is set, in place, to the age at index
    label ``idRecorded`` plus the difference between the two rows' years
    (year of ``idError`` minus year of ``idRecorded``).

    Parameters
    ----------
    idRecorded : index label of the row whose 'Age' is trusted.
    idError : index label of the row whose 'Age' is overwritten.
    data : DataFrame with 'Age' and 'Year' columns (modified in place).

    Returns
    -------
    None.
    """
    ageRecorded = data.at[idRecorded, 'Age']
    offset = data.at[idError, 'Year'] - data.at[idRecorded, 'Year']
    # Scalar .at access replaces DataFrame.set_value, which newer pandas removed.
    data.at[idError, 'Age'] = ageRecorded + offset
    return

In [59]:
# Each pair is (row with a recorded age, row with age 0).
for recordedRow, zeroRow in i_pairs:
    offsetAge(recordedRow, zeroRow, data)

By this stage, all instances of data where Age == 0 have been corrected.

# Minimum age of Miami Marathon is 11. Check outliers

## Smallest Age

In [60]:
# Smallest age left in the data after the zero ages were corrected.
data['Age'].min()

10

In [61]:
ten = data[data['Age']==10]['Id'].tolist()
for rid in ten:
    if (data[data['Id']==rid]['Age'].size > 1):
        print 'Must correct runner id',rid
    else:
        continue

For the rows with age == 10, these runners do not run in any other years. Simply remove these data points and consider them as outliers. 

In [62]:
# Index labels of the rows recorded with age 10.
ageIsTen = (data['Age'] == 10).values
ageoutliers = data.index[ageIsTen].tolist()

In [63]:
# Show the index labels that are about to be dropped.
ageoutliers

[12096, 24513]

In [64]:
# Remove the age-10 outlier rows in one call instead of copying the frame
# once per dropped label.
data = data.drop(ageoutliers)

## Age distribution

In [65]:
# Tally how many rows carry each age value.
ages = data['Age']
ageCount = Counter()
ageCount.update(ages)

## Old Outliers

In [66]:
# Distinct runner Ids that have at least one row with Age above 78.
old_rid = list(set(data.loc[data['Age'] > 78, 'Id'].tolist()))
for rid in old_rid:
    # Inspection hook only: the print below is disabled, so the loop does nothing.
    pass
    #print data[data['Id']==rid].sort_values(by='Year')

The entry in 2007 for 5143 can be corrected using offsetAge. The other single instance where Age is 98 (runner 10179 in 2007) can be perceived as an outlier outside of the distribution of regular data values. It is therefore dropped. Note that with only 2 outlier points, keeping them would not greatly affect training data.

In [67]:
# Index labels come from the manual inspection of the old-age outliers;
# presumably 877 / 8773 are runner 5143's rows and 15534 is runner 10179's
# row -- TODO confirm against the data.
recordedRow, wrongAgeRow = 877, 8773
offsetAge(recordedRow, wrongAgeRow, data)

outlierRow = 15534
data = data.drop(outlierRow)

## Calculate 2017 Age based on cleaned data

In [70]:
# Map str(runner Id) -> projected age in 2017, extrapolated from the age in
# the runner's most recent recorded year.
ages_for_2017 = {}
uniqueids = list(set(data['Id'].values.tolist()))

# One pass over the groups instead of one full-frame filter per Id.
for rid, runner in data.groupby('Id'):
    latestYear = runner['Year'].max()
    # The mask is built from `runner` itself; a mask built from the full
    # frame has a different index and forces pandas to realign it.
    latestAge = runner.loc[runner['Year'] == latestYear, 'Age'].values[0]
    newAge = 2017 - latestYear + latestAge
    ages_for_2017[str(rid)] = newAge



## Force Ages to increments of 5

In [75]:
indices = data['Age'].index.tolist()
# Round every age down to a multiple of 5 with a column-wide assignment.
# A chained assignment (data.loc[i]['Age'] = ...) writes to a temporary row
# object and can leave the Age column unchanged.
data['Age'] = data['Age'] - data['Age'] % 5

## Add weather data
Temperature in deg. C at 6:53 or 6:59 AM

In [77]:
# Race-morning temperature in deg. C, keyed by the year as a string.
temp = dict(
    (str(raceYear), degC)
    for raceYear, degC in [
        (2003, 10.6),
        (2004, 20.6),
        (2005, 18.9),
        (2006, 21.1),
        (2007, 22.2),
        (2008, 17.8),
        (2009, 14.4),
        (2010, 21.7),
        (2011, 11.1),
        (2012, 20.6),
        (2013, 19.4),
        (2014, 22.8),
        (2015, 11.7),
        (2016, 12.2),
    ]
)

In [78]:
# Create the Temp column as floats; the next cell fills in the per-year value.
data['Temp'] = 0.

In [79]:
# Look up each row's race-year temperature in one vectorised map.
# The original used DataFrame.set_value row by row, which newer pandas removed.
yearKeys = data['Year'].astype(int).astype(str)
unknownYears = sorted(set(yearKeys) - set(temp))
if unknownYears:
    # Keep failing loudly (as a dict lookup would) rather than writing NaN.
    raise KeyError('No temperature recorded for year(s): %s' % ', '.join(unknownYears))
data['Temp'] = yearKeys.map(temp)

## 2013 Projections using Riegel's formula
http://cs229.stanford.edu/proj2015/247_report.pdf

In [25]:
#def fullMarathonTime(halfMarathonTime,data):
#    return np.round(halfMarathonTime*((26.219/13.095)**1.06))

In [26]:
#halfMarathonId = data[data['Year']==2013].index.tolist()
#data['halfMarathonTime'] = np.NaN

#### Recalculate Pace of 2013 rows using Full Marathon Time

In [27]:
#for i in halfMarathonId:
#    halftime = data.loc[i]['Time']
#    data.set_value(i,'halfMarathonTime',halftime)
#    fulltime = int(fullMarathonTime(halftime,data))
#    data.set_value(i,'Time',fulltime)

#    data.set_value(i,'Pace',fulltime/26.219)

## Remove instances of same ID, same year, multiple records
`errRunnerIds` collects every runner with more than one row in a given year. This covers both cases:
more than two rows for the same runner in a year (errRunnerIds1) and exactly two rows for the same runner in a year (errRunnerIds2).

In [83]:
runnerIds = set(data['Id'].tolist())

# Runners that have two or more rows within a single year.
errRunnerIds = []
for rid in runnerIds:
    yearsRun = data.loc[data['Id'] == rid, 'Year']
    if yearsRun.size <= 1:
        # A single row cannot contain a same-year duplicate.
        continue
    if max(Counter(yearsRun).values()) >= 2:
        errRunnerIds.append(rid)

In [84]:
# Running total of rows removed by the duplicate clean-up in the next cell.
count = 0

In [85]:
# Rows of the flagged runners that share their (Id, Year) with another row.
# All copies are removed (keep=False), as in the original per-runner loop,
# but with one mask and one drop instead of a frame copy per dropped row.
isDuplicate = data.duplicated(subset=['Id', 'Year'], keep=False)
isFlagged = data['Id'].isin(errRunnerIds)
duplicateRows = data.index[(isDuplicate & isFlagged).values]

count += len(duplicateRows)
data = data.drop(duplicateRows)
print(count)

This eliminates 182 rows where a single runner ran multiple times in a given year.

## Cross-Validate Pace/Time/Rank

In [89]:
years = sorted(set(data['Year']))

for y in years:
    currentYear = data[data['Year']==y].sort_values(by='Rank')

    prevTime = currentYear[:1]['Time'].values[0]
    prevPace = currentYear[:1]['Pace'].values[0]
    prevRank = currentYear[:1]['Rank'].values[0]

    for row in currentYear[1:].iterrows():
        if ((row[1]['Pace'] < prevPace) or (row[1]['Rank'] < prevRank) or (row[1]['Time'] < prevTime)):
            print row[1]['Id'], row[1]['Year']
        prevPace = row[1]['Pace']
        prevRank = row[1]['Rank']
        prevTime = row[1]['Time']

The print statement does not output anything. This validates that the Rank, Time and Pace data are all in order, and that all three increase together.

## Verify distance of the marathon using Pace, Time

In [90]:
years = sorted(set(data['Year']))

for y in years:
    currentYear = data[data['Year']==y].sort_values(by='Rank')
    
    for row in currentYear.iterrows():
        if not(26.1365 < (row[1]['Time']/row[1]['Pace']) and (row[1]['Time']/row[1]['Pace']) < 26.2927):
            print row[1]['Id'], row[1]['Year']

Nothing is printed. All distances based on the recorded Time and Pace are found to be between 26.1365 and 26.2927 miles.

## Years since Last Race

In [91]:
# Start every row at 15, the same default yearsSinceLastRace returns when a
# runner has no earlier race; later cells overwrite it per row.
data['yrsSinceLast'] = 15

In [92]:
def yearsSinceLastRace(runnerId,year,data):
    """Return how many years before ``year`` the runner last raced.

    Looks at the rows of ``data`` whose 'Id' equals ``runnerId`` and whose
    'Year' is strictly earlier than ``year``. Returns ``year`` minus the most
    recent of those years, or 15 when the runner has no earlier row.
    """
    isEarlierRace = (data['Id'] == runnerId) & (data['Year'] < year)
    earlierYears = data.loc[isEarlierRace, 'Year']
    if earlierYears.size == 0:
        return 15
    return year - earlierYears.max()
    

In [93]:
# Fill yrsSinceLast for every row. Scalar .at access replaces both the chained
# row lookup (data.loc[i]['Id']) and DataFrame.set_value, which newer pandas
# removed.
for i in data.index.tolist():
    runnerId = data.at[i, 'Id']
    year = data.at[i, 'Year']
    data.at[i, 'yrsSinceLast'] = int(yearsSinceLastRace(runnerId, year, data))

## Add Number of Races
Number of races run since 2003

In [94]:
# Create the raceCount column; the next cell fills in the per-runner value.
data['raceCount'] = 0

In [95]:
# Number of rows each runner has in the data, written to every one of that
# runner's rows. value_counts + map does this in one pass; the original
# filtered the whole frame once per row and used the removed set_value.
data['raceCount'] = data['Id'].map(data['Id'].value_counts())

## Add Number of Races Run to date (prev. count)

In [96]:
# Create the prevRaceCount column; the next cell fills in the per-row value.
data['prevRaceCount'] = 0

In [97]:
# For every row: how many rows of the same runner have a strictly earlier Year.
# rank(method='min') is 1 + the number of strictly smaller years within the
# runner's group, so subtracting 1 gives that count without a per-row filter
# (and without DataFrame.set_value, which newer pandas removed).
yearRankWithinRunner = data.groupby('Id')['Year'].rank(method='min')
data['prevRaceCount'] = (yearRankWithinRunner - 1).astype(int)

## Normalize data

### Get max values per category per year

In [None]:
# Extremes used by the normalisation below.
ages = set(data['Age'].values)
agemax, agemin = max(ages), min(ages)

# Temperature extremes come from the per-year temperature table.
tempbounds = temp.values()
tempmin, tempmax = min(tempbounds), max(tempbounds)

# Year bounds are fixed rather than read from the data.
yearmin, yearmax = 2003., 2017.

In [100]:
def _maxThenMin(values):
    """Return [max, min] of ``values``, the pair layout used throughout normBounds."""
    return [max(values), min(values)]


# normBounds[str(year)] -> [[rank max, rank min], [time max, time min], [pace max, pace min]]
normBounds = {}
for y in sorted(set(data['Year'])):
    d = data[data['Year']==y]
    normBounds[str(y)] = [_maxThenMin(d[column].values) for column in ('Rank', 'Time', 'Pace')]

# Bounds that do not depend on the year, stored as [max, min].
normBounds['Age'] = [agemax, agemin]
normBounds['Temp'] = [tempmax,tempmin]
normBounds['Year'] = [yearmax,yearmin]

### Initialize normalized columns

In [101]:
# Create every normalised column up front, filled with 0.0.
normalizedColumns = (
    'nTime', 'nPace', 'nRank',
    'nAge', 'nTemp', 'nYear',
    'nTotalCount', 'nPrevCount', 'nLast',
)
for columnName in normalizedColumns:
    data[columnName] = 0.

In [102]:
# Fill the normalised columns as (value - min) / max.
# NOTE(review): the divisor is max, not (max - min), so values do not span
# [0, 1]; kept as in the original -- confirm this is intended.
#
# Fixes relative to the original cell:
#   * nTime / nPace / nRank were filled from Rank / Time / Pace respectively
#     (each shifted by one); every target now uses its own source column and
#     its own bounds.
#   * the per-row local named `temp` replaced the temperature dict, breaking
#     any later use of temp as a dict; no such local exists any more.
#   * DataFrame.set_value (removed in newer pandas) is no longer used.
yearKeys = data['Year'].astype(int).astype(str)

# normBounds[year] is [rankbounds, timebounds, pacebounds], each [max, min].
perYearColumns = (('nRank', 'Rank', 0), ('nTime', 'Time', 1), ('nPace', 'Pace', 2))
for target, source, slot in perYearColumns:
    highs = yearKeys.map(lambda key, slot=slot: normBounds[key][slot][0]).astype(float)
    lows = yearKeys.map(lambda key, slot=slot: normBounds[key][slot][1]).astype(float)
    # Explicit float casts keep integer columns from being floor-divided.
    data[target] = (data[source].astype(float) - lows) / highs

# Year-independent bounds are stored as [max, min] under the column's name.
for target, source in (('nTemp', 'Temp'), ('nAge', 'Age'), ('nYear', 'Year')):
    high, low = normBounds[source]
    data[target] = (data[source].astype(float) - low) / float(high)

data['nTotalCount'] = data['raceCount'] / 15.
data['nPrevCount'] = data['prevRaceCount'] / 14.
data['nLast'] = data['yrsSinceLast'] / 15.

### Average of previous race times

In [103]:
# Create histTime as floats; rows with no earlier race keep this 0.0.
data['histTime'] = 0.

In [104]:
# histTime = mean nTime over the runner's rows from strictly earlier years.
# Rows with no earlier race are left untouched.
for i in data.index.tolist():
    runnerId = data.at[i, 'Id']
    year = data.at[i, 'Year']
    prevYears = data[(data['Id'] == runnerId) & (data['Year'] < year)]

    if len(prevYears) > 0:
        # mean() divides by the number of rows actually averaged instead of
        # relying on the separately stored prevRaceCount; .at replaces
        # DataFrame.set_value, which newer pandas removed.
        data.at[i, 'histTime'] = prevYears['nTime'].mean()

# Data is cleaned.
Save data in .pkl file. Split Y1 and Y2 data sets. Randomize data. 

In [105]:
import pickle

# Pickle streams are bytes: open in binary mode (text mode fails on Python 3
# and can corrupt the stream on Windows). The with-block closes the file even
# if dump raises.
with open('data/out.pkl', 'wb') as f:
    pickle.dump(data, f)

In [106]:
# Save the projected 2017 ages. Binary mode is required for pickle streams,
# and the with-block closes the file even if dump raises.
with open('data/ages.pkl', 'wb') as g:
    pickle.dump(ages_for_2017, g)

## Save in csv

In [107]:
# Also write the cleaned frame as CSV (the index is written as the first column).
data.to_csv('data/out.csv')