# Problem Statement
You now have a pretty varied suite of clustering and clustering evaluation methods; we'd be remiss if we didn't give you the opportunity to try them out on some real data. So here we go!

There is a lot of information on runners and their performance for the Boston Marathon. Pick a year (post-2012 has more info) and do some clustering.

Specifically, use the tools at hand to determine which clustering solution, including number of clusters and algorithm used, is best for the marathon data. Once you have a solution you like, write a data story, including visualizations, where you teach the reader something about the Boston Marathon based on your clusters. Write up your report, including your process from start to finish, in a Jupyter notebook and submit it below.

### Outline
- [data](#data)
- [clustering](#clustering)
- [results](#results)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AffinityPropagation, SpectralClustering

<a id = "data"></a>
# data

In [2]:
df = pd.read_csv("results.csv")
display(df.shape)
display(df.head())
display(df.columns)

(31984, 21)

Unnamed: 0,10k,name,division,25k,gender,age,official,bib,genderdiv,ctz,...,overall,pace,state,30k,5k,half,20k,country,city,40k
0,17.37,"Yamamoto, Hiroyuki",8,47.67,M,47,85.25,W1,8,,...,8,3.27,,59.18,8.02,39.72,37.65,JPN,Fukuoka,80.43
1,32.58,"Jeptoo, Rita",1,82.43,F,33,138.95,F1,1,,...,21,5.3,,99.33,16.22,69.47,65.83,KEN,Eldoret,132.1
2,16.62,"Van Dyk, Ernst F.",1,45.8,M,41,80.6,W2,1,,...,1,3.08,,56.45,7.75,38.03,36.1,RSA,Paarl,76.1
3,32.57,"Dibaba, Mare",3,82.43,F,24,140.58,F2,3,,...,27,5.37,,99.33,16.2,69.47,65.83,ETH,Shoa,132.95
4,17.12,"Hokinoue, Kota",2,46.37,M,40,81.23,W3,2,,...,2,3.1,,57.03,8.02,38.6,36.58,JPN,Nogata Fukuoka,76.72


Index(['10k', 'name', 'division', '25k', 'gender', 'age', 'official', 'bib',
       'genderdiv', 'ctz', '35k', 'overall', 'pace', 'state', '30k', '5k',
       'half', '20k', 'country', 'city', '40k'],
      dtype='object')

In [3]:
# null data
display(df.isnull().sum())

10k              0
name             0
division         0
25k              0
gender           0
age              0
official         0
bib              0
genderdiv        0
ctz          30740
35k              0
overall          0
pace             0
state         2576
30k              0
5k               0
half             0
20k              0
country          0
city             1
40k              0
dtype: int64

In [4]:
# categorical data
display(len(df['bib'].value_counts()))
display(df['bib'].value_counts()[:10])
display(len(df['ctz'].value_counts()))
display(df['ctz'].value_counts()[:10])
display(len(df['state'].value_counts()))
display(df['state'].value_counts()[:10])
display(len(df['country'].value_counts()))
display(df['country'].value_counts()[:10])
display(len(df['city'].value_counts()))
display(df['city'].value_counts()[:10])

31984

16452    1
2788     1
12274    1
3144     1
25340    1
12729    1
20171    1
2339     1
34533    1
13698    1
Name: bib, dtype: int64

84

GBR    171
CAN    143
USA    135
JPN     95
IRL     68
MEX     62
GER     55
FRA     45
AUS     43
CHN     39
Name: ctz, dtype: int64

68

MA    7587
CA    2326
NY    1553
ON    1052
PA    1001
TX     992
IL     916
OH     760
FL     750
VA     713
Name: state, dtype: int64

78

USA    27233
CAN     2175
GBR      346
ITA      212
MEX      205
GER      182
JPN      175
AUS      124
IRL      119
FRA      115
Name: country, dtype: int64

5934

Boston           1034
New York          498
Chicago           313
Cambridge         311
Toronto           240
Somerville        240
Brookline         222
Washington        212
Newton            201
San Francisco     192
Name: city, dtype: int64

In [5]:
for col in (df == '-').sum()[(df == '-').sum() > 0].index:
    df[col] = df[col].replace('-',None)
    df[col] = df[col].astype('float')

In [6]:
# drop name, bib, ctz, city
X = df.drop(['name', 'bib', 'ctz','city'], axis = 1)
# gender categorical to binary
X['gender'] = np.where(X['gender'] == 'M', 0,1) # Male -> 1
# OHE state, country
X = pd.get_dummies(X, columns=['state'], dummy_na=True)
X = pd.get_dummies(X, columns=['country'])
# normalize
X_norm = normalize(X)

In [7]:
X_train_full, X_train_empty = train_test_split(X_norm, test_size=0.5, random_state=42)
X1, X2 = train_test_split(X_train_full, test_size=0.5, random_state=42)
X3, X4 = train_test_split(X_train_empty, test_size=0.5, random_state=42)

In [8]:
display(X_norm.shape)
display(X_train_full.shape)
display(X1.shape)

(31984, 162)

(15992, 162)

(7996, 162)

<a id = "clustering"></a>
# clustering

In [11]:
# k means
def means_eval(n_clusters, X):
    km = KMeans(n_clusters=n_clusters, random_state=42)
    return km.fit(X)

In [12]:
# mean shift
def shift_eval(n_clusters,X):
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    return ms.fit(X)

In [13]:
# spectral clustering
def spectral_eval(n_clusters, X):
    sc = SpectralClustering(n_clusters=n_clusters)
    return sc.fit(X)

In [17]:
# affinity propagation
def affinity_eval(n_clusters, X):
    af = AffinityPropagation(damping = .9)
    return af.fit(X)

In [18]:
# evaluation
def silhouette(n_clusters):
    for model in [means_eval, shift_eval, spectral_eval, affinity_eval]:
        print ("\n\nEvaluating " + model.__name__)
        start_time = time.time()
        for Xn in [X1, X2, X3, X4]:
            m = model(n_clusters, Xn)
            print(metrics.silhouette_score(Xn, m.labels_, metric='euclidean'))
        print ("Runtime: {:0.2f}".format(time.time()-start_time))

<a id = "results"></a>
# results

In [19]:
silhouette(6)



Evaluating means_eval
0.45328549414047936
0.451738328655329
0.44957462572612206
0.4480346936911667
Runtime: 11.86


Evaluating shift_eval
0.5301679725627517
0.5251596052836541
0.5181598650981347
0.4236210306399296
Runtime: 13.64


Evaluating spectral_eval
0.40560879141063577
0.4192594957871901
0.4095917154176725
0.4160327500150753
Runtime: 63.86


Evaluating affinity_eval
0.5138205060125443
0.5151510221396919
0.504558506542542
0.5036574538937296
Runtime: 383.59
