In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import scipy
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering
from sklearn import metrics
from sklearn.preprocessing import normalize


%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import time

_start_time = time.time()

def tic():
    global _start_time 
    _start_time = time.time()

def tac():
    t_sec = round(time.time() - _start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))

In [3]:
path = '/home/peter/Desktop/results.csv'
df = pd.read_csv(path)

df.shape

(31984, 21)

In [4]:
# Check for missing data
print("%s\n" % (df.isnull().sum()))

# Lets ensure that all columns are of the expected type object and int (or float)
print("\n%s\n" % (df.dtypes))

10k              0
name             0
division         0
25k              0
gender           0
age              0
official         0
bib              0
genderdiv        0
ctz          30740
35k              0
overall          0
pace             0
state         2576
30k              0
5k               0
half             0
20k              0
country          0
city             1
40k              0
dtype: int64


10k           object
name          object
division       int64
25k           object
gender        object
age            int64
official     float64
bib           object
genderdiv      int64
ctz           object
35k           object
overall        int64
pace         float64
state         object
30k           object
5k            object
half          object
20k           object
country       object
city          object
40k           object
dtype: object



In [5]:
times = ['5k','10k', '20k','half','25k', '30k', '35k','40k', 'official']
for leg in times:
    df = df.replace(to_replace='-', value='')
    df[leg] = df[leg].convert_objects(convert_numeric=True)

# drop features name, bib etc.
df2 = df[['5k','10k', '20k','half','25k', '30k', '35k','40k', 'official','age',  'overall', 'pace']]

df2.head()

Unnamed: 0,5k,10k,20k,half,25k,30k,35k,40k,official,age,overall,pace
0,8.02,17.37,37.65,39.72,47.67,59.18,71.4,80.43,85.25,47,8,3.27
1,16.22,32.58,65.83,69.47,82.43,99.33,116.37,132.1,138.95,33,21,5.3
2,7.75,16.62,36.1,38.03,45.8,56.45,67.42,76.1,80.6,41,1,3.08
3,16.2,32.57,65.83,69.47,82.43,99.33,116.37,132.95,140.58,24,27,5.37
4,8.02,17.12,36.58,38.6,46.37,57.03,67.83,76.72,81.23,40,2,3.1


In [6]:
df2['pace0-5'] = df2.loc[:,'5k']/5
df2['pace5-10'] = (df2.loc[:,'10k']-df2.loc[:,'5k'])/5
df2['pace10-20'] = (df2.loc[:,'20k']-df2.loc[:,'10k'])/10
df2['pace20-25'] = (df2.loc[:,'25k']-df2.loc[:,'20k'])/5
df2['pace25-30'] = (df2.loc[:,'30k']-df2.loc[:,'25k'])/5
df2['pace30-35'] = (df2.loc[:,'35k']-df2.loc[:,'30k'])/5
df2['pace35-40'] = (df2.loc[:,'40k']-df2.loc[:,'35k'])/5
df2['half2nd'] = (df2.loc[:,'official']-df2.loc[:,'half'])/21.1

df2 = df2.dropna()

df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Unnamed: 0,5k,10k,20k,half,25k,30k,35k,40k,official,age,overall,pace,pace0-5,pace5-10,pace10-20,pace20-25,pace25-30,pace30-35,pace35-40,half2nd
0,8.02,17.37,37.65,39.72,47.67,59.18,71.4,80.43,85.25,47,8,3.27,1.604,1.87,2.028,2.004,2.302,2.444,1.806,2.15782
1,16.22,32.58,65.83,69.47,82.43,99.33,116.37,132.1,138.95,33,21,5.3,3.244,3.272,3.325,3.32,3.38,3.408,3.146,3.292891
2,7.75,16.62,36.1,38.03,45.8,56.45,67.42,76.1,80.6,41,1,3.08,1.55,1.774,1.948,1.94,2.13,2.194,1.736,2.017536
3,16.2,32.57,65.83,69.47,82.43,99.33,116.37,132.95,140.58,24,27,5.37,3.24,3.274,3.326,3.32,3.38,3.408,3.316,3.370142
4,8.02,17.12,36.58,38.6,46.37,57.03,67.83,76.72,81.23,40,2,3.1,1.604,1.82,1.946,1.958,2.132,2.16,1.778,2.020379


In [7]:
df2.shape

X = pd.DataFrame(normalize(df2, axis = 0), columns=df2.columns)

In [8]:
clustorcolormap = {0 : 'red',
                   1 : 'cyan',
                   2 : 'blue',
                   3: 'maroon', 
                   4: 'black',
                   5: 'grey', 
                   6:'purple',
                   7:'orange'}

In [None]:
tic()
bandwidth = estimate_bandwidth(X.values, quantile=0.3)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)

mslabel_color = []
for l in ms.labels_:
    mslabel_color.append(clustorcolormap[l])

ms_n_clusters_ = len(np.unique(ms.labels_))

print("Number of clusters estimated by MeanShift: {}".format(ms_n_clusters_))
tac()

Number of clusters estimated by MeanShift: 7
Time passed: 0hour:1min:57sec


In [None]:
tic()
sc = SpectralClustering()
sc.fit(X)
tac()

In [None]:
sclabel_color = [clustorcolormap[l] for l in sc.labels_]

sc_n_clusters_ = len(np.unique(ms.labels_))

print("Number of clusters estimated by MeanShift: {}".format(sc_n_clusters_))

In [None]:
label_color

In [None]:
plt.figure(figsize=(10,10))
plt.scatter(X['age'], X['half2nd'], c=label_color, alpha=0.4, edgecolors='black')
plt.xlim(X['age'].min(),X['age'].max())
plt.ylim(X['half2nd'].min(),X['half2nd'].max())
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.scatter(X['age'], X['overall'], c=label_color, alpha=0.4, edgecolors='black')
plt.xlim(X['age'].min(),X['age'].max())
plt.ylim(X['overall'].min(),X['overall'].max())
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.scatter(X['pace0-5'], X['pace35-40'], c=label_color, alpha=0.4, edgecolors='black')
plt.xlim(X['pace0-5'].min(),X['pace0-5'].max())
plt.ylim(X['pace35-40'].min(),X['pace35-40'].max())
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.scatter(X['official'], X['overall'], c=label_color, alpha=0.4, edgecolors='black')
plt.xlim(X['official'].min()*0.8, X['official'].max()*1.1)
plt.ylim(X['overall'].min()*0.8, X['overall'].max()*1.1)
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.scatter(X['pace'], X['overall'], c=label_color, alpha=0.4, edgecolors='black')
plt.xlim(X['official'].min()*0.8, X['official'].max()*1.1)
plt.ylim(X['overall'].min()*0.8, X['overall'].max()*1.1)
plt.show()
