In [1]:
import requests
from selenium.webdriver import (Chrome)
from string import ascii_lowercase
from pymongo import MongoClient
from bs4 import BeautifulSoup
import json
import sys
import pandas as pd
import time
from scipy.stats import ttest_ind, kruskal
import plotly.plotly as py
import plotly.graph_objs as go

In [2]:
def get_results(index_url):
    '''Starts web scraper to get table
    with runner results.  Returns column headers
    and result data.'''
    browser = Chrome()
    browser.get(index_url)
    sel = "gbox_list" 
    cascade100results=browser.find_element_by_id(sel) #setting a css selector
    table_rows = cascade100results.text.split('\n')
    runner_rows = [row.split() for row in table_rows]
    cols = runner_rows[0:10] #column headers
    content = runner_rows[11:] #the actual runner results
    return (cols, content)

In [3]:
root_url = 'http://ultrasignup.com'
index_url = root_url + '/results_event.aspx?did=41880'

In [4]:
cols, content = get_results(index_url=index_url)

In [5]:

#find index of Did Not Finish and Did Not Start
def find_idx(content):
    indices = []
    for idx, row in enumerate(content):
        if row[0] == 'Did':
            indices.append(idx)
    return indices

In [6]:
indices = find_idx(content)
indices

[109, 163]

In [7]:
finished = content[0:109]

In [8]:
DNF = content[111:163]

In [9]:
DNS = content[163:]

In [10]:
def make_finisher_df(content, idx_1):
    '''take in runner row content and the index
    indicating the end of the finishers list (idx_1).  
    return dataframe with the male and female times.'''
    finished = content[0:idx_1]
    gender = []
    times = []
    for row in finished:
        times.append(row[-2])
        gender.append(row[-4])
    df = pd.DataFrame({'Gender': gender, 'Time': times})
    df['Time'] = df['Time'].str.split(':')
    df['Time'] = df['Time'].apply(lambda x: int(x[0]) * 60 + int(x[1]) + float(x[2])/60)
    return df

In [11]:
cascade1002017 = make_finisher_df(content, indices[0])

In [12]:
males = cascade1002017[cascade1002017['Gender'] == 'M']

In [13]:
females = cascade1002017[cascade1002017['Gender'] == 'F']

In [14]:
females['Time'].mean()
females['Time'].sem()

32.17009942137171

In [15]:
males['Time'].mean()

1762.6929824561405

In [16]:
#t-test on males vs females finishing times
cascade_ttest2017 = ttest_ind(females['Time'], males['Time'], equal_var=False)

In [17]:
trace1 = [go.Bar(
    x=['Male', 'Female'],
    y=[males['Time'].mean(), females['Time'].mean()],
    name='Cascade Crest 100',
    error_y=dict(
        type='data',
        array=[males['Time'].sem(), females['Time'].sem()],
        visible=True
    )
)
         ]

In [18]:

py.iplot(trace1, filename='error-bar-bar')

In [19]:
def make_DNF_counts(DNF):
    '''Takes in the DNF list as an input
    and returns the male and female proportional counts.'''
    males = 0
    females = 0
    for row in DNF:
        if row[-3] == 'M':
            males +=1
        else:
            females += 1
    return males/len(DNF), females/len(DNF)


In [20]:
pct_male, pct_female = make_DNF_counts(DNF)

In [21]:
len(females)/len(content)

0.2

In [22]:
len(males)/len(content)

0.46060606060606063

In [23]:
len(males) + pct_male*len(DNF)

113.0

In [24]:
len(females) + pct_female*len(DNF)

48.0

In [25]:
48/(113+48) #30 pct of the entire field is female, 29% of DNFs are female

0.2981366459627329

In [26]:
113/(113+48) #70 pct of the entire field is male, 71% of the DNFs are male

0.7018633540372671

In [27]:
pct_female

0.28846153846153844

In [28]:
pct_male

0.7115384615384616

In [29]:
cc100_2018_url = root_url + '/results_event.aspx?did=51691'

In [30]:
cols_2, content_2 = get_results(cc100_2018_url)

In [31]:
find_idx(content_2)

[]

In [32]:
cascade1002018 = make_finisher_df(content_2, len(content_2))
len(cascade1002018)

146

In [33]:
males_2018 = cascade1002018[cascade1002018['Gender'] == 'M']

In [34]:
females_2018 = cascade1002018[cascade1002018['Gender'] == 'F']

In [35]:
#t-test on males vs females finishing times
cascade_ttest2018 = ttest_ind(females_2018['Time'], males_2018['Time'], equal_var=False)

In [36]:
trace4 = [go.Bar(
    x=['Male', 'Female'],
    y=[males_2018['Time'].mean(), females_2018['Time'].mean()],
    name='Cascade Crest 100 2018',
    error_y=dict(
        type='data',
        array=[males_2018['Time'].sem(), females_2018['Time'].sem()],
        visible=True
    )
)
         ]

In [37]:
py.iplot(trace4, filename='Cascade100_2018')

In [38]:
RRR100_2018_tortoises = root_url + '/results_event.aspx?did=60025'

In [39]:
cols_RRR, content_RRR = get_results(RRR100_2018_tortoises)

In [40]:
find_idx(content_RRR)

[]

In [41]:
RRR2018_tortoises = make_finisher_df(content_RRR, len(content_RRR))

In [42]:
male_tortoises = RRR2018_tortoises[RRR2018_tortoises['Gender'] == 'M']

In [43]:
female_tortoises = RRR2018_tortoises[RRR2018_tortoises['Gender'] == 'F']

In [44]:
RRR_tort_ttest = ttest_ind(female_tortoises['Time'], male_tortoises['Time'], equal_var=False)

In [45]:
len(RRR2018_tortoises)

166

In [46]:
trace2 = go.Bar(
    x=['Male', 'Female'],
    y=[male_tortoises['Time'].mean(), female_tortoises['Time'].mean()],
    name='RRR 100',
    error_y=dict(
        type='data',
        array=[male_tortoises['Time'].sem(), female_tortoises['Time'].sem()],
        visible=True
    )
)
         

In [47]:
data=go.Data([trace2])
layout=go.Layout(title="RRR Tortoises 100", xaxis={'title':'Gender'}, yaxis={'title':'Time(minutes)'})
figure=go.Figure(data=data,layout=layout)


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.




In [48]:
py.iplot(figure, filename='error-bar-RRR', layout=layout)

In [49]:
hardrock100_url = root_url + '/results_event.aspx?did=51281'

In [50]:
cols_HR, content_HR = get_results(hardrock100_url)

In [51]:
indices = find_idx(content_HR)


In [52]:
HR_finishers = make_finisher_df(content_HR, indices[0])
len(HR_finishers)

113

In [53]:
DNF_HR = content_HR[indices[0]:]

In [54]:
HR_males = HR_finishers[HR_finishers["Gender"] == 'M']
HR_females = HR_finishers[HR_finishers["Gender"] == 'F']

In [55]:
HR_ttest = ttest_ind(HR_females['Time'], HR_males['Time'], equal_var=False)

In [56]:
def make_DNF_counts2(DNF):
    '''Takes in the DNF list as an input
    and returns the male and female proportional counts.'''
    males = 0
    females = 0
    for row in DNF:
        if row[-4] == 'M':
            males +=1
        else:
            females += 1
    return males/len(DNF), females/len(DNF)

In [57]:
DNF_HR_male, DNF_HR_female = make_DNF_counts2(DNF_HR)

In [58]:
DNF_HR_female

0.09090909090909091

In [59]:
DNF_HR_male

0.9090909090909091

In [60]:
len(HR_females)/len(HR_finishers)

0.09734513274336283

In [61]:
len(HR_males)/len(HR_finishers)

0.9026548672566371

In [62]:
HURT100url = root_url + '/results_event.aspx?did=56682'

In [63]:
cols_HURT, content_HURT = get_results(HURT100url)

In [64]:
idx_hurt = find_idx(content_HURT)
idx_hurt

[68, 130]

In [65]:
HURT_finishers = make_finisher_df(content_HURT, idx_hurt[0] )
len(HURT_finishers)

68

In [66]:
HURT_males = HURT_finishers[HURT_finishers["Gender"] == 'M']
HURT_females = HURT_finishers[HURT_finishers["Gender"] == 'F']

In [67]:
HURT_ttest = ttest_ind(HURT_females['Time'], HURT_males['Time'], equal_var=False)

In [68]:
JJ_url = root_url + '/results_event.aspx?did=53326'

In [69]:
JJ_cols, JJ_content = get_results(JJ_url)

In [70]:
idx_JJ = find_idx(JJ_content)

In [71]:
JJ_finishers = make_finisher_df(JJ_content, idx_JJ[0])
len(JJ_finishers)

367

In [72]:
JJ_males = JJ_finishers[JJ_finishers['Gender'] == 'M']
JJ_females = JJ_finishers[JJ_finishers['Gender'] == 'F']

In [73]:
JJ_ttest = ttest_ind(JJ_females['Time'], JJ_males['Time'], equal_var=False)
JJ_ttest

Ttest_indResult(statistic=5.590100276884938, pvalue=5.351719513378266e-08)

In [74]:
JJ_DNF = JJ_content[idx_JJ[0]:]


In [75]:
JJ_DNF_M, JJ_DNF_F = make_DNF_counts2(JJ_DNF) 

In [76]:
JJ_DNF_M

0.7142857142857143

In [77]:
((JJ_DNF_M*len(JJ_DNF))+len(JJ_males))/len(JJ_content)

0.6820603907637656

In [78]:
((JJ_DNF_F*len(JJ_DNF))+ len(JJ_females))/len(JJ_content)

0.31793960923623443

In [79]:
Orcas_url = root_url + '/results_event.aspx?did=47664'

In [80]:
Orcas_cols, Orcas_content = get_results(Orcas_url)

In [81]:
idx_Orcas = find_idx(Orcas_content)
idx_Orcas

[68, 91]

In [82]:
Orcas_finishers = make_finisher_df(Orcas_content, idx_Orcas[0])

In [83]:
Orcas_males = Orcas_finishers[Orcas_finishers['Gender'] == 'M']
Orcas_females = Orcas_finishers[Orcas_finishers['Gender'] == 'F']

In [84]:
Orcas_ttest = ttest_ind(Orcas_females['Time'], Orcas_males['Time'], equal_var=False)
Orcas_ttest

Ttest_indResult(statistic=1.0244482021226133, pvalue=0.3115368992817924)

In [85]:
#kruskal-wallis test of median differences
KW_Orcas = kruskal(Orcas_females['Time'], Orcas_males['Time'])
KW_Orcas

KruskalResult(statistic=0.28208214231983675, pvalue=0.5953397432309312)

In [86]:
len(Orcas_finishers)

68

In [87]:
DNF_Orcas = Orcas_content[idx_Orcas[0]:]

In [88]:
DNF_M_ORCAS, DNF_F_ORCAS = make_DNF_counts(DNF_Orcas)

In [89]:
len(Orcas_males)/len(Orcas_finishers)

0.75

In [90]:
len(Orcas_females)/len(Orcas_finishers)

0.25

In [91]:
(DNF_F_ORCAS*len(DNF_Orcas) + len(Orcas_females))/len(Orcas_content)

0.3163265306122449

In [92]:
(DNF_M_ORCAS*len(DNF_Orcas) + len(Orcas_males))/len(Orcas_content)

0.6836734693877551

In [93]:
trace3 = go.Bar(
    x=['Male', 'Female'],
    y=[Orcas_males['Time'].mean(), Orcas_females['Time'].mean()],
    name='RRR 100',
    error_y=dict(
        type='data',
        array=[Orcas_males['Time'].sem(), Orcas_females['Time'].sem()],
        visible=True
    )
)
   

In [94]:
data=go.Data([trace3])
layout=go.Layout(title="Orcas 100", xaxis={'title':'Gender'}, yaxis={'title':'Time(minutes)'})
figure=go.Figure(data=data,layout=layout)


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.




In [95]:
py.iplot(figure, filename='error-bar-Orcas100', layout=layout)

In [96]:
wasatch_url = root_url + '/results_event.aspx?did=52224'

In [97]:
wasatch_cols, wasatch_content = get_results(wasatch_url)

In [98]:
idx_wasatch = find_idx(wasatch_content)

In [99]:
idx_wasatch

[186]

In [100]:
wasatch_finishers = make_finisher_df(wasatch_content, idx_wasatch[0])

In [101]:
wasatch_finishers_M = wasatch_finishers[wasatch_finishers["Gender"] == 'M']
wasatch_finishers_F = wasatch_finishers[wasatch_finishers["Gender"] == 'F']

In [102]:
wasatch_ttest = ttest_ind(wasatch_finishers_F['Time'], wasatch_finishers_M['Time'], equal_var=False)
wasatch_ttest

Ttest_indResult(statistic=0.5360641108726018, pvalue=0.5943608058547083)

In [103]:
len(wasatch_finishers)

186

In [104]:
bear_url = root_url + '/results_event.aspx?did=50900'

In [105]:
bear_cols, bear_content = get_results(bear_url)

In [106]:
idx_bear = find_idx(bear_content)
idx_bear

[220, 317]

In [107]:
bear_finishers = make_finisher_df(bear_content, idx_bear[0])

In [108]:
bear_M = bear_finishers[bear_finishers['Gender'] == 'M']
bear_F = bear_finishers[bear_finishers['Gender'] == 'F']

In [109]:
bear_ttest = ttest_ind(bear_F['Time'], bear_M['Time'], equal_var=False)
bear_ttest

Ttest_indResult(statistic=2.8621153843332463, pvalue=0.005218411498656486)

In [110]:
len(bear_finishers)

220

In [111]:
GA_url = root_url + '/results_event.aspx?did=48295'

In [112]:
GA_cols, GA_content = get_results(GA_url)

In [113]:
idx_GA = find_idx(GA_content)

In [114]:
GA_finishers = make_finisher_df(GA_content, idx_GA[0])

In [115]:
GA_M = GA_finishers[GA_finishers['Gender'] == 'M']
GA_F = GA_finishers[GA_finishers['Gender'] == 'F']

In [116]:
GA_ttest = ttest_ind(GA_F['Time'], GA_M['Time'], equal_var=False)
GA_ttest

Ttest_indResult(statistic=1.120759069440516, pvalue=0.26861039619818877)

In [117]:
len(GA_finishers)

168

In [127]:
frames = ([GA_finishers, Orcas_finishers, JJ_finishers, cascade1002017, cascade1002018, bear_finishers, wasatch_finishers,
         HURT_finishers, HR_finishers, WSER_finishers])

In [128]:
concatenated = pd.concat(frames, ignore_index=True)

In [129]:
concatenated_M = concatenated[concatenated['Gender'] == "M"]
concatenated_F = concatenated[concatenated['Gender'] == 'F']

In [130]:
concatenated_ttest = ttest_ind(concatenated_F['Time'], concatenated_M['Time'], equal_var=False)

In [131]:
concatenated_ttest

Ttest_indResult(statistic=0.7462965793805261, pvalue=0.45571627680154625)

In [132]:
len(concatenated)

1743

In [133]:
WSER_url = root_url + '/results_event.aspx?did=51243'

In [121]:
WSER_cols, WSER_content = get_results(WSER_url)


In [122]:
idx_WSER = find_idx(WSER_content)
idx_WSER

[298]

In [123]:
WSER_finishers = make_finisher_df(WSER_content, idx_WSER[0])

In [124]:
WSER_M = WSER_finishers[WSER_finishers['Gender'] == 'M']
WSER_F = WSER_finishers[WSER_finishers['Gender'] == 'F']

In [125]:
WSER_ttest = ttest_ind(WSER_F['Time'], WSER_M['Time'], equal_var=False)
WSER_ttest

Ttest_indResult(statistic=-0.5124951139890147, pvalue=0.6095880492932007)

In [134]:
trace4 = go.Bar(
    x=['Male', 'Female'],
    y=[concatenated_M['Time'].mean(), concatenated_F['Time'].mean()],
    name='Mean Finish Times for 100 mile (or similar) Races',
    error_y=dict(
        type='data',
        array=[concatenated_M['Time'].sem(), concatenated_F['Time'].sem()],
        visible=True
    )
)
   

In [135]:
data=go.Data([trace4])
layout=go.Layout(title="Concatenated Mean Finish Times for 100 mile (or similar) Races", xaxis={'title':'Gender'}, yaxis={'title':'Time (minutes)'})
figure=go.Figure(data=data,layout=layout)


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.




In [136]:
py.iplot(figure, filename='Concatenated 100', layout=layout)

In [137]:
concatenated.head()

Unnamed: 0,Gender,Time
0,M,743.516667
1,M,746.883333
2,M,748.95
3,M,763.183333
4,M,799.483333


In [142]:
concatenated.columns

Index(['Gender', 'Time'], dtype='object')

In [145]:
sort_df = concatenated.sort_values(by=['Time'])

In [151]:
sort_df = sort_df.reset_index()

In [169]:
#analyzing the bottom 10% of WSER finishers

In [159]:
len(WSER_finishers)

298

In [165]:
bottom_10_WSER = WSER_finishers.iloc[-29:]

In [168]:
bottom_10_WSER.groupby(['Gender'])['Time'].mean()

Gender
F    1785.148148
M    1783.840000
Name: Time, dtype: float64