In [166]:
import requests
from selenium.webdriver import (Chrome)
from string import ascii_lowercase
from pymongo import MongoClient
from bs4 import BeautifulSoup
import json
import sys
import pandas as pd
import time
from scipy.stats import ttest_ind, kruskal
import plotly.plotly as py
import plotly.graph_objs as go

In [14]:

r = requests.get(index_url)  # Fetch the web page
html = r.content  # Store the HTML content as a string

In [15]:
soup = BeautifulSoup(html, 'html.parser')

In [18]:
print (soup.title)

<title>
	2017 Cascade Crest 100 Miler - Results
</title>


In [20]:
for a in soup.findAll('link'):
    print (a['href'])


/favicon.ico
/css/jquery-ui-1.10.3.custom.min.css
/content/acorn_css?v=NuZ1MsCSr_VTh4pgnSm7aC412gvnZ1GFo_FIpsbk7Fo1
//maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css
/themes/ui.jqgrid.css


In [7]:
def get_results(index_url):
    '''Starts web scraper to get table
    with runner results.  Returns column headers
    and result data.'''
    browser = Chrome()
    browser.get(index_url)
    sel = "gbox_list" 
    cascade100results=browser.find_element_by_id(sel) #setting a css selector
    table_rows = cascade100results.text.split('\n')
    runner_rows = [row.split() for row in table_rows]
    cols = runner_rows[0:10] #column headers
    content = runner_rows[11:] #the actual runner results
    return (cols, content)

In [10]:
root_url = 'http://ultrasignup.com'
index_url = root_url + '/results_event.aspx?did=41880'

In [11]:
cols, content = get_results(index_url=index_url)

In [13]:
content

[['results',
  '2',
  'Matthew',
  'Urbanski',
  'Seattle',
  'WA',
  '36',
  'M',
  '2',
  '20:11:25',
  '85.67'],
 ['results',
  '3',
  'Ben',
  'Koss',
  'San',
  'Francisco',
  'CA',
  '37',
  'M',
  '3',
  '22:09:00',
  '89.34'],
 ['results',
  '4',
  'Jesse',
  'Rickert',
  'Gunnison',
  'CO',
  '45',
  'M',
  '4',
  '22:10:27',
  '89.76'],
 ['results',
  '5',
  'Kaytlyn',
  'Gerbin',
  'Issaquah',
  'WA',
  '28',
  'F',
  '1',
  '22:22:45',
  '95.69'],
 ['results',
  '6',
  'Michael',
  'Hauser',
  'Olympia',
  'WA',
  '27',
  'M',
  '5',
  '23:21:03',
  '76.04'],
 ['results',
  '7',
  'Ashley',
  'Nordell',
  'Sisters',
  'OR',
  '37',
  'F',
  '2',
  '23:21:07',
  '97.99'],
 ['results',
  '8',
  'Phil',
  'Shaw',
  'Seattle',
  'WA',
  '31',
  'M',
  '6',
  '23:22:54',
  '91.2'],
 ['results',
  '9',
  'Kedric',
  'Osborne',
  'Klamath',
  'Falls',
  'OR',
  '40',
  'M',
  '7',
  '23:27:27',
  '87.69'],
 ['results',
  '10',
  'Nicholas',
  'Hanson',
  'Kalamazoo',
  'MI',
  '35

In [17]:

#find index of Did Not Finish and Did Not Start
def find_idx(content):
    indices = []
    for idx, row in enumerate(content):
        if row[0] == 'Did':
            indices.append(idx)
    return indices

In [19]:
indices = find_idx(content)
indices

[109, 163]

In [20]:
finished = content[0:109]

In [21]:
DNF = content[111:163]

In [22]:
DNS = content[163:]

In [25]:
def make_finisher_df(content, idx_1):
    '''take in runner row content and the index
    indicating the end of the finishers list (idx_1).  
    return dataframe with the male and female times.'''
    finished = content[0:idx_1]
    gender = []
    times = []
    for row in finished:
        times.append(row[-2])
        gender.append(row[-4])
    df = pd.DataFrame({'Gender': gender, 'Time': times})
    df['Time'] = df['Time'].str.split(':')
    df['Time'] = df['Time'].apply(lambda x: int(x[0]) * 60 + int(x[1]) + float(x[2])/60)
    return df

In [26]:
cascade1002017 = make_finisher_df(content, indices[0])

In [27]:
males = cascade1002017[cascade1002017['Gender'] == 'M']

In [28]:
females = cascade1002017[cascade1002017['Gender'] == 'F']

In [29]:
females['Time'].mean()
females['Time'].sem()

32.17009942137171

In [30]:
males['Time'].mean()

1762.6929824561405

In [33]:
#t-test on males vs females finishing times
cascade_ttest2017 = ttest_ind(females['Time'], males['Time'], equal_var=False)

Ttest_indResult(statistic=0.8494337393385061, pvalue=0.3985578448626709)

In [34]:
trace1 = [go.Bar(
    x=['Male', 'Female'],
    y=[males['Time'].mean(), females['Time'].mean()],
    name='Cascade Crest 100',
    error_y=dict(
        type='data',
        array=[males['Time'].sem(), females['Time'].sem()],
        visible=True
    )
)
         ]

In [35]:

py.iplot(trace1, filename='error-bar-bar')

In [37]:
def make_DNF_counts(DNF):
    '''Takes in the DNF list as an input
    and returns the male and female proportional counts.'''
    males = 0
    females = 0
    for row in DNF:
        if row[-3] == 'M':
            males +=1
        else:
            females += 1
    return males/len(DNF), females/len(DNF)


In [44]:
pct_male, pct_female = make_DNF_counts(DNF)

In [39]:
len(females)/len(content)

0.2

In [41]:
len(males)/len(content)

0.46060606060606063

In [45]:
len(males) + pct_male*len(DNF)

113.0

In [46]:
len(females) + pct_female*len(DNF)

48.0

In [48]:
48/(113+48) #30 pct of the entire field is female, 29% of DNFs are female

0.2981366459627329

In [49]:
113/(113+48) #70 pct of the entire field is male, 71% of the DNFs are male

0.7018633540372671

In [51]:
pct_female

0.28846153846153844

In [52]:
pct_male

0.7115384615384616

In [53]:
cc100_2018_url = root_url + '/results_event.aspx?did=51691'

In [54]:
cols_2, content_2 = get_results(cc100_2018_url)

In [55]:
find_idx(content_2)

[]

In [181]:
cascade1002018 = make_finisher_df(content_2, len(content_2))
len(cascade1002018)

146

In [60]:
males_2018 = cascade1002018[cascade1002018['Gender'] == 'M']

In [61]:
females_2018 = cascade1002018[cascade1002018['Gender'] == 'F']

In [62]:
#t-test on males vs females finishing times
cascade_ttest2018 = ttest_ind(females_2018['Time'], males_2018['Time'], equal_var=False)

Ttest_indResult(statistic=0.16824113676306918, pvalue=0.8674406192664971)

In [176]:
trace4 = [go.Bar(
    x=['Male', 'Female'],
    y=[males_2018['Time'].mean(), females_2018['Time'].mean()],
    name='Cascade Crest 100 2018',
    error_y=dict(
        type='data',
        array=[males_2018['Time'].sem(), females_2018['Time'].sem()],
        visible=True
    )
)
         ]

In [177]:
py.iplot(trace4, filename='Cascade100_2018')

In [63]:
RRR100_2018_tortoises = root_url + '/results_event.aspx?did=60025'

In [64]:
cols_RRR, content_RRR = get_results(RRR100_2018_tortoises)

In [65]:
find_idx(content_RRR)

[]

In [66]:
RRR2018_tortoises = make_finisher_df(content_RRR, len(content_RRR))

In [67]:
male_tortoises = RRR2018_tortoises[RRR2018_tortoises['Gender'] == 'M']

In [68]:
female_tortoises = RRR2018_tortoises[RRR2018_tortoises['Gender'] == 'F']

In [69]:
RRR_tort_ttest = ttest_ind(female_tortoises['Time'], male_tortoises['Time'], equal_var=False)

Ttest_indResult(statistic=2.129305498390613, pvalue=0.039445612238984275)

In [178]:
len(RRR2018_tortoises)

166

In [79]:
trace2 = go.Bar(
    x=['Male', 'Female'],
    y=[male_tortoises['Time'].mean(), female_tortoises['Time'].mean()],
    name='RRR 100',
    error_y=dict(
        type='data',
        array=[male_tortoises['Time'].sem(), female_tortoises['Time'].sem()],
        visible=True
    )
)
         

In [80]:
data=go.Data([trace2])
layout=go.Layout(title="RRR Tortoises 100", xaxis={'title':'Gender'}, yaxis={'title':'Time(minutes)'})
figure=go.Figure(data=data,layout=layout)


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.




In [81]:
py.iplot(figure, filename='error-bar-RRR', layout=layout)

In [82]:
hardrock100_url = root_url + '/results_event.aspx?did=51281'

In [83]:
cols_HR, content_HR = get_results(hardrock100_url)

In [86]:
indices = find_idx(content_HR)


In [182]:
HR_finishers = make_finisher_df(content_HR, indices[0])
len(HR_finishers)

113

In [91]:
DNF_HR = content_HR[indices[0]:]

In [89]:
HR_males = HR_finishers[HR_finishers["Gender"] == 'M']
HR_females = HR_finishers[HR_finishers["Gender"] == 'F']

In [90]:
HR_ttest = ttest_ind(HR_females['Time'], HR_males['Time'], equal_var=False)

Ttest_indResult(statistic=-0.1543399387113303, pvalue=0.8799521753311719)

In [96]:
def make_DNF_counts2(DNF):
    '''Takes in the DNF list as an input
    and returns the male and female proportional counts.'''
    males = 0
    females = 0
    for row in DNF:
        if row[-4] == 'M':
            males +=1
        else:
            females += 1
    return males/len(DNF), females/len(DNF)

In [97]:
DNF_HR_male, DNF_HR_female = make_DNF_counts2(DNF_HR)

In [98]:
DNF_HR_female

0.09090909090909091

In [99]:
DNF_HR_male

0.9090909090909091

In [100]:
len(HR_females)/len(HR_finishers)

0.09734513274336283

In [101]:
len(HR_males)/len(HR_finishers)

0.9026548672566371

In [102]:
HURT100url = root_url + '/results_event.aspx?did=56682'

In [103]:
cols_HURT, content_HURT = get_results(HURT100url)

In [106]:
idx_hurt = find_idx(content_HURT)
idx_hurt

[68, 130]

In [183]:
HURT_finishers = make_finisher_df(content_HURT, idx_hurt[0] )
len(HURT_finishers)

68

In [109]:
HURT_males = HURT_finishers[HURT_finishers["Gender"] == 'M']
HURT_females = HURT_finishers[HURT_finishers["Gender"] == 'F']

In [111]:
HURT_ttest = ttest_ind(HURT_females['Time'], HURT_males['Time'], equal_var=False)

Ttest_indResult(statistic=1.8014922204184567, pvalue=0.08120005920076435)

In [152]:
JJ_url = root_url + '/results_event.aspx?did=53326'

In [153]:
JJ_cols, JJ_content = get_results(JJ_url)

In [154]:
idx_JJ = find_idx(JJ_content)

In [184]:
JJ_finishers = make_finisher_df(JJ_content, idx_JJ[0])
len(JJ_finishers)

367

In [156]:
JJ_males = JJ_finishers[JJ_finishers['Gender'] == 'M']
JJ_females = JJ_finishers[JJ_finishers['Gender'] == 'F']

In [157]:
JJ_ttest = ttest_ind(JJ_females['Time'], JJ_males['Time'], equal_var=False)
JJ_ttest

Ttest_indResult(statistic=5.590100276884938, pvalue=5.351719513378266e-08)

In [158]:
JJ_DNF = JJ_content[idx_JJ[0]:]


In [159]:
JJ_DNF_M, JJ_DNF_F = make_DNF_counts2(JJ_DNF) 

In [163]:
JJ_DNF_M

0.7142857142857143

In [161]:
((JJ_DNF_M*len(JJ_DNF))+len(JJ_males))/len(JJ_content)

0.6820603907637656

In [162]:
((JJ_DNF_F*len(JJ_DNF))+ len(JJ_females))/len(JJ_content)

0.31793960923623443

In [121]:
Orcas_url = root_url + '/results_event.aspx?did=47664'

In [122]:
Orcas_cols, Orcas_content = get_results(Orcas_url)

In [129]:
idx_Orcas = find_idx(Orcas_content)
idx_Orcas

[68, 91]

In [125]:
Orcas_finishers = make_finisher_df(Orcas_content, idx_Orcas[0])

In [126]:
Orcas_males = Orcas_finishers[Orcas_finishers['Gender'] == 'M']
Orcas_females = Orcas_finishers[Orcas_finishers['Gender'] == 'F']

In [128]:
Orcas_ttest = ttest_ind(Orcas_females['Time'], Orcas_males['Time'], equal_var=False)
Orcas_ttest

Ttest_indResult(statistic=1.0244482021226133, pvalue=0.3115368992817924)

In [171]:
#kruskal-wallis test of median differences
KW_Orcas = kruskal(Orcas_females['Time'], Orcas_males['Time'])
KW_Orcas

KruskalResult(statistic=0.28208214231983675, pvalue=0.5953397432309312)

In [175]:
len(Orcas_finishers)

68

In [130]:
DNF_Orcas = Orcas_content[idx_Orcas[0]:]

In [136]:
DNF_M_ORCAS, DNF_F_ORCAS = make_DNF_counts(DNF_Orcas)

In [134]:
len(Orcas_males)/len(Orcas_finishers)

0.75

In [135]:
len(Orcas_females)/len(Orcas_finishers)

0.25

In [139]:
(DNF_F_ORCAS*len(DNF_Orcas) + len(Orcas_females))/len(Orcas_content)

0.3163265306122449

In [140]:
(DNF_M_ORCAS*len(DNF_Orcas) + len(Orcas_males))/len(Orcas_content)

0.6836734693877551

In [172]:
trace3 = go.Bar(
    x=['Male', 'Female'],
    y=[Orcas_males['Time'].mean(), Orcas_females['Time'].mean()],
    name='RRR 100',
    error_y=dict(
        type='data',
        array=[Orcas_males['Time'].sem(), Orcas_females['Time'].sem()],
        visible=True
    )
)
   

In [173]:
data=go.Data([trace3])
layout=go.Layout(title="Orcas 100", xaxis={'title':'Gender'}, yaxis={'title':'Time(minutes)'})
figure=go.Figure(data=data,layout=layout)


plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.




In [174]:
py.iplot(figure, filename='error-bar-Orcas100', layout=layout)