In [None]:
import requests
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
import re

%matplotlib inline

# Results page for the race being analysed.
url = 'https://www.hubertiming.com/results/2017GPTR10K'

# Fetch and parse inside a context manager so the HTTP response is closed as
# soon as BeautifulSoup has read it, instead of leaving the connection open
# for the lifetime of the kernel.
with urlopen(url) as html:
    soup = bs(html, 'lxml')

# Plain-text rendering of the whole page (all markup removed).
text = soup.get_text()

# Introduction and objectives of the analysis.
print('The dataset that I will use for analysis was taken from a 10K race that took place in Hillsboro,on June 2017.')
print('The objective of this analysis is to find :')
print('1.The average finish time for the 10k runners')
print('2.If the runners\' finish times follow a normal distribution')
print('3.Any performance differences between males and females of various age groups')


In [None]:
# Collect every <tr> element found on the page.
rows = soup.find_all(name='tr')

# Preview the first ten rows; they are still raw markup at this stage.
print(rows[0:10])

In [None]:
# Goal of this step: get all table rows in list form, so that the list can
# then be converted into a dataframe for easier manipulation.
#
# `row_td` is overwritten on every pass, so once the loop finishes it holds
# the <td> cells of the LAST row only; that final value is printed as a
# sample of the raw cell markup.
for row in rows:
    row_td = row.find_all(name='td')
print(row_td)



In [None]:
# Use a regular expression to remove the HTML tags from every row.
# The pattern does not depend on the row, so it is compiled once, before the
# loop. '<.*?>' lazily matches a single tag: '<', as few characters as
# possible, then '>'.
clean = re.compile('<.*?>')

list_rows = []
for row in rows:
    cells = row.find_all('td')
    # Work on the string form of the whole cell list; once the tags are
    # removed what remains is a bracketed, comma-separated string, which the
    # following cells split on commas and strip of its brackets.
    str_cells = str(cells)
    clean2 = clean.sub('', str_cells)
    list_rows.append(clean2)

# Show the last cleaned row as a sanity check.
print(clean2)




In [None]:
# Convert the list of cleaned row strings into a dataframe (one string per
# row, held in the column labelled 0) and take a quick look at the first
# 10 rows using pandas.
df = pd.DataFrame(data=list_rows)
df.head(n=10)

In [None]:
# Each row is still one comma-separated string. Split it at every comma;
# expand=True spreads the pieces over separate columns.
df1 = df[0].str.split(pat=',', expand=True)
df1.head(n=10)

In [None]:
# The first field of every row still carries the opening square bracket of
# the original list string; strip '[' characters from the ends of column 0.
df1[0] = df1[0].str.strip(to_strip='[')
df1.head(n=10)

In [None]:
# Collect every <th> element, i.e. the table header cells.
col_labels = soup.find_all(name='th')

# Render the header cells as one string and let BeautifulSoup remove the
# markup, keeping only the text in between the HTML tags.
col_str = str(col_labels)
cleantext2 = bs(col_str, "lxml").get_text()

# The header text is kept as a one-element list so that it can be turned
# into a one-row dataframe in the next step.
all_header = [cleantext2]
print(all_header)

In [None]:
# Turn the one-element header list into a dataframe: one row, one column (0).
df2 = pd.DataFrame(data=all_header)

# Split column 0 at every comma so each header label gets its own column.
df3 = df2[0].str.split(pat=',', expand=True)

# Stack the header row on top of the data rows. Both frames keep their own
# index labels here, so labels repeat in the combined frame.
frames = [df3, df1]
df4 = pd.concat(frames)

# Use the values of the first row (the header labels) as the column names.
# The row itself is still present in the data after this step.
df5 = df4.rename(columns=df4.iloc[0])
df5.head()

In [None]:
# Overview of the data before analysis: printed by DataFrame.info().
df5.info()

# Keep complete rows only: a row is dropped as soon as it has at least one
# missing value.
df6 = df5.dropna(axis='index', how='any')

In [None]:
# Drop the replicated table header, which is the first row of df6, and
# finish cleaning the column names.
#
# The row is removed by POSITION (iloc[1:]), not by label: the frame was
# assembled with pd.concat, so index labels repeat, and a label-based drop of
# df6.index[0] would also delete any data row sharing the header row's label.
#
# rename() fixes the two column names that still carry list brackets
# ('[Place' and ' Team]'); assign() then strips the closing bracket from the
# cells of the "Team" column. Both return new frames, so df6 is left untouched
# and re-running this cell gives the same result.
df7 = (
    df6.iloc[1:]
    .rename(columns={'[Place': 'Place', ' Team]': 'Team'})
    .assign(Team=lambda frame: frame['Team'].str.strip(']'))
)
df7.head()

In [None]:
# Convert the " Chip Time" column into minutes.

def _chip_time_to_minutes(chip_time):
    """Convert one chip-time string into minutes.

    The string is split on ':' and read from the right as seconds, minutes,
    hours; fields missing on the left count as zero. This covers 'mm:ss',
    'h:mm:ss' and 'hh:mm:ss' alike, instead of guessing the layout from the
    length of the string. Surrounding whitespace is ignored.

    Parameters
    ----------
    chip_time : str
        Time text such as ' 36:21' or ' 1:02:13'.

    Returns
    -------
    float
        The time expressed in minutes.

    Raises
    ------
    ValueError
        If there are more than three ':'-separated fields, or a field is not
        an integer.
    """
    parts = chip_time.strip().split(':')
    if len(parts) > 3:
        raise ValueError('unrecognised chip time: {!r}'.format(chip_time))
    # Left-pad with zeros so that e.g. 'mm:ss' is read as 0 hours.
    hours, minutes, seconds = [0] * (3 - len(parts)) + [int(part) for part in parts]
    return (hours * 3600 + minutes * 60 + seconds) / 60


time_list = df7[' Chip Time'].tolist()
time_mins = [_chip_time_to_minutes(chip_time) for chip_time in time_list]

# Store the converted values in a new column ("Runner_mins") holding each
# runner's chip time expressed in just minutes.
df7['Runner_mins'] = time_mins
df7.head()

In [None]:
# Headline findings. These figures are hard-coded text; they are not computed
# from the dataframe in this cell.
for headline in (
    'The average chip time for all runners was ~60 mins.',
    'The fastest 10K runner finished in 36.35 mins.',
    'The slowest runner finished in 101.30 minutes.',
):
    print(headline)

# Summary statistics restricted to the numeric columns of the dataframe.
df7.describe(include=[np.number])



In [None]:
# A boxplot to visualize the summary statistics of the runners' chip times.

# Widen the default figure size. rcParams is reached through the
# already-imported `plt`, so this cell needs no import of its own.
plt.rcParams['figure.figsize'] = 15, 5

print('Below are data summary statistics for the runners shown in a boxplot.')
print('(Maximum, minimum, median, first quartile, third quartile, including outliers)')

# Single box for the "Runner_mins" column, with horizontal grid lines only
# and a readable label in place of the default tick at position 1.
df7.boxplot(column='Runner_mins')
plt.grid(True, axis='y')
plt.ylabel('Chip Time')
plt.xticks([1], ['Runners'])

In [None]:
# Distribution plot (histogram with 25 bins plus a KDE curve) of the runners'
# chip times, plotted using the seaborn library.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in
# favour of histplot/displot. The call is left as-is so the cell keeps
# running on the seaborn version this notebook was written against; confirm
# the installed version before migrating.
x = df7['Runner_mins']
ax = sns.distplot(x, hist=True, kde=True, rug=False, color='m', bins=25, hist_kws={'edgecolor':'black'})
plt.show()

# Double quotes avoid having to escape the apostrophe inside the sentence.
print("The distribution of the runners' chip times looks almost normal")

In [None]:
# Finding out whether there were any performance differences between male and
# female runners by overlaying the two chip-time distributions.

# Select rows and column in a single .loc call rather than chaining two
# indexing operations. The gender values are compared with a leading space
# (' F', ' M') because that is how they are stored after the comma split.
f_fuko = df7.loc[df7[' Gender'] == ' F', 'Runner_mins']
m_fuko = df7.loc[df7[' Gender'] == ' M', 'Runner_mins']

# Females: histogram plus KDE curve. Males: KDE curve only, so no histogram
# styling is passed for that call.
sns.distplot(f_fuko, hist=True, kde=True, rug=False, hist_kws={'edgecolor':'black'}, label='Female')
sns.distplot(m_fuko, hist=False, kde=True, rug=False, label='Male')
plt.legend()

print('The distribution indicates that females were slower than males on average.')

In [None]:
# Summary statistics computed separately for each value of the " Gender"
# column, using the groupby() method.
by_gender = df7.groupby(" Gender", as_index=True)
g_stats = by_gender.describe()

# Print the table followed by one blank line, then the headline. The averages
# in the headline are hard-coded text, not read from g_stats here.
print(g_stats, end='\n\n')
print('The average chip time for all females and males was ~66 mins and ~58 mins, respectively')

In [None]:
# Caption, followed by one blank line.
print('Side-by-side boxplot comparison of male and female finish times.', end='\n\n')

# One box per value of the ' Gender' column.
df7.boxplot(by=' Gender', column='Runner_mins')
plt.ylabel('Chip Time')
# Blank out the figure-level title (presumably the automatic "grouped by"
# title pandas puts on such plots; confirm against the rendered figure).
plt.suptitle("")

In [None]:
# Closing recommendations: a heading and two blocks of advice, printed
# verbatim and in order.
recommendation_blocks = (
    'RECOMMENDATION:',
    '''
Since women perfomed poorer than the men, I would recommend that:
1. They make better training plans, typically ranging from 12 to 20 weeks
2. Aim to build their weekly mileage up to 50 miles over the four months leading up to race day
3. Hydrate well for several days leading up to your marathon. 
    ''',
    '''
The runners can improve their performance before the race by:
1. Hydrating well for several days leading up to your marathon.
2. Eating a simple, high-carbohydrate breakfast several hours before the start of the race.
3. Lathering up with a little Vaseline or BodyGlide in any areas vulnerable to chafing 
4. Getting to the starting line early, and if needed, get in the port-a-potty line 30–40 minutes before the official start time.
5. Not overdressing

The runners can improve their performance during the race by:
1. Starting slowly. It's easy to get caught up in race-day adrenaline, but starting too fast is a big rookie mistake. 
2. Not blazing by every aid station or try to drink from a cup while running full blast.
3. Enjoying the energy of the spectators.
    ''',
)
for block in recommendation_blocks:
    print(block)
