## WEB SCRAPING

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup


In [10]:
# Race-results page to scrape.
url = "https://www.hubertiming.com/results/2017GPTR"

# Download the page inside a context manager so the HTTP connection is closed
# as soon as the body has been read. `html` holds the raw page bytes, which
# BeautifulSoup accepts directly and which can be re-parsed any number of times
# (a live response object can only be read once).
with urlopen(url) as response:
    html = response.read()

In [11]:
# Parse the downloaded markup with the lxml parser, then confirm the object type
soup = BeautifulSoup(html, features='lxml')
type(soup)


bs4.BeautifulSoup

In [12]:
# Look up the page's <title> element and show it, tags included
title = soup.find('title')
print(title)

<title>Race results for the 2017 Intel Great Place to Run \ Urban Clash Games!</title>


In [15]:
# Extract the page's text content into `text`.
text = soup.get_text()
# Printing is left disabled:
#print(soup.text)


In [16]:
# Collect every anchor (<a>) element on the page; the resulting list is displayed as the cell output
soup.find_all(name='a')

[<a href="mailto:timing@hubertiming.com">timing@hubertiming.com</a>,
 <a href="https://www.hubertiming.com">Huber Timing Home</a>,
 <a class="btn btn-primary btn-lg" href="/results/2017GPTR10K" role="button" style="margin: 0px 0px 5px 5px"><i aria-hidden="true" class="fa fa-user"></i> 10K</a>,
 <a class="btn btn-primary btn-lg" href="/results/summary/2017GPTR" role="button" style="margin: 0px 0px 5px 5px"><i class="fa fa-stream"></i> Summary</a>,
 <a id="individual" name="individual"></a>,
 <a data-url="/results/2017GPTR" href="#tabs-1" id="rootTab" style="font-size: 18px">5K Results</a>,
 <a href="https://www.hubertiming.com/"><img height="65" src="https://www.hubertiming.com//sites/all/themes/hubertiming/images/clockWithFinishSign_small.png" width="50"/>Huber Timing</a>,
 <a href="https://facebook.com/hubertiming/"><img src="https://www.hubertiming.com/results/FB-f-Logo__blue_50.png"/></a>]

# Exercise 1: Extract and print only the hyperlink URLs, without other attributes such as class, src, etc.

In [17]:
# Print the href of each anchor tag, one per line.
# Anchors that carry no href attribute are printed as None.
all_links = soup.find_all(name='a')
for link in all_links:
    href = link.attrs.get('href')
    print(href)

mailto:timing@hubertiming.com
https://www.hubertiming.com
/results/2017GPTR10K
/results/summary/2017GPTR
None
#tabs-1
https://www.hubertiming.com/
https://facebook.com/hubertiming/


In [18]:
# Gather every table row (<tr>) on the page and show the first ten as a sanity check
rows = soup.find_all(name='tr')
first_ten_rows = rows[:10]
print(first_ten_rows)


[<tr colspan="2">
<b>5K:</b>
</tr>, <tr>
<td>Finishers:</td>
<td>1458</td>
</tr>, <tr>
<td>Male:</td>
<td>771</td>
</tr>, <tr>
<td>Female:</td>
<td>687</td>
</tr>, <tr class="header">
<th>Place</th>
<th>Bib</th>
<th>Name</th>
<th>Gender</th>
<th>City</th>
<th>State</th>
<th>Time</th>
<th>Gun Time</th>
</tr>, <tr data-bib="2320">
<td>1</td>
<td>2320</td>
<td>

                    DANIEL M HINCKLEY

                </td>
<td>M</td>
<td>HILLSBORO</td>
<td>OR</td>
<td>16:42</td>
<td>16:44</td>
</tr>, <tr data-bib="2335">
<td>2</td>
<td>2335</td>
<td>

                    KORY F GRAY

                </td>
<td>M</td>
<td>HILLSBORO</td>
<td>OR</td>
<td>17:34</td>
<td>17:35</td>
</tr>, <tr data-bib="1770">
<td>3</td>
<td>1770</td>
<td>

                    FILIP SCHMOLE

                </td>
<td>M</td>
<td>PORTLAND</td>
<td>OR</td>
<td>18:13</td>
<td>18:14</td>
</tr>, <tr data-bib="2584">
<td>4</td>
<td>2584</td>
<td>

                    TRENTON C ROLLING

                </td>
<td>M</td>
<

In [19]:
# Print the <td> cells of the last table row.
# Only the final row's cells are kept and shown, so take that row directly
# instead of parsing every row in a loop and discarding all but the last.
row = rows[-1]
row_td = row.find_all('td')
print(row_td)

[<td>1458</td>, <td>1400</td>, <td>

                    SUMALATHA PURMA

                </td>, <td>F</td>, <td>PORTLAND</td>, <td>OR</td>, <td>1:48:13</td>, <td>1:48:13</td>]


In [24]:
# Remove the HTML tags from the last row's cells (`row_td`):
# render the cell list as a string, re-parse it, and keep only the text.
str_cells = str(row_td)
cells_soup = BeautifulSoup(str_cells, 'lxml')
cell_text = cells_soup.get_text()
print(cell_text)

[1458, 1400, 

                    SUMALATHA PURMA

                , F, PORTLAND, OR, 1:48:13, 1:48:13]


# REGULAR EXPRESSIONS

In [25]:
import re

# Matches any single HTML tag (non-greedy, so each <...> is matched separately).
# Compiled once up front rather than on every loop iteration.
tag_pattern = re.compile('<.*?>')

# For each table row: render its <td> cells as one string and strip the tags.
# Every entry of `list_rows` is therefore a plain string such as "[1, 2320, ...]".
list_rows = [tag_pattern.sub('', str(row.find_all('td'))) for row in rows]

# Sanity check: show the last cleaned row and confirm it is a str
print(list_rows[-1])
type(list_rows[-1])

[1458, 1400, 

                    SUMALATHA PURMA

                , F, PORTLAND, OR, 1:48:13, 1:48:13]


str

# DATA MANIPULATION AND CLEANING

In [28]:
# Load the cleaned row strings into a single-column DataFrame and preview it
df = pd.DataFrame(data=list_rows)
df.head(n=10)

Unnamed: 0,0
0,[]
1,"[Finishers:, 1458]"
2,"[Male:, 771]"
3,"[Female:, 687]"
4,[]
5,"[1, 2320, \r\n\r\n DANIEL M..."
6,"[2, 2335, \r\n\r\n KORY F G..."
7,"[3, 1770, \r\n\r\n FILIP SC..."
8,"[4, 2584, \r\n\r\n TRENTON ..."
9,"[5, 2688, \r\n\r\n YEAN-AN ..."


In [29]:
# Split each row string on commas, spreading the pieces across separate columns
row_strings = df[0]
df1 = row_strings.str.split(pat=',', expand=True)
df1.head(n=10)


Unnamed: 0,0,1,2,3,4,5,6,7
0,[],,,,,,,
1,[Finishers:,1458],,,,,,
2,[Male:,771],,,,,,
3,[Female:,687],,,,,,
4,[],,,,,,,
5,[1,2320,\r\n\r\n DANIEL M HINCKLEY...,M,HILLSBORO,OR,16:42,16:44]
6,[2,2335,\r\n\r\n KORY F GRAY\r\n\r...,M,HILLSBORO,OR,17:34,17:35]
7,[3,1770,\r\n\r\n FILIP SCHMOLE\r\n...,M,PORTLAND,OR,18:13,18:14]
8,[4,2584,\r\n\r\n TRENTON C ROLLING...,M,PORTLAND,OR,18:32,18:35]
9,[5,2688,\r\n\r\n YEAN-AN LIAO\r\n\...,M,HILLSBORO,OR,19:12,19:18]


In [30]:
# Drop the '[' left over from the list repr in the first column
first_column = df1[0]
df1[0] = first_column.str.strip('[')
df1.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7
0,],,,,,,,
1,Finishers:,1458],,,,,,
2,Male:,771],,,,,,
3,Female:,687],,,,,,
4,],,,,,,,
5,1,2320,\r\n\r\n DANIEL M HINCKLEY...,M,HILLSBORO,OR,16:42,16:44]
6,2,2335,\r\n\r\n KORY F GRAY\r\n\r...,M,HILLSBORO,OR,17:34,17:35]
7,3,1770,\r\n\r\n FILIP SCHMOLE\r\n...,M,PORTLAND,OR,18:13,18:14]
8,4,2584,\r\n\r\n TRENTON C ROLLING...,M,PORTLAND,OR,18:32,18:35]
9,5,2688,\r\n\r\n YEAN-AN LIAO\r\n\...,M,HILLSBORO,OR,19:12,19:18]


In [34]:
# Summarize the split frame: per-column non-null counts and dtypes, then its (rows, columns) shape
df1.info()
df1.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1463 entries, 0 to 1462
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1463 non-null   object
 1   1       1461 non-null   object
 2   2       1458 non-null   object
 3   3       1458 non-null   object
 4   4       1458 non-null   object
 5   5       1458 non-null   object
 6   6       1458 non-null   object
 7   7       1458 non-null   object
dtypes: object(8)
memory usage: 91.6+ KB


(1463, 8)