# Web Scraping Practice

### Import all required libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

#### Connecting to the practice page provided by Keith Galli
#### This is the link:
<blockquote> https://keithgalli.github.io/web-scraping/webpage.html </blockquote>

In [2]:
# URL of the practice page that the notebook downloads and parses.
link = "https://keithgalli.github.io/web-scraping/webpage.html"

In [3]:
# requests performs browser-style SSL certificate verification by default.
# A timeout is passed explicitly because requests otherwise waits indefinitely
# for a server that never answers, which would hang the whole notebook.
try:
    r = requests.get(link, timeout=10)
    # Turn any 4xx/5xx status code into an HTTPError instead of carrying on
    # with an error page as if it were the real content.
    r.raise_for_status()
except requests.exceptions.RequestException as err:
    # RequestException is the base class of HTTPError (bad status code),
    # ConnectionError (host could not be reached) and Timeout, so a single
    # handler covers every failure mode. Stop here: the following cells
    # cannot do anything useful without a valid response in `r`.
    raise SystemExit(err)


### Convert the response to a BeautifulSoup object

In [4]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = bs(r.content, "html.parser")


In [5]:

# Keep a handle on the <body> element; the lookups below all start from it.
body = soup.find('body')



#### Grab all of the social links from the webpage


###### Do this in 3 different ways

In [6]:
# Way 1: locate the <ul class="socials"> list, then gather every anchor inside it.
social = body.find('ul', class_='socials').find_all('a')
# Read the target URL off each anchor.
social_links = [anchor.get('href') for anchor in social]
print(social_links)


['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


In [7]:
# Way 2: find the <li> items whose class matches the regex "social",
# then step into the first anchor of each item.
social2 = body.find_all('li', class_=re.compile('social'))

social_links2 = [item.find('a').get('href') for item in social2]
print(social_links2)

['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


In [8]:
# Way 3: a CSS selector picks every <a> nested inside an <li class="social">.
social3 = body.select('li.social a')

social_links3 = [anchor.get('href') for anchor in social3]
# Print the list built in this cell, so the output really reflects the
# CSS-selector approach rather than the result of the previous cell.
print(social_links3)


['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']


### Scrape the table

In [9]:
# select() returns every element matching the CSS selector; index 0 takes the
# first <table class="hockey-stats"> and raises IndexError if there is none.
table = body.select('table.hockey-stats')[0]
# Dump the raw HTML to inspect the table's structure before parsing it.
print(table)

<table class="hockey-stats">
<thead>
<tr>
<th class="season" data-sort="">S</th>
<th class="team" data-sort="team">Team</th>
<th class="league" data-sort="league">League</th>
<th class="regular gp" data-sort="gp">GP</th>
<th class="regular g" data-sort="g">G</th>
<th class="regular a" data-sort="a">A</th>
<th class="regular tp" data-sort="tp">TP</th>
<th class="regular pim" data-sort="pim">PIM</th>
<th class="regular pm" data-sort="pm">+/-</th>
<th class="separator"> </th>
<th class="postseason">POST</th>
<th class="postseason gp" data-sort="playoffs-gp">GP</th>
<th class="postseason g" data-sort="playoffs-g">G</th>
<th class="postseason a" data-sort="playoffs-a">A</th>
<th class="postseason tp" data-sort="playoffs-tp">TP</th>
<th class="postseason pim" data-sort="playoffs-pim">PIM</th>
<th class="postseason pm" data-sort="playoffs-pm">+/-</th>
</tr>
</thead>
<tbody>
<tr class="team-continent-NA">
<td class="season sorted">
                  2014-15
              </td>
<td class="team"

In [10]:
# Column labels are the text of the <th> cells in the table's <thead>.
# The regular-season and postseason headers share the same labels
# (GP, G, A, TP, PIM, +/-), so the resulting list contains repeats.
col_names = [th.string for th in table.find('thead').find_all('th')]

print(col_names)

['S', 'Team', 'League', 'GP', 'G', 'A', 'TP', 'PIM', '+/-', '\xa0', 'POST', 'GP', 'G', 'A', 'TP', 'PIM', '+/-']


In [11]:
# One <tr> per season, taken from the table body (the header row lives in <thead>).
table_rows = table.tbody.find_all('tr')

In [12]:
# Build one list of cell texts per table row.
# get_text() is used rather than .string because it also collects the text of
# nested elements inside a cell; strip() removes the padding whitespace and
# newlines that surround the values in the page's HTML.
l = [
    [cell.get_text().strip() for cell in row_tag.find_all('td')]
    for row_tag in table_rows
]

# Assemble the rows into a DataFrame labelled with the scraped header names.
table_df = pd.DataFrame(l, columns=col_names)

In [13]:
# Preview the top of the parsed table to check the values landed in the right columns.
table_df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [14]:
# Keep only the first column for each label and drop any later column whose
# label has already been seen.
# NOTE(review): the repeated labels are the postseason GP/G/A/TP/PIM/+/- headers,
# so this discards the postseason columns rather than redundant data -- confirm
# that dropping them (instead of renaming them) is what is intended.
table_df = table_df.loc[:, ~table_df.columns.duplicated(keep="first")]

In [15]:
# Display the frame after the columns with repeated labels were dropped.
table_df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,
3,2017-18,Did not play,,,,,,,,|,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,
