# Web Parsing

April 2019

In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from bm_util import parse_table_from_url

### Billboard Top 100 Songs

In [2]:
def _fetch_billboard_lyrics(item, timeout):
    """Return the lyrics text linked from one chart entry, or None if unavailable.

    Lyrics are optional: a missing link, a failed download, or a lyrics page
    without the expected markup all yield None rather than an error.
    """
    lyrics_box = item.find(class_="chart-list-item__lyrics")
    link = lyrics_box.find('a') if lyrics_box is not None else None
    url = link.get('href') if link is not None else None
    if not url:
        return None

    try:
        lyrics_page = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best effort: one unreachable lyrics page must not abort the whole chart.
        return None

    lyrics_soup = BeautifulSoup(lyrics_page.content, 'html.parser')
    body = lyrics_soup.find(class_="article__body js-fitvids-content")
    if body is None:
        return None
    blocks = body.find_all(class_="lyrics")
    if not blocks:
        return None
    # Keep only the text that precedes the first non-breaking space.
    return blocks[0].get_text().split("\xa0")[0]


def parse_billboard_100(timeout=10):
    """Parse the Billboard Hot 100 chart page into a DataFrame.

    Downloads the chart page, extracts the artist and song title of every
    chart entry and, for entries that link to a lyrics page, downloads that
    page and extracts its lyrics text.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 10).
        Without a timeout a stalled server would block the call indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per chart entry, in page order, with the columns
        ``Artist``, ``Song`` and ``Lyrics``. ``Lyrics`` is ``None`` when no
        lyrics could be retrieved for the entry. If the page lists no
        entries the frame is empty but still has the three columns.

    Raises
    ------
    requests.RequestException
        If the chart page itself cannot be downloaded.
    AttributeError
        If the page has no ``chart-details`` element, or an entry lacks its
        title or artist element.
    """
    columns = ['Artist', 'Song', 'Lyrics']
    # Strip everything except ASCII letters, digits, parentheses, colons and spaces.
    unwanted = re.compile(r"[^0-9a-zA-Z(): ]+")

    chart_page = requests.get('https://www.billboard.com/charts/hot-100', timeout=timeout)
    chart_soup = BeautifulSoup(chart_page.content, 'html.parser')
    chart = chart_soup.find(class_="chart-details")
    items = chart.find_all(class_="chart-list-item")

    records = []
    for item in items:
        song = unwanted.sub("", item.find(class_="chart-list-item__title-text").get_text())
        artist = unwanted.sub("", item.find(class_="chart-list-item__artist").get_text())
        records.append({
            "Artist": artist,
            "Song": song,
            "Lyrics": _fetch_billboard_lyrics(item, timeout),
        })

    # Passing `columns` fixes the column order and keeps the schema even when
    # `records` is empty.
    return pd.DataFrame(records, columns=columns)

In [4]:
# Scrape the chart, print its (rows, columns) shape, then preview the first five rows.
top100 = parse_billboard_100()
print(top100.shape)
top100.iloc[:5]

(100, 3)


Unnamed: 0,Artist,Song,Lyrics
0,Ariana Grande,7 Rings,"Yeah, breakfast at Tiffany's and bottles of bu..."
1,Post Malone,Wow,"Post Malone's newest single ""Wow."" was release..."
2,Post Malone Swae Lee,Sunflower (SpiderMan: Into The SpiderVerse),"Ayy, ayy, ayy, ayy (ooh)Ooh, ooh, ooh, ohh (oo..."
3,Halsey,Without Me,Found you when your heart was brokeI filled yo...
4,Cardi B Bruno Mars,Please Me,"Uh, uh, yeah, come onPlease me, babyTurn aroun..."


In [20]:
# Show the lyrics text stored for the first chart entry.
top100.iloc[0]["Lyrics"]

'Yeah, breakfast at Tiffany\'s and bottles of bubblesGirls with tattoos who like getting in troubleLashes and diamonds, ATM machinesBuy myself all of my favorite things (Yeah)Been through some bad shit, I should be a sad bitchWho woulda thought it\'d turn me to a savage?Rather be tied up with calls and not stringsWrite my own checks like I write what I sing, yeah (Yeah)My wrist, stop watchin\', my neck is flossin\'Make big deposits, my gloss is poppin\'You like my hair? Gee, thanks, just bought itI see it, I like it, I want it, I got it (Yeah)I want it, I got it, I want it, I got itI want it, I got it, I want it, I got itYou like my hair? Gee, thanks, just bought itI see it, I like it, I want it, I got it (Yeah)Wearing a ring, but ain\'t gon\' be no "Mrs."Bought matching diamonds for six of my bitchesI\'d rather spoil all my friends with my richesThink retail therapy my new addictionWhoever said money can\'t solve your problemsMust not have had enough money to solve \'emThey say, "Whic

### National Weather Service

Based on the Dataquest [web scraping tutorial](https://www.dataquest.io/blog/web-scraping-tutorial-python/).

In [8]:
# Download the forecast page. The timeout keeps a stalled server from hanging
# the kernel, and raise_for_status() surfaces an HTTP error here instead of as
# an AttributeError on None further down.
page = requests.get(
    'https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168#.XKkpwOtKjm0',
    timeout=10,
)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Each forecast period is a "tombstone-container" inside the seven-day block.
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")

# Inspect the markup of the first period.
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Tonight
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Tonight: Mostly cloudy, with a low around 54. West southwest wind 13 to 18 mph becoming south 6 to 11 mph after midnight. Winds could gust as high as 23 mph. " class="forecast-icon" src="newimages/medium/nbkn.png" title="Tonight: Mostly cloudy, with a low around 54. West southwest wind 13 to 18 mph becoming south 6 to 11 mph after midnight. Winds could gust as high as 23 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Cloudy
 </p>
 <p class="temp temp-low">
  Low: 54 °F
 </p>
</div>


In [9]:
# Pull the text of the three labelled paragraphs out of the first forecast item.
period, short_desc, temp = (
    tonight.find(class_=css_class).get_text()
    for css_class in ("period-name", "short-desc", "temp")
)
print(period, short_desc, temp)

Tonight Mostly Cloudy Low: 54 °F


In [10]:
# The full-sentence forecast is stored in the icon's title attribute.
img = tonight.find("img")
desc = img.attrs['title']
print(desc)

Tonight: Mostly cloudy, with a low around 54. West southwest wind 13 to 18 mph becoming south 6 to 11 mph after midnight. Winds could gust as high as 23 mph. 


In [11]:
# Collect the period name of every forecast item with a CSS selector.
period_tags = seven_day.select(".tombstone-container .period-name")
periods = list(map(lambda tag: tag.get_text(), period_tags))
periods

['Tonight',
 'Sunday',
 'SundayNight',
 'Monday',
 'MondayNight',
 'Tuesday',
 'TuesdayNight',
 'Wednesday',
 'WednesdayNight']

In [12]:
# Same selector approach for the remaining fields of every forecast item.
short_descs = [
    tag.get_text()
    for tag in seven_day.select(".tombstone-container .short-desc")
]
temps = [
    tag.get_text()
    for tag in seven_day.select(".tombstone-container .temp")
]
# The long description is the title attribute of each item's icon.
descs = [
    icon["title"]
    for icon in seven_day.select(".tombstone-container img")
]
print(short_descs, temps, descs)

['Mostly Cloudy', 'DecreasingClouds', 'Mostly Cloudy', 'ChanceShowers', 'ChanceShowers', 'Mostly Sunny', 'Clear', 'Sunny', 'Mostly Clear'] ['Low: 54 °F', 'High: 68 °F', 'Low: 56 °F', 'High: 67 °F', 'Low: 52 °F', 'High: 64 °F', 'Low: 50 °F', 'High: 64 °F', 'Low: 49 °F'] ['Tonight: Mostly cloudy, with a low around 54. West southwest wind 13 to 18 mph becoming south 6 to 11 mph after midnight. Winds could gust as high as 23 mph. ', 'Sunday: Cloudy, then gradually becoming mostly sunny, with a high near 68. South southeast wind 6 to 13 mph becoming west in the afternoon. ', 'Sunday Night: Mostly cloudy, with a low around 56. West wind 7 to 13 mph. ', 'Monday: A 30 percent chance of showers after 11am.  Mostly cloudy, with a high near 67. Light and variable wind becoming west southwest 8 to 13 mph in the morning. ', 'Monday Night: A 50 percent chance of showers.  Mostly cloudy, with a low around 52. West northwest wind 13 to 15 mph, with gusts as high as 18 mph.  New precipitation amounts o

In [13]:
# Assemble the scraped lists into one table, one row per forecast period.
weather = pd.DataFrame(
    dict(
        period=periods,
        short_desc=short_descs,
        temp=temps,
        desc=descs,
    )
)
weather

Unnamed: 0,period,short_desc,temp,desc
0,Tonight,Mostly Cloudy,Low: 54 °F,"Tonight: Mostly cloudy, with a low around 54. ..."
1,Sunday,DecreasingClouds,High: 68 °F,"Sunday: Cloudy, then gradually becoming mostly..."
2,SundayNight,Mostly Cloudy,Low: 56 °F,"Sunday Night: Mostly cloudy, with a low around..."
3,Monday,ChanceShowers,High: 67 °F,Monday: A 30 percent chance of showers after 1...
4,MondayNight,ChanceShowers,Low: 52 °F,Monday Night: A 50 percent chance of showers. ...
5,Tuesday,Mostly Sunny,High: 64 °F,"Tuesday: Mostly sunny, with a high near 64."
6,TuesdayNight,Clear,Low: 50 °F,"Tuesday Night: Clear, with a low around 50."
7,Wednesday,Sunny,High: 64 °F,"Wednesday: Sunny, with a high near 64."
8,WednesdayNight,Mostly Clear,Low: 49 °F,"Wednesday Night: Mostly clear, with a low arou..."


In [14]:
# Extract the first run of digits from strings such as "Low: 54 °F".
# The pattern is a raw string: "\d" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
temp_nums = weather["temp"].str.extract(r"(?P<temp_num>\d+)", expand=False)
# The extracted values are still strings; store an integer copy on the frame.
weather["temp_num"] = temp_nums.astype('int')
temp_nums

0    54
1    68
2    56
3    67
4    52
5    64
6    50
7    64
8    49
Name: temp_num, dtype: object

In [15]:
# Average of the numeric temperatures across all forecast periods.
weather.loc[:, "temp_num"].mean()

58.22222222222222

In [16]:
# A period counts as night when its temperature label contains "Low".
weather["is_night"] = is_night = weather["temp"].str.contains("Low")
is_night

0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
8     True
Name: temp, dtype: bool

In [17]:
# Keep only the rows flagged as night by the boolean mask.
weather.loc[is_night]

Unnamed: 0,period,short_desc,temp,desc,temp_num,is_night
0,Tonight,Mostly Cloudy,Low: 54 °F,"Tonight: Mostly cloudy, with a low around 54. ...",54,True
2,SundayNight,Mostly Cloudy,Low: 56 °F,"Sunday Night: Mostly cloudy, with a low around...",56,True
4,MondayNight,ChanceShowers,Low: 52 °F,Monday Night: A 50 percent chance of showers. ...,52,True
6,TuesdayNight,Clear,Low: 50 °F,"Tuesday Night: Clear, with a low around 50.",50,True
8,WednesdayNight,Mostly Clear,Low: 49 °F,"Wednesday Night: Mostly clear, with a low arou...",49,True


### CoinMarketCap

Coins

In [18]:
coin = parse_table_from_url('https://coinmarketcap.com/all/views/all/')
# Keep columns 1-5 by position. The explicit copy makes the column assignment
# below write to an independent frame rather than a possible view of the
# original (avoids SettingWithCopyWarning).
coin = coin.iloc[:, 1:6].copy()
# Strip non-word characters and '+' from the supply text, convert to a number
# (unparseable values become NaN), then format with thousands separators.
coin['CirculatingSupply'] = (
    pd.to_numeric(
        coin['CirculatingSupply'].apply(lambda x: re.sub(r'[\W+]', "", x)),
        errors='coerce',
    )
    .map('{:,.0f}'.format)
)
coin.head()

Unnamed: 0_level_0,Symbol,MarketCap,Price,CirculatingSupply,Volume24h
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,BTC,"$89,370,801,942",$5068.78,17631637,"$16,860,451,693"
2,ETH,"$17,519,882,358",$165.98,105552349,"$7,144,151,100"
3,XRP,"$14,745,247,317",$0.353232,41743765071,"$1,480,023,096"
4,LTC,"$5,660,723,072",$92.45,61227111,"$3,851,366,411"
5,BCH,"$5,405,941,927",$305.17,17714350,"$2,089,444,804"


Exchanges

In [19]:
# Load the exchange ranking table and discard the 7-day volume graph column.
exchange = parse_table_from_url('https://coinmarketcap.com/rankings/exchanges/')
exchange = exchange.drop('VolGraph7d', axis=1)
exchange.head()

Unnamed: 0_level_0,Name,AdjVol24h,Volume24h,Volume7d,Volume30d,NoMarkets,Change24h,Launched
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,DigiFinex,"$1,543,920,332","$1,543,920,332","$11,196,613,859","$26,197,484,896",164,0.40%,Apr 2018
2,OKEx,"$1,538,333,814","$1,538,333,814","$12,195,123,799","$27,374,946,424",379,12.17%,Jan 2014
3,Binance,"$1,305,593,668","$1,305,593,668","$11,044,970,718","$31,977,026,306",463,-3.13%,Jul 2017
4,BitForex,"$1,262,299,350","$1,262,299,350","$8,188,482,182","$22,613,525,923",181,19.56%,Jun 2018
5,HitBTC,"$1,214,801,686","$1,214,801,686","$7,482,134,895","$21,883,797,199",790,-13.69%,Feb 2014


### TCG Player

In [None]:
# Load the Ice Age price guide and discard the two columns not needed here.
df = parse_table_from_url("https://shop.tcgplayer.com/price-guide/magic/ice-age")
df = df.drop(["PriceTrend", "Unnamed7"], axis=1)
df.head()