# A web scraping demonstration using Python

In [None]:
# a list stores an ordered, mutable collection of objects;
# here the list literal holds two strings, and print shows the whole list
x = ["one", "two"]
print(x)

In [None]:
# the 'append' method adds a single object to the end of the list,
# changing the existing list in place
x.append("three")
print(x)

In [None]:
# the 'split' method breaks a string into a list of smaller strings,
# cutting wherever the given separator (here a single space) occurs
sentence = "Hello, how are you?"
words = sentence.split(" ")
print(words)

In [None]:
# a 'for loop' is used to iterate over each item in a collection;
# 'enumerate' hands back a running count alongside each item (starting at 1 here),
# so no separate counter variable has to be kept up to date by hand
for num, w in enumerate(words, start=1):
    print("word #", num, ":", w)

In [None]:
# We 'scrape' a web page by downloading the underlying HTML file and parsing its contents
# We will scrape information from Weather.com

from bs4 import BeautifulSoup
import requests

# get the page; the timeout keeps the notebook from hanging if the site never answers
url = "https://weather.com/weather/tenday/l/06226:4:US"
page = requests.get(url, timeout=10)
# stop here with a clear HTTP error (e.g. 403 or 404) rather than parsing an error page
page.raise_for_status()

# parse the page
soup = BeautifulSoup(page.content, 'html.parser')

# find and display the table
table = soup.find("table")
if table is None:
    # soup.find returns None when the page contains no <table> tag at all
    raise RuntimeError("No <table> element found at " + url)
print(table.prettify())


In [None]:
# inspect every cell of the table's second row (index 1);
# note that the high/low temp column has a class named 'temp'
rows = table.find_all("tr")
columns = rows[1].find_all("td")
for c in columns:
    # a blank line after each cell keeps the printed output readable
    print(c, end="\n\n")


In [None]:
# gather every cell whose class is 'temp'
hilow = table.find_all("td", {"class": "temp"})

# splitting on the degree sign gives exactly three pieces when the cell text
# contains two degree signs; the first piece is then taken as the high temperature,
# and any other piece count is recorded as a missing value (None)
pieces = [cell.text.split("°") for cell in hilow]
hi_temps = [int(p[0]) if len(p) == 3 else None for p in pieces]

print(hi_temps)
    

In [None]:

descriptions = table.find_all("td", {"class": "description"})

# keyword -> color, checked in this order; the first keyword found in the text wins
keyword_colors = (("Sunny", "orange"), ("Showers", "blue"))

colors = []
for d in descriptions:
    # show the description text followed by a blank line
    print(d.text, end="\n\n")
    # descriptions containing neither keyword are shown in gray
    colors.append(next((col for word, col in keyword_colors if word in d.text), "gray"))


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# we need to remove missing values for plotting; remember each remaining
# value's original day number so the x-axis labels still name the correct day
dayList = []
hList = []
cList = []
for day, (h, c) in enumerate(zip(hi_temps, colors), start=1):
    if h is not None:
        dayList.append(day)
        hList.append(h)
        cList.append(c)

hi_temps = hList
colors = cList

y_pos = np.arange(len(hi_temps))

plt.bar(y_pos, hi_temps, align='center', alpha=0.5, color=colors)
# one label per bar: a fixed range(1, 16) no longer matches the number of bars
# (and mislabels the days) as soon as a missing value has been dropped above
plt.xticks(y_pos, dayList)
plt.ylabel('High temperature')
plt.title('15 Day Weather Forecast')
plt.show()

In [None]:
# define a function to get the high temps for any ZIP code
# It 'scrapes' the Weather.com forecast page for that ZIP code by downloading
# the underlying HTML and parsing its contents

from collections import namedtuple

def _parse_high_temp(cell_text):
    """Return the high temperature found in a 'temp' cell's text, or None.

    The text is split on the degree sign. Only text containing exactly two
    degree signs (three pieces) is accepted, and its first piece is read as
    the high temperature. Anything else counts as a missing value.
    """
    parts = cell_text.split("°")
    if len(parts) != 3:
        return None
    try:
        return int(parts[0])
    except ValueError:
        # a non-numeric first piece is treated as missing, like the case above
        return None


def _description_color(description):
    """Return the plot color for a forecast description string."""
    if "Sunny" in description:
        return "orange"
    if "Showers" in description:
        return "blue"
    return "gray"


def getHighTemps(zip) :
    """Scrape the Weather.com ten-day forecast page for a ZIP code.

    Parameters:
        zip: the ZIP code as a string, e.g. "06226". The name shadows the
            builtin 'zip' inside this function; it is kept so existing
            callers are unaffected.

    Returns:
        A namedtuple 'results' with two fields:
        highs  - one entry per 'temp' cell: an int, or None when missing
        colors - one color name per 'description' cell
                 ("orange", "blue" or "gray")

    Raises:
        requests.HTTPError: the server answered with an error status.
        RuntimeError: the page contains no <table> element.
    """
    url = "https://weather.com/weather/tenday/l/" + zip + ":4:US"
    # the timeout keeps the call from hanging if the site never answers
    page = requests.get(url, timeout=10)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')

    table = soup.find("table")
    if table is None:
        # soup.find returns None when the page has no <table> tag
        raise RuntimeError("No <table> element found at " + url)

    hi_temps = [
        _parse_high_temp(cell.text)
        for cell in table.find_all("td", {"class": "temp"})
    ]
    colors = [
        _description_color(cell.text)
        for cell in table.find_all("td", {"class": "description"})
    ]

    results = namedtuple('results', 'highs colors')
    return results(hi_temps, colors)


# try the function out: fetch the forecast for ZIP code 06226
# and print the 'highs' field of the returned result
willimantic = getHighTemps("06226")
print(willimantic.highs)



In [None]:
# get the data to plot
willi = getHighTemps("06226")
mb = getHighTemps("33109")


def _plot_highs(forecast, label):
    """Draw one forecast as a labeled line with color-coded points.

    'forecast' is a result of getHighTemps (fields 'highs' and 'colors').
    Days are numbered from 1 in the order the highs appear. Days whose high
    is None are left out, because missing values must be removed before
    plotting. The day numbers come from the data itself, so the plot works
    for any number of scraped rows.
    """
    kept = [
        (day, high, color)
        for day, (high, color) in enumerate(zip(forecast.highs, forecast.colors), start=1)
        if high is not None
    ]
    if not kept:
        # nothing usable was scraped for this location
        return
    days, highs, colors = zip(*kept)
    plt.scatter(days, highs, c=list(colors))
    plt.plot(days, highs, label=label)


# plot Willimantic data
_plot_highs(willi, "Willimantic")

# plot Miami Beach data
_plot_highs(mb, "Miami Beach")

# Add labels and show plot
plt.xlabel("Day")
plt.ylabel("High Temperature")
plt.legend()
plt.show()

In [None]:
# Note: because the wordcloud package is not available online, running this code will generate an error
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# get Willimantic weather descriptions (the text only) and store as a single string
descriptions = " ".join(
    d.text for d in table.find_all("td", {"class": "description"})
)

# generate the word cloud
# set.add() returns None, so writing stopwords = STOPWORDS.add('RT') would pass
# None to WordCloud and also modify the package's shared STOPWORDS set;
# build a separate set that contains the extra word instead
stopwords = set(STOPWORDS) | {'RT'}
wordcloud = WordCloud(
        background_color = 'black',
        stopwords = stopwords
).generate(descriptions)

plt.imshow(wordcloud)
plt.show()
