In [None]:
import requests

#### Getting the html content of a website

http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html

In [None]:
# Download the sample page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops on an HTTP error instead
# of silently handing an error page to the parsing cells further down.
res = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=30,
)
res.raise_for_status()

In [None]:
# Show the response body decoded to text (the raw HTML of the page).
res.text

In [None]:
# Check which Python type the decoded response body has.
type(res.text)

#### Parsing html

In [None]:
# If bs4 raises an error here, try running:
# !pip install --upgrade html5lib==1.0b8
# NOTE(review): the parser requested below is 'html.parser', not html5lib, so
# this hint may be outdated; confirm it is still needed.

from bs4 import BeautifulSoup
# Parse the downloaded HTML text into a searchable tree using the
# 'html.parser' backend.
soup = BeautifulSoup(res.text, 'html.parser')

In [None]:
# Check the type of the parsed document object.
type(soup)

In [None]:
# Print the parsed tree as formatted, indented markup for easier reading.
print(soup.prettify())

In [None]:
# Count every <p> tag in the document.
len(soup.find_all('p'))

In [None]:
# The second <p> element found (index 1), shown with its markup.
soup.find_all('p')[1]

In [None]:
# Only the text content of the second <p> element (index 1).
soup.find_all('p')[1].text

In [None]:
# Count the <p> tags that carry the CSS class 'outer-text'.
len(soup.find_all('p', class_='outer-text'))

In [None]:
# List the <p> tags that carry the CSS class 'inner-text'.
soup.find_all('p', class_='inner-text')

In [None]:
# Count the <p> tags that carry the CSS class 'inner-text'.
len(soup.find_all('p', class_='inner-text'))

#### Finding the elements of the site

Since every web page is different and html can get very large and messy, the easiest way to find elements that you are interested in is to start from the browser window. So next we will quickly look at how to find elements using the developer tools in your browser. Open the following webpage in your browser (preferably Chrome): http://forecast.weather.gov/MapClick.php?lat=21.3049&lon=-157.8579#.Wkwh8VQ-fVo 

Find the developer tools in your browser. (In Chrome, it's view --> developer --> developer tools or Control+Shift+C on Windows and Command+Shift+C on Mac) You should end up with a panel at the bottom or the right side of the browser like what you see below. Make sure the Elements panel is highlighted:

In [None]:
# Fetch the forecast page for latitude 21.3049, longitude -157.8579 and parse
# it. The timeout prevents the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
res = requests.get(
    "http://forecast.weather.gov/MapClick.php?lat=21.3049&lon=-157.8579",
    timeout=30,
)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

In [None]:
# Display the whole parsed forecast page (the output is long).
soup

In [None]:
# Find every <p> tag with the class "myforecast-current-lrg".
soup.find_all('p', class_="myforecast-current-lrg")

In [None]:
# Take the first match from the result list.
soup.find_all('p', class_="myforecast-current-lrg")[0]

In [None]:
# Inspect the type of a single search result.
type(soup.find_all('p', class_="myforecast-current-lrg")[0])

In [None]:
# Text content of the first "myforecast-current-lrg" paragraph.
soup.find_all('p', class_="myforecast-current-lrg")[0].text

In [None]:
# Text content of the first "myforecast-current-sm" paragraph; this is the
# value the scraping loop further down stores as temp_C.
soup.find_all('p', class_="myforecast-current-sm")[0].text

#### Using dictionary for making queries and collecting response

In [None]:
# Map each place name to its [latitude, longitude] pair; the scraping loop
# reads index 0 as the `lat` and index 1 as the `lon` query parameter.
latlon_dict = {
    place: [lat, lon]
    for place, lat, lon in (
        ('Honolulu', 21.3049, -157.8579),
        ('Times Square', 40.757339, -73.985992),
        ('Yosemite', 37.8651011, -119.5383294),
    )
}
latlon_dict

In [None]:
import time

In [None]:
# Look up the current temperature for every place in latlon_dict and collect
# the results in response_dict (place name -> temperature text).
# NOTE: this loop rebinds the notebook-level names `soup` and `resp`.
response_dict = {}
for place, coordinates in latlon_dict.items():
    url = "http://forecast.weather.gov/MapClick.php?lat={}&lon={}".format(
        coordinates[0], coordinates[1])
    print(place)
    print(url)
    # A timeout keeps one stalled request from blocking the whole loop.
    resp = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    resp.raise_for_status()
    # Pause between requests so the server is not hit in rapid succession.
    time.sleep(3)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # find() returns None when nothing matches, unlike find_all(...)[0], which
    # raises a bare IndexError that does not say which place failed.
    temp_tag = soup.find('p', class_="myforecast-current-sm")
    if temp_tag is None:
        print("No current temperature found for {}; skipping.".format(place))
        continue
    temp_C = temp_tag.text
    response_dict[place] = temp_C

In [None]:
# Show the collected place -> temperature mapping.
response_dict

In [None]:
# Report the scraped temperature of each place as one sentence per line.
sentence_template = "The current temperature in {} is {}."
for place, temperature in response_dict.items():
    sentence = sentence_template.format(place, temperature)
    print(sentence)

#### Saving dictionaries

In [None]:
import numpy as np

In [None]:
# Persist the dictionary to disk. A dict is not a numeric array, so NumPy
# stores it as a pickled Python object inside the .npy file.
np.save('mydict.npy', response_dict) 

In [None]:
# Load the dictionary back. allow_pickle=True is required because the file
# holds a pickled object; .item() unwraps the loaded array into the dict.
# Unpickling can execute arbitrary code, so only load .npy files you trust.
read_dictionary = np.load('mydict.npy', allow_pickle=True).item()

In [None]:
# Show what was read back from mydict.npy.
read_dictionary

In [None]:
# Check the type of the object recovered from the file.
type(read_dictionary)

### 1 - exercise

We need the zip codes of the 5 landmarks in our data. Fortunately, Google shows the zip codes at a fixed place if you use the right search phrase. <br>
Open this link and, using the Inspect tool in the browser, try to find the class of the HTML element of a zip code shown at the top of the page! <br> 
https://www.google.com/search?q=San+Jose+zip+code

In [None]:
# div class="bVj5Zb FozYP"

### 1 - check yourself

The zip code is under a div of class "bVj5Zb FozYP"

### 2 - exercise
Now use the requests library to get the HTML content of this page and create a BeautifulSoup object called soup from this content. To avoid getting the "Before you continue" page, pass the cookies argument like this: <br><br>
cookies = {"CONSENT": "YES+cb.20210720-07-p0.en+FX+410"}<br>
googleresponse = requests.get("https://www.google.com/search?q=San+Jose+zip+code", cookies=cookies)

In [1]:
import requests
from bs4 import BeautifulSoup

# Send Google's consent cookie as a raw Cookie header so the search results
# are returned rather than the "Before you continue" page.
headers = {'Cookie':'CONSENT=YES+cb.20210418-17-p0.it+FX+917;'}

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error (for example, a blocked request)
# instead of parsing an error page.
res = requests.get(
    'https://www.google.com/search?q=San+Jose+zip+code',
    headers=headers,
    timeout=30,
)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')
soup

<!DOCTYPE html>
<html lang="hu"><head><meta charset="utf-8"/><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/><title>San Jose zip code - Google-keresés</title><script nonce="S3A2vtcky7p/64vBJfnMaA==">(function(){
document.documentElement.addEventListener("submit",function(b){var a;if(a=b.target){var c=a.getAttribute("data-submitfalse");a="1"===c||"q"===c&&!a.elements.q.value?!0:!1}else a=!1;a&&(b.preventDefault(),b.stopPropagation())},!0);document.documentElement.addEventListener("click",function(b){var a;a:{for(a=b.target;a&&a!==document.documentElement;a=a.parentElement)if("A"===a.tagName){a="1"===a.getAttribute("data-nohref");break a}a=!1}a&&b.preventDefault()},!0);}).call(this);(function(){
var a=window.performance;window.start=Date.now();a:{var b=window;if(a){var c=a.timing;if(c){var d=c.navigationStart,f=c.responseStart;if(f>d&&f<=window.start){window.start=f;b.wsrt=f-d;break a}}a.now&&(b.wsrt=Math.floor(a.now()))}}window.google=windo

### 2 - check yourself

In [2]:
# Sanity check: soup must be a BeautifulSoup object whose text contains the
# zip code '94089'. isinstance() is the idiomatic type test and also accepts
# subclasses of BeautifulSoup.
if isinstance(soup, BeautifulSoup) and '94089' in soup.text:
    print('Your soup object is correct')
else:
    print('Your soup object is NOT correct')

Your soup object is correct


### 3 - exercise
Try to find all the div elements of class bVj5Zb FozYP in your soup object. How many are there?

In [3]:
# Search for the <div> class seen in the browser's Inspect tool. A class_
# string that contains a space is compared against the element's exact class
# attribute value, so both class names must appear in this order.
elements = soup.find_all('div', class_='bVj5Zb FozYP')
# No element matches in the scraped HTML.
len(elements) # 0 elements

0

### 3 - check yourself
If you haven't found any div of this class you were right.

### 4 - exercise
So it looks like the scraped HTML code doesn't have the elements you saw in the browser. The reason is that when the URL is opened in the browser, JavaScript is used to format the page, but when we scraped it, only the plain HTML was sent. <br><br>
To see the same content in the browser, disable JavaScript by following these directions: https://productforums.google.com/forum/#!msg/chrome/BYOQskiuGU0/dO592rlLbJ0J <br><br>
Then open the page again and use the Inspect tool to find the HTML element containing the zip code!

In [None]:
# ('div',  class_='BNeawe deIvCb AP7Wnd')

### 4 - check yourself

Each zip code is under a div of class "BNeawe deIvCb AP7Wnd"

### 5 - exercise
Try to find all the div elements of class "BNeawe deIvCb AP7Wnd" in your soup object. How many are there?

In [6]:
# Fetch the San Jose search results again with the consent cookie set.
headers = {'Cookie':'CONSENT=YES+cb.20210418-17-p0.it+FX+917;'}

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of counting <div>s on an
# error page.
res = requests.get(
    'https://www.google.com/search?q=San+Jose+zip+code',
    headers=headers,
    timeout=30,
)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

# Count the <div> elements whose class attribute is "BNeawe deIvCb AP7Wnd".
len(soup.find_all('div',  class_='BNeawe deIvCb AP7Wnd'))


68

### 5 - check yourself
You should find 68 elements

### 6 - exercise
Make a list called zipcode_list that contains the text from all the "BNeawe deIvCb AP7Wnd" div elements. Keep only those that consist of numbers

In [7]:
# Collect the text of every "BNeawe deIvCb AP7Wnd" <div>, keeping only the
# entries made up entirely of digits.
zipcode_list = [
    div.text
    for div in soup.find_all('div',  class_='BNeawe deIvCb AP7Wnd')
    if div.text.isdigit()
]


### 6 - check yourself

In [8]:
# Sanity check: 66 zip codes are expected and the smallest is '94088'.
# The length is tested first so that an empty list prints the "NOT correct"
# message instead of raising on the lookup of the smallest element.
if len(zipcode_list) == 66 and min(zipcode_list) == '94088':
    print('Your list is correct')
else:
    print('Your list is NOT correct')

Your list is correct


### 7 - exercise
Read in the weather csv into a pandas dataframe called station. <br>
Create a dictionary called zipcode_dict whose keys are the unique values from the landmark column and where the value of each key is an empty list. You can print the unique values and create the dictionary by hand or, as an advanced task, try to create the dictionary without typing any landmark name!

In [9]:
import pandas as pd

# Load the weather data and take the distinct landmark names from it.
station = pd.read_csv('weather.csv')
landmarks = list(station['landmark'].unique())
# dict.fromkeys(landmarks, []) would bind every key to the *same* list object,
# so appending a zip code for one landmark would show up under all of them.
# The comprehension creates an independent empty list per landmark.
zipcode_dict = {landmark: [] for landmark in landmarks}

# Check
zipcode_dict.keys()


dict_keys(['San Francisco', 'Palo Alto', 'Mountain View', 'San Jose', 'Redwood City'])

### 7 - check yourself

In [10]:
# Sanity check: exactly these five landmarks, each mapped to an empty list.
expected_landmarks = [
    'Mountain View',
    'Palo Alto',
    'Redwood City',
    'San Francisco',
    'San Jose',
]
expected_items = [(landmark, []) for landmark in expected_landmarks]
if sorted(zipcode_dict.items()) == expected_items:
    print('Your dictionary is correct')
else:
    print('Your dictionary is NOT correct')

Your dictionary is correct


### 8 - exercise
Loop over the keys of zipcode_dict and, for each key, use string formatting to print the URL you would use to search Google for the zip codes of that city. <br>
For example if the city is Palo Alto the url should be: <br>
https://www.google.com/search?q=Palo Alto zip code

In [11]:
# Print the Google search URL for each landmark's zip codes.
search_url_template = 'https://www.google.com/search?q={} zip code'
for city in zipcode_dict:
    url = search_url_template.format(city)
    print(url)


https://www.google.com/search?q=San Francisco zip code
https://www.google.com/search?q=Palo Alto zip code
https://www.google.com/search?q=Mountain View zip code
https://www.google.com/search?q=San Jose zip code
https://www.google.com/search?q=Redwood City zip code


### 9 - exercise
As before, loop over the keys of zipcode_dict and for each key inside the loop:
- Get a response object to the url you would use to search the zip codes of a given city in google. <br>
- Make a soup from that response object. <br>
- Make a list of all zip codes in the soup object. You can find the zip codes as in Exercise 6<br>
- Assign this list as value to the key in the zipcode_dict

In [12]:
# For every landmark, search Google for its zip codes and store the numeric
# results under that landmark's key. `headers` (the consent cookie) comes from
# an earlier cell.
for city in list(zipcode_dict):
    url = 'https://www.google.com/search?q={} zip code'.format(city)
    # A timeout keeps one stalled request from blocking the whole loop.
    res = requests.get(url, headers=headers, timeout=30)
    # Stop on an HTTP error (for example, a rate-limited request); otherwise
    # an error page would silently produce an empty zip code list.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # A new list is built for every city, keeping only all-digit entries.
    zipcode_list = [
        div.text
        for div in soup.find_all('div', class_='BNeawe deIvCb AP7Wnd')
        if div.text.isdigit()
    ]
    zipcode_dict[city] = zipcode_list

In [14]:
# Check: display the zip codes collected for each landmark.
zipcode_dict

{'San Francisco': ['94016',
  '94102',
  '94103',
  '94104',
  '94105',
  '94107',
  '94108',
  '94109',
  '94110',
  '94111',
  '94112',
  '94114',
  '94115',
  '94116',
  '94117',
  '94118',
  '94119',
  '94120',
  '94121',
  '94122',
  '94123',
  '94124',
  '94125',
  '94126',
  '94127',
  '94129',
  '94130',
  '94131',
  '94132',
  '94133',
  '94134',
  '94137',
  '94139',
  '94140',
  '94141',
  '94142',
  '94143',
  '94144',
  '94145',
  '94146',
  '94147',
  '94151',
  '94153',
  '94154',
  '94156',
  '94158',
  '94159',
  '94160',
  '94161',
  '94162',
  '94163',
  '94164',
  '94171',
  '94172',
  '94177',
  '94188'],
 'Palo Alto': ['94020',
  '94022',
  '94024',
  '94028',
  '94301',
  '94302',
  '94303',
  '94304',
  '94306',
  '95033'],
 'Mountain View': ['94039',
  '94040',
  '94041',
  '94042',
  '94043',
  '94085',
  '94303'],
 'San Jose': ['94088',
  '94089',
  '94560',
  '95002',
  '95008',
  '95013',
  '95035',
  '95037',
  '95050',
  '95054',
  '95101',
  '95103',
  '

### 9 - check yourself

In [15]:
# Sanity check: compare the sorted number of zip codes per landmark with the
# expected counts.
expected_counts = [6, 7, 10, 56, 66]
actual_counts = sorted(map(len, zipcode_dict.values()))
if actual_counts == expected_counts:
    print('Your dictionary is correct')
else:
    print('Your dictionary is NOT correct')

Your dictionary is correct
