# intro to webscraping one page - basic hmtl and css 

## Get tools 

In [1]:
import requests 
from bs4 import BeautifulSoup

### get data - normally we would be running a request but this time you are copy pasting 

In [2]:
html_doc= """
<!DOCTYPE html>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</html>
"""

In [71]:
soup=BeautifulSoup(html_doc,'html.parser')

In [72]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



## html to select elements of the code 

In [5]:
# select by class tag 
soup.title

<title>The Dormouse's story</title>

In [6]:
#select by class tag 
soup.body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

In [75]:
#find_all 
a_tags=soup.find_all("a")

In [77]:
a_tags

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [76]:
#html tag with a sub select clause - key value pair
a_tags_1=soup.find_all("a",{"id":"link1"})

In [78]:
a_tags_1

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [15]:
type(a_tags)

bs4.element.ResultSet

In [14]:
#get_text() 
for a in a_tags:
    print(a.get_text())

Elsie
Lacie
Tillie


In [16]:
#get('href')
for a in a_tags:
    print(a.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [17]:
soup.title

<title>The Dormouse's story</title>

In [18]:
# parent hierarchy 
soup.title.parent.string

"The Dormouse's story"

In [21]:
soup.title.parent.name

'head'

In [24]:
soup.title

<title>The Dormouse's story</title>

In [26]:
# text.count
soup.text.count('were')

2

In [28]:
# simple website query with regex - find 1 or more words matching this string 
import re 
re.findall(r'\w+', requests.get('https://www.ironhack.com/en').text).count('bootcamp')

63

In [198]:
re.findall(r'\w+', requests.get('https://www.ironhack.com/en').text).count('barcelona')

36

##  css method 

In [32]:
soup


<!DOCTYPE html>

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [31]:
soup.select('#link1') # # for id

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [33]:
soup.select('.sister') # . for class name

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [34]:
soup.select('a') # for class type 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [35]:
# iterate through select of a and get text 
for a in soup.select('a'):
    print(a.get_text())

Elsie
Lacie
Tillie


In [41]:
#use index to find position in results 
print(soup.select('a')[2].get_text())

Tillie


In [50]:
soup.select('p.story')

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

## activity 

- 1) all the 'fun facts'
- 2) names of all the places 
- 3) the content (name plus fact) of ONLY cities
- 4) the names of only cities (not facts) 

In [52]:
geography = """
<!DOCTYPE html>
<html>
<head> Geography</head>
<body>

<div class="city">
  <h2>London</h2>
  <p>London is the most popular tourist destination in the world.</p>
</div>

<div class="city">
  <h2>Paris</h2>
  <p>Paris was originally a Roman City called Lutetia.</p>
</div>

<div class="country">
  <h2>Spain</h2>
  <p>Spain produces 43,8% of all the world's Olive Oil.</p>
</div>

</body>
</html>
"""

In [57]:
soup=BeautifulSoup(geography,'html.parser')

### 1) all the fun facts 

### 2) names of all places 

### 3) name and fact for cities only 

In [61]:
#with html 
soup.find_all("div",{"class":"city"})

[<div class="city">
 <h2>London</h2>
 <p>London is the most popular tourist destination in the world.</p>
 </div>,
 <div class="city">
 <h2>Paris</h2>
 <p>Paris was originally a Roman City called Lutetia.</p>
 </div>]

In [68]:
for i in soup.find_all("div",{"class":"city"}):
    print(i.get_text())


London
London is the most popular tourist destination in the world.


Paris
Paris was originally a Roman City called Lutetia.



In [66]:
#with css 
print(soup.select('div.city'))

[<div class="city">
<h2>London</h2>
<p>London is the most popular tourist destination in the world.</p>
</div>, <div class="city">
<h2>Paris</h2>
<p>Paris was originally a Roman City called Lutetia.</p>
</div>]


In [67]:
for i in soup.select('div.city'):
    print(i.get_text())


London
London is the most popular tourist destination in the world.


Paris
Paris was originally a Roman City called Lutetia.



### 4) name for cities only 

In [70]:
# with html we can use h2 as our tag for name 
for i in soup.find_all("div",{"class":"city"}):
    print(i.h2.get_text())

London
Paris


## Imdb 250 scraping 

In [79]:
url="https://www.imdb.com/chart/top/"

In [91]:
results=requests.get(url, headers = {"Accept-Language": "en-US"})
# locale of IP - set to english lang 

In [92]:
results.status_code

200

In [93]:
results.text

'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n\n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    \n    \n    \n\n    \n    \n    \n\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>Top 250 Movies - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n<script>\n    if (typeof uet == \'function\') {\n      uet("be", "LoadTitle", {wb: 1});\n    }\n</script>\n<script>\n    if (typeof uex == \'function\') {\n      uex("ld", "LoadTitle", {wb: 1});\n    }\n<

In [94]:
soup=BeautifulSoup(results.content,'html.parser')

In [111]:
print(soup.prettify())

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Top 250 Movies - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/chart/top/" rel="canonical"/>
  <meta content="http://w

In [96]:
# get the title / relevant content 
soup.select("td.titleColumn")

[<td class="titleColumn">
       1.
       <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>
 <span class="secondaryInfo">(1994)</span>
 </td>,
 <td class="titleColumn">
       2.
       <a href="/title/tt0068646/" title="Francis Ford Coppola (dir.), Marlon Brando, Al Pacino">The Godfather</a>
 <span class="secondaryInfo">(1972)</span>
 </td>,
 <td class="titleColumn">
       3.
       <a href="/title/tt0071562/" title="Francis Ford Coppola (dir.), Al Pacino, Robert De Niro">The Godfather: Part II</a>
 <span class="secondaryInfo">(1974)</span>
 </td>,
 <td class="titleColumn">
       4.
       <a href="/title/tt0468569/" title="Christopher Nolan (dir.), Christian Bale, Heath Ledger">The Dark Knight</a>
 <span class="secondaryInfo">(2008)</span>
 </td>,
 <td class="titleColumn">
       5.
       <a href="/title/tt0050083/" title="Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb">12 Angry Men</a>
 <span class="secondaryInfo">

In [109]:
#get the title 
soup.select("td.titleColumn a")[0].text

'The Shawshank Redemption'

In [None]:



#main > div > span > div > div > div.lister > table > tbody > tr:nth-child(225) > td.titleColumn > a

#main > div > span > div > div > div.lister > table > tbody > tr:nth-child(91) > td.titleColumn > a

In [110]:
# get the dir stars 
soup.select("td.titleColumn a")[0]['title']

'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'

In [127]:
# get the rank 
#main > div > span > div > div > div.lister > table > tbody > tr:nth-child(225) > td.titleColumn

soup.select("td.titleColumn \n ")[0].text


'\n      1.\n      The Shawshank Redemption\n(1994)\n'

In [121]:
# get the year

soup.select("td.titleColumn span.secondaryInfo")[0].text

'(1994)'

### For loop to collect movie title, dirs and actors, year 

In [140]:
title = []
year = [] 
dir_actor= []
len_movies=len(soup.select("td.titleColumn"))

In [141]:
len_movies

250

In [142]:
for i in range(len_movies):
    title.append(soup.select("td.titleColumn a")[i].text)
    year.append(soup.select("td.titleColumn span.secondaryInfo")[i].text)
    dir_actor.append(soup.select("td.titleColumn a")[i]['title'])

In [145]:
len(title)

250

In [143]:
len(year)

250

In [185]:
len(dir_actor)

250

In [186]:
dir_actor

['Frank Darabont (dir.), Tim Robbins, Morgan Freeman',
 'Francis Ford Coppola (dir.), Marlon Brando, Al Pacino',
 'Francis Ford Coppola (dir.), Al Pacino, Robert De Niro',
 'Christopher Nolan (dir.), Christian Bale, Heath Ledger',
 'Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb',
 'Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes',
 'Peter Jackson (dir.), Elijah Wood, Viggo Mortensen',
 'Quentin Tarantino (dir.), John Travolta, Uma Thurman',
 'Sergio Leone (dir.), Clint Eastwood, Eli Wallach',
 'Peter Jackson (dir.), Elijah Wood, Ian McKellen',
 'David Fincher (dir.), Brad Pitt, Edward Norton',
 'Robert Zemeckis (dir.), Tom Hanks, Robin Wright',
 'Christopher Nolan (dir.), Leonardo DiCaprio, Joseph Gordon-Levitt',
 'Peter Jackson (dir.), Elijah Wood, Ian McKellen',
 'Irvin Kershner (dir.), Mark Hamill, Harrison Ford',
 'Lana Wachowski (dir.), Keanu Reeves, Laurence Fishburne',
 'Martin Scorsese (dir.), Robert De Niro, Ray Liotta',
 'Milos Forman (dir.), Jack Nicholson, Louise Fletch

In [None]:
# hint : install tqdm.noteboook - we will see this later 

## Create data frame from movies data 

we will also need to clean the data coming in 

In [138]:
import pandas as pd 

### clean steps - drop () from year, separate dirs from actors 

In [148]:
year_cl=[yr.strip(')').strip('(') for yr in year] #list comprehension instead of for loop

In [187]:
#for loop to split out directors from actors and clean the data slightly 
director=[]
star1=[]
star2=[]
for movie in dir_actor:
    split_list=movie.split(", ")# note we want to drop any preceding spaces from the names
    director.append(split_list[0].replace(" (dir.)",""))
    star1.append(split_list[1])
    star2.append(split_list[2])

### data frame creation 

In [188]:
movies= pd.DataFrame({"movie_title":title,"director":director,
                      "actor_1":star1,"actor_2":star2})

In [189]:
movies

Unnamed: 0,movie_title,director,actor_1,actor_2
0,The Shawshank Redemption,Frank Darabont,Tim Robbins,Morgan Freeman
1,The Godfather,Francis Ford Coppola,Marlon Brando,Al Pacino
2,The Godfather: Part II,Francis Ford Coppola,Al Pacino,Robert De Niro
3,The Dark Knight,Christopher Nolan,Christian Bale,Heath Ledger
4,12 Angry Men,Sidney Lumet,Henry Fonda,Lee J. Cobb
...,...,...,...,...
245,Miracle in Cell No. 7,Mehmet Ada Öztekin,Aras Bulut Iynemli,Nisa Sofiya Aksongur
246,Hera Pheri,Priyadarshan,Akshay Kumar,Suniel Shetty
247,Neon Genesis Evangelion: The End of Evangelion,Hideaki Anno,Megumi Ogata,Megumi Hayashibara
248,The Battle of Algiers,Gillo Pontecorvo,Brahim Hadjadj,Jean Martin


In [190]:
movies['movie_rank'] = np.arange(len(movies))+1

In [191]:
movies.head()

Unnamed: 0,movie_title,director,actor_1,actor_2,movie_rank
0,The Shawshank Redemption,Frank Darabont,Tim Robbins,Morgan Freeman,1
1,The Godfather,Francis Ford Coppola,Marlon Brando,Al Pacino,2
2,The Godfather: Part II,Francis Ford Coppola,Al Pacino,Robert De Niro,3
3,The Dark Knight,Christopher Nolan,Christian Bale,Heath Ledger,4
4,12 Angry Men,Sidney Lumet,Henry Fonda,Lee J. Cobb,5


# Day 2 scrape multiple pages 

In [1]:
from bs4 import BeautifulSoup 
import requests 
import pandas as pd 
from tqdm.notebook import tqdm 
from time import sleep 
from random import randint 

# example 1 imdb pagination 

In [2]:
# start with website and we will use the second page 
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start=51&ref_=adv_prv"

In [3]:
results=requests.get(url)
results.status_code

200

In [4]:
soup=BeautifulSoup(results.content,'html.parser')

In [5]:
print(soup.prettify())

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Feature Film,
Released between 1990-01-01 and 1995-01-01,
User Rating at least 7
(Sorted by Popularity Ascending) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  <

## Build an iterator , create list of urls 

In [6]:
iterations=range(1,2283,50)

In [8]:
for i in iterations:
    start_at=str(i)
    url='https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start='+start_at+'&ref_=adv_prv'
    print(url)

https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start=1&ref_=adv_prv
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start=51&ref_=adv_prv
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start=101&ref_=adv_prv
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start=151&ref_=adv_prv
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start=201&ref_=adv_prv
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start=251&ref_=adv_prv
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start=301&ref_=adv_prv
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating

## Respectful scraping - use sleep and randint to separate our scraper pings 

In [9]:
#example 
for i in range(5):
    print(i)
    sleep(3)

0
1
2
3
4


In [10]:
# example 2 

for i in range(5):
    print(i)
    wait_time=randint(1,4)
    print('i will sleep now for'+str(wait_time)+'seconds')
    sleep(wait_time)

0
i will sleep now for2seconds
1
i will sleep now for2seconds
2
i will sleep now for2seconds
3
i will sleep now for4seconds
4
i will sleep now for1seconds


## assemble the scraper 

In [12]:
pages=[]
for i in iterations:
    start_at=str(i)
    url='https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1995-01-01&user_rating=7.0,&start='+start_at+'&ref_=adv_prv'
    response=requests.get(url)
    #just for monitoring 
    print("status="+str(response.status_code))
    pages.append(response)
    # take a short sleep 
    wait_time=randint(1,4)
    print("i will sleep now for"+str(wait_time)+' seconds')
    sleep(wait_time)  

status=200
i will sleep now for4 seconds
status=200
i will sleep now for2 seconds
status=200
i will sleep now for4 seconds
status=200
i will sleep now for4 seconds
status=200
i will sleep now for1 seconds
status=200
i will sleep now for4 seconds
status=200
i will sleep now for1 seconds
status=200
i will sleep now for4 seconds
status=200
i will sleep now for2 seconds
status=200
i will sleep now for2 seconds
status=200
i will sleep now for1 seconds
status=200
i will sleep now for4 seconds
status=200
i will sleep now for4 seconds
status=200
i will sleep now for2 seconds
status=200
i will sleep now for1 seconds
status=200
i will sleep now for1 seconds
status=200
i will sleep now for4 seconds
status=200
i will sleep now for2 seconds
status=200
i will sleep now for3 seconds
status=200
i will sleep now for3 seconds
status=200
i will sleep now for3 seconds
status=200
i will sleep now for2 seconds
status=200
i will sleep now for3 seconds
status=200
i will sleep now for2 seconds
status=200
i wil

In [13]:
# create the soup from one page
soup=BeautifulSoup(pages[0].content,'html.parser')

In [14]:
len(pages)

46

## title, synopsis 

In [19]:
# title 
# #main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content > h3 > a
# #main > div > div.lister.list.detail.sub-list > div > div:nth-child(2) > div.lister-item-content > h3 > a

soup.select("h3 > a")

[<a href="/title/tt0111161/">Cadena perpetua</a>,
 <a href="/title/tt0110912/">Pulp Fiction</a>,
 <a href="/title/tt0107290/">Jurassic Park (Parque Jurásico)</a>,
 <a href="/title/tt0108052/">La lista de Schindler</a>,
 <a href="/title/tt0108358/">Tombstone: La leyenda de Wyatt Earp</a>,
 <a href="/title/tt0102926/">El silencio de los corderos</a>,
 <a href="/title/tt0109830/">Forrest Gump</a>,
 <a href="/title/tt0099685/">Uno de los nuestros</a>,
 <a href="/title/tt0110413/">El profesional (Léon)</a>,
 <a href="/title/tt0106677/">Movida del 76</a>,
 <a href="/title/tt0110357/">El rey león</a>,
 <a href="/title/tt0103064/">Terminator 2: El juicio final</a>,
 <a href="/title/tt0106611/">Elegidos para el triunfo</a>,
 <a href="/title/tt0103776/">Batman vuelve</a>,
 <a href="/title/tt0108399/">Amor a quemarropa</a>,
 <a href="/title/tt0105236/">Reservoir Dogs</a>,
 <a href="/title/tt0099785/">Solo en casa</a>,
 <a href="/title/tt0103874/">Drácula de Bram Stoker</a>,
 <a href="/title/tt009

In [21]:
# synopsis 
#main > div > div.lister.list.detail.sub-list > div > div:nth-child(2) > div.lister-item-content > p:nth-child(4)
##main > div > div.lister.list.detail.sub-list > div > div:nth-child(3) > div.lister-item-content > p:nth-child(4)
soup.select("p:nth-child(4)")


[<p class="text-muted">
 Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.</p>,
 <p class="text-muted">
 The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.</p>,
 <p class="text-muted">
 A pragmatic paleontologist touring an almost complete theme park on an island in Central America is tasked with protecting a couple of kids after a power failure causes the park's cloned dinosaurs to run loose.</p>,
 <p class="text-muted">
 In German-occupied Poland during World War II, industrialist <a href="/name/nm0771861">Oskar Schindler</a> gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis.</p>,
 <p class="text-muted">
 A successful lawman's plans to retire anonymously in Tombstone, Arizona are disrupted by the kind of outlaws he was famous for eliminating.</p>,
 <p class="text-muted">
 A 

## final iterator 
- each page responses from the urls 
- then parse each page
- capture the block of relevant text for each page 
- then for each block of code - get the title, and synopsis 
- clean as needed, make into data frame 

In [23]:
pages_parsed=[]
synopsis=[]
title=[]

for i in tqdm(range(len(pages))):
              pages_parsed.append(BeautifulSoup(pages[i].content,'html.parser'))
              movies_html=pages_parsed[i].select("div.lister-item-content")
              for a in range(len(movies_html)):
                  title.append(movies_html[a].select("h3 > a")[0].get_text())
                  synopsis.append(movies_html[a].select("p:nth-child(4)")[0].get_text().strip())

  0%|          | 0/46 [00:00<?, ?it/s]

In [27]:
print(len(synopsis))

2283


In [28]:
movies=pd.DataFrame({'title':title,'synopsis':synopsis})

In [73]:
movies.head(10)

Unnamed: 0,title,synopsis
0,Cadena perpetua,Two imprisoned men bond over a number of years...
1,Pulp Fiction,"The lives of two mob hitmen, a boxer, a gangst..."
2,Jurassic Park (Parque Jurásico),A pragmatic paleontologist touring an almost c...
3,La lista de Schindler,"In German-occupied Poland during World War II,..."
4,Tombstone: La leyenda de Wyatt Earp,A successful lawman's plans to retire anonymou...
5,El silencio de los corderos,A young F.B.I. cadet must receive the help of ...
6,Forrest Gump,"The presidencies of Kennedy and Johnson, the V..."
7,Uno de los nuestros,The story of Henry Hill and his life in the mo...
8,El profesional (Léon),12-year-old Mathilda is reluctantly taken in b...
9,Movida del 76,The adventures of high school and junior high ...


# Example - wikipedia - presidents 

In [30]:
url="https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States"

In [31]:
# get the first page -then extract the hrefs for the subpages 
response=requests.get(url)
response.status_code

200

In [32]:
soup=BeautifulSoup(response.content,'html.parser')

In [35]:
presidentslist=soup.select("td:nth-child(3) > b > a")

In [41]:
soup.select("td:nth-child(3) > b > a")

[<a href="/wiki/George_Washington" title="George Washington">George Washington</a>,
 <a href="/wiki/John_Adams" title="John Adams">John Adams</a>,
 <a href="/wiki/Thomas_Jefferson" title="Thomas Jefferson">Thomas Jefferson</a>,
 <a href="/wiki/James_Madison" title="James Madison">James Madison</a>,
 <a href="/wiki/James_Monroe" title="James Monroe">James Monroe</a>,
 <a href="/wiki/John_Quincy_Adams" title="John Quincy Adams">John Quincy Adams</a>,
 <a href="/wiki/Andrew_Jackson" title="Andrew Jackson">Andrew Jackson</a>,
 <a href="/wiki/Martin_Van_Buren" title="Martin Van Buren">Martin Van Buren</a>,
 <a href="/wiki/William_Henry_Harrison" title="William Henry Harrison">William Henry Harrison</a>,
 <a href="/wiki/John_Tyler" title="John Tyler">John Tyler</a>,
 <a href="/wiki/James_K._Polk" title="James K. Polk">James K. Polk</a>,
 <a href="/wiki/Zachary_Taylor" title="Zachary Taylor">Zachary Taylor</a>,
 <a href="/wiki/Millard_Fillmore" title="Millard Fillmore">Millard Fillmore</a>,
 

In [40]:
soup.select("td:nth-child(3) > b > a")[0]['href']

'/wiki/George_Washington'

In [None]:
# copy selector 
#mw-content-text > div.mw-parser-output > table.wikitable.sortable.jquery-tablesorter > tbody > tr:nth-child(1) > td:nth-child(3) > b > a
#mw-content-text > div.mw-parser-output > table.wikitable.sortable.jquery-tablesorter > tbody > tr:nth-child(81) > td:nth-child(3) > b > a

In [36]:
len(presidentslist)

46

In [47]:
presi_soup=[]
for p in presidentslist:
    url='https://en.wikipedia.org/'+p["href"]
    response=requests.get(url)
    print(p.get_text(),response.status_code)
    soup=BeautifulSoup(response.content, 'html.parser')
    presi_soup.append(soup.find("table",{"class":"infobox vcard"})) #html not css-space not . 
    wait_time=randint(1,3)
    print("I am sleeping now for "+str(wait_time))
    sleep(wait_time)

George Washington 200
I am sleeping now for 2
John Adams 200
I am sleeping now for 1
Thomas Jefferson 200
I am sleeping now for 3
James Madison 200
I am sleeping now for 1
James Monroe 200
I am sleeping now for 3
John Quincy Adams 200
I am sleeping now for 3
Andrew Jackson 200
I am sleeping now for 1
Martin Van Buren 200
I am sleeping now for 2
William Henry Harrison 200
I am sleeping now for 3
John Tyler 200
I am sleeping now for 1
James K. Polk 200
I am sleeping now for 1
Zachary Taylor 200
I am sleeping now for 2
Millard Fillmore 200
I am sleeping now for 2
Franklin Pierce 200
I am sleeping now for 2
James Buchanan 200
I am sleeping now for 3
Abraham Lincoln 200
I am sleeping now for 2
Andrew Johnson 200
I am sleeping now for 3
Ulysses S. Grant 200
I am sleeping now for 2
Rutherford B. Hayes 200
I am sleeping now for 1
James A. Garfield 200
I am sleeping now for 1
Chester A. Arthur 200
I am sleeping now for 2
Grover Cleveland 200
I am sleeping now for 1
Benjamin Harrison 200
I am sl

In [None]:
# #mw-content-text > div.mw-parser-output > table.infobox.vcard

In [44]:
len(presi_soup)

46

In [48]:
presi_soup

[<table class="infobox vcard"><tbody><tr><th class="infobox-above" colspan="2" style="font-size: 100%;"><div class="fn" style="font-size:125%;">George Washington</div></th></tr><tr><td class="infobox-image" colspan="2"><a class="image" href="/wiki/File:Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg"><img alt="Head and shoulders portrait of George Washington" data-file-height="5615" data-file-width="4626" decoding="async" height="267" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg/220px-Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg/330px-Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Gilbert_Stuart_Williamstown_Portrait_of_George_Washington.jpg/440px-Gilbert_Stuart_Williamstown_Portrait_of_Geo

### get information from the table / soup of results 

In [56]:
# birthday 
presi_soup[45].find("span",{"class":"bday"}).get_text()

'1942-11-20'

In [62]:
# political party 
presi_soup[45].find("th",string="Political party").parent.find("a").get_text()

'Democratic'

In [66]:
# no of sons/ daughters 
len(presi_soup[45].find("th", string="Children").parent.find_all("li"))

4

In [70]:
#task - complete the data and create dataframe 

president_name=[]
birth_date=[]
party=[]
no_children=[]

for presi in presi_soup:
    birth_date.append(presi.find("span", {"class":"bday"}).get_text())
    party.append(presi.find("th", string="Political party").parent.find("a").get_text())
    president_name.append(presi.find("div",{"class":"fn"}).get_text())
    try:
        no_children.append(len(presi.find("th", string="Children").parent.find_all("li")))
    except:
        no_children.append("NA")

presis_df = pd.DataFrame({"name":president_name,
                          "birthday":birth_date,
                          "party": party,
                          "numberofchildren": no_children})

In [68]:
presis_df

Unnamed: 0,name,birthday,party,numberofchildren
0,George Washington,1732-02-22,Independent,0.0
1,John Adams,1735-10-30,Pro-Administration,0.0
2,Thomas Jefferson,1743-04-13,Democratic-Republican,6.0
3,James Madison,1751-03-16,Democratic-Republican,
4,James Monroe,1758-04-28,Democratic-Republican,0.0
5,John Quincy Adams,1767-07-11,Whig,4.0
6,Andrew Jackson,1767-03-15,Democratic-Republican,3.0
7,Martin Van Buren,1782-12-05,Democratic-Republican,0.0
8,William Henry Harrison,1773-02-09,Democratic-Republican,0.0
9,John Tyler,1790-03-29,Independent,0.0


# APIs -the easier way to getdata 

In [111]:
#hockey api 
hockey=requests.get('https://v1.hockey.api-sports.io/')
print("hockey: ",hockey.status_code)

hockey:  200


In [114]:
hockey.content
# need a key to run queries on this api 

b'{"get":"notFound","parameters":[],"errors":{"time":"2022-03-01T12:10:27+00:00","endpoint":"This endpoint do not exist."},"results":0,"response":[]}'

In [128]:
#github api 

github=requests.get('https://api.github.com/users/siandav/events')
github.status_code

200

In [129]:
github.json()

[{'id': '20500663678',
  'type': 'PushEvent',
  'actor': {'id': 71644535,
   'login': 'siandav',
   'display_login': 'siandav',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/siandav',
   'avatar_url': 'https://avatars.githubusercontent.com/u/71644535?'},
  'repo': {'id': 446439899,
   'name': 'student-IH-labs-and-stuff/BCNDATA0122',
   'url': 'https://api.github.com/repos/student-IH-labs-and-stuff/BCNDATA0122'},
  'payload': {'push_id': 9224814933,
   'size': 1,
   'distinct_size': 1,
   'ref': 'refs/heads/main',
   'head': 'b0ce11ede8444fd3dc826a2f40d4262b47fdf2bd',
   'before': 'cff328fcb605d3b705823e6fdea302f8f32a6d0d',
   'commits': [{'sha': 'b0ce11ede8444fd3dc826a2f40d4262b47fdf2bd',
     'author': {'email': 'sian.davies@ironhack.com', 'name': 'siandav'},
     'message': 'Update .gitignore',
     'distinct': True,
     'url': 'https://api.github.com/repos/student-IH-labs-and-stuff/BCNDATA0122/commits/b0ce11ede8444fd3dc826a2f40d4262b47fdf2bd'}]},
  'public': True,
 

In [130]:
# for specific repo 
github=requests.get('https://api.github.com/repos/siandav/data-prework/events')
github.status_code

200

In [131]:
github.json()

[{'id': '20516525755',
  'type': 'PushEvent',
  'actor': {'id': 71644535,
   'login': 'siandav',
   'display_login': 'siandav',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/siandav',
   'avatar_url': 'https://avatars.githubusercontent.com/u/71644535?'},
  'repo': {'id': 298620226,
   'name': 'siandav/data-prework',
   'url': 'https://api.github.com/repos/siandav/data-prework'},
  'payload': {'push_id': 9232635038,
   'size': 1,
   'distinct_size': 1,
   'ref': 'refs/heads/master',
   'head': '25692906a0cca811d2287a038a38c24bfd6f3424',
   'before': '6a62ea5b3efb2cd23a058381f619020c24c62fe3',
   'commits': [{'sha': '25692906a0cca811d2287a038a38c24bfd6f3424',
     'author': {'email': '71644535+siandav@users.noreply.github.com',
      'name': 'sian d'},
     'message': 'Add files via upload',
     'distinct': True,
     'url': 'https://api.github.com/repos/siandav/data-prework/commits/25692906a0cca811d2287a038a38c24bfd6f3424'}]},
  'public': True,
  'created_at': '2022-03-

In [137]:
# iss -where is it now 
iss_where=requests.get("http://api.open-notify.org/iss-now.json")

In [138]:
iss_where.status_code

200

In [139]:
iss_where.json()

{'iss_position': {'longitude': '-42.6859', 'latitude': '28.1725'},
 'message': 'success',
 'timestamp': 1646137935}

In [153]:
latlon =(28.1725, -42.6859)

In [136]:
import folium

In [154]:
map=folium.Map(location = latlon, zoom_start=5)

In [155]:
map

In [91]:
#iss future passes overhead - my house in england 
coordinates = {"lat":52.063179,"lon":-0.476096,"n":20} #lat long for my house, 20 passes overhead
iss_future_passes=requests.get("http://api.open-notify.org/iss-pass.json", params=coordinates)
iss_future_passes.status_code

200

In [92]:
response.json()

{'message': 'success',
 'request': {'altitude': 100,
  'datetime': 1646134024,
  'latitude': 52.063179,
  'longitude': -0.476096,
  'passes': 20},
 'response': [{'duration': 250, 'risetime': 1646180995},
  {'duration': 593, 'risetime': 1646186563},
  {'duration': 652, 'risetime': 1646192313},
  {'duration': 657, 'risetime': 1646198114},
  {'duration': 644, 'risetime': 1646203919},
  {'duration': 545, 'risetime': 1646209738},
  {'duration': 552, 'risetime': 1646270118},
  {'duration': 647, 'risetime': 1646275841},
  {'duration': 657, 'risetime': 1646281635},
  {'duration': 651, 'risetime': 1646287441},
  {'duration': 587, 'risetime': 1646293251},
  {'duration': 222, 'risetime': 1646299172},
  {'duration': 492, 'risetime': 1646353684},
  {'duration': 636, 'risetime': 1646359373},
  {'duration': 657, 'risetime': 1646365156},
  {'duration': 655, 'risetime': 1646370961},
  {'duration': 616, 'risetime': 1646376767},
  {'duration': 394, 'risetime': 1646382627},
  {'duration': 400, 'risetime':

In [94]:
from datetime import datetime 
datetime.fromtimestamp(1646180995)

datetime.datetime(2022, 3, 2, 1, 29, 55)

### Random urls with information and lists 

In [105]:
response=requests.get("https://api.agify.io?name=Kofi")

In [106]:
response.content

b'{"name":"Kofi","age":46,"count":1948}'

In [107]:
response=requests.get("https://api.nationalize.io?name=sian")

In [108]:
response.content

b'{"name":"sian","country":[{"country_id":"GB","probability":0.7679432638846571},{"country_id":"MY","probability":0.1704197578464618},{"country_id":"","probability":0.046867950112622875}]}'

In [109]:
response=requests.get('http://universities.hipolabs.com/search?country=Spain')

In [110]:
response.content

b'[{"web_pages": ["http://www.barcelonagse.eu/"], "country": "Spain", "state-province": null, "name": "Barcelona Graduate School of Economics", "domains": ["barcelonagse.eu"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.bircham.edu/"], "country": "Spain", "state-province": null, "name": "Bircham International University", "domains": ["bircham.edu"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.ceu.es/"], "country": "Spain", "state-province": null, "name": "Universidad de San Pablo CEU", "domains": ["ceu.es"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.deusto.es/"], "country": "Spain", "state-province": null, "name": "Universidad de Deusto", "domains": ["deusto.es"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.ehu.es/"], "country": "Spain", "state-province": null, "name": "Universidad del Pa\\u00eds Vasco", "domains": ["ehu.es", "ehu.eus"], "alpha_two_code": "ES"}, {"web_pages": ["http://www.esic.es/"], "country": "Spain", "state-province": null, "name": 

In [None]:
# openweathermap- optional activity 
# https://knasmueller.net/using-the-open-weather-map-api-with-python 