# Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import html5lib
import pandas as pd
import numpy as np

In [2]:
# Download the IMDb Top 250 chart page. Note that `url` holds the HTTP
# response object (not the address); the parsing cell reads `url.content`.
# - timeout: without one, requests waits indefinitely on a stalled connection
#   and the notebook appears to hang.
# - raise_for_status(): surfaces HTTP errors (403, 404, 5xx, ...) right here
#   instead of as a confusing parsing failure in a later cell.
url=requests.get('https://www.imdb.com/chart/top/', timeout=30)
url.raise_for_status()

In [3]:
# Parse the downloaded page bytes with the html5lib parser, then print the
# page <title> as a quick check that the expected page was fetched.
soup = BeautifulSoup(markup=url.content, features="html5lib")
print(soup.title.get_text())

IMDb Top 250 - IMDb


In [4]:
# Locate the chart's table body and collect one <tr> element per listed film.
# find() returns None when nothing matches (for example if the page layout
# changed or a different page was served), so fail here with a clear message
# instead of an opaque "'NoneType' object has no attribute 'find_all'".
_chart_body = soup.find('tbody', {'class': 'lister-list'})
if _chart_body is None:
    raise RuntimeError("Could not find <tbody class='lister-list'> in the downloaded page")
List = _chart_body.find_all('tr')

# Column accumulators that the scraping loop appends to.
movie, year, rating, dir_actors = [], [], [], []

In [5]:
# Start from empty accumulators so that re-running this cell does not append
# every film a second time to lists left over from a previous execution.
movie, year, rating, dir_actors = [], [], [], []

for film in List:
    # The title cell carries the film name (<a> text), the year (<span> text)
    # and the credits (<a title="...">), so look it up once per row.
    title_cell = film.find('td', {'class': 'titleColumn'})
    rating_cell = film.find('td', {'class': 'ratingColumn imdbRating'})

    movie.append(title_cell.a.text)
    year.append(title_cell.span.text)
    rating.append(rating_cell.strong.text)
    dir_actors.append(title_cell.a.get('title'))

In [6]:
# Assemble the scraped lists into a DataFrame, one column per list.
keys = ['movie', 'year', 'rating', 'dir_actors']
values = [movie, year, rating, dir_actors]
dictionary = {column: data for column, data in zip(keys, values)}

df = pd.DataFrame(dictionary)
df

Unnamed: 0,movie,year,rating,dir_actors
0,The Shawshank Redemption,(1994),9.2,"Frank Darabont (dir.), Tim Robbins, Morgan Fre..."
1,The Godfather,(1972),9.1,"Francis Ford Coppola (dir.), Marlon Brando, Al..."
2,The Godfather: Part II,(1974),9.0,"Francis Ford Coppola (dir.), Al Pacino, Robert..."
3,The Dark Knight,(2008),9.0,"Christopher Nolan (dir.), Christian Bale, Heat..."
4,12 Angry Men,(1957),8.9,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb"
...,...,...,...,...
245,The Battle of Algiers,(1966),8.0,"Gillo Pontecorvo (dir.), Brahim Hadjadj, Jean ..."
246,A Silent Voice: The Movie,(2016),8.0,"Naoko Yamada (dir.), Miyu Irino, Saori Hayami"
247,Tangerines,(2013),8.0,"Zaza Urushadze (dir.), Lembit Ulfsak, Elmo Nüg..."
248,The Princess Bride,(1987),8.0,"Rob Reiner (dir.), Cary Elwes, Mandy Patinkin"


## Data Cleaning and Feature Engineering

In [7]:
# Pull the four-digit year out of strings such as "(1994)" and convert the
# year and rating columns to numeric dtypes.
# - The pattern is a raw string: "\d" inside a normal string literal is an
#   invalid escape sequence and triggers a warning on current Python versions.
# - astype(str) keeps the cell re-runnable: on a second run `year` is already
#   an integer column, which has no .str accessor.
df['year'] = df['year'].astype(str).str.extract(r'(\d{4})', expand=False).astype(int)
df['rating'] = df['rating'].astype('float64')

In [8]:
# Display the frame to check the result of the year/rating dtype conversions.
df

Unnamed: 0,movie,year,rating,dir_actors
0,The Shawshank Redemption,1994,9.2,"Frank Darabont (dir.), Tim Robbins, Morgan Fre..."
1,The Godfather,1972,9.1,"Francis Ford Coppola (dir.), Marlon Brando, Al..."
2,The Godfather: Part II,1974,9.0,"Francis Ford Coppola (dir.), Al Pacino, Robert..."
3,The Dark Knight,2008,9.0,"Christopher Nolan (dir.), Christian Bale, Heat..."
4,12 Angry Men,1957,8.9,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb"
...,...,...,...,...
245,The Battle of Algiers,1966,8.0,"Gillo Pontecorvo (dir.), Brahim Hadjadj, Jean ..."
246,A Silent Voice: The Movie,2016,8.0,"Naoko Yamada (dir.), Miyu Irino, Saori Hayami"
247,Tangerines,2013,8.0,"Zaza Urushadze (dir.), Lembit Ulfsak, Elmo Nüg..."
248,The Princess Bride,1987,8.0,"Rob Reiner (dir.), Cary Elwes, Mandy Patinkin"


In [9]:
# `dir_actors` looks like "Frank Darabont (dir.), Tim Robbins, Morgan Freeman":
# the first comma-separated entry is the director, the remaining ones are actors.
#
# The "(dir.)" marker is removed with an anchored regex rather than
# str.rstrip('(dir.) '): rstrip treats its argument as a *set of characters*,
# so it also removed trailing d/i/r letters belonging to the name itself
# ("Clint Eastwood (dir.)" -> "Clint Eastwoo", "Brad Bird (dir.)" -> "Brad B").
df['directors'] = (
    df['dir_actors']
    .str.split(',').str[0]
    .str.replace(r'\s*\(dir\.\)\s*$', '', regex=True)
    .str.strip()
)
df['actors'] = df['dir_actors'].str.split(',').apply(lambda names: [name.strip() for name in names[1:]])

In [10]:
# Remove the raw credits column now that `directors` and `actors` exist.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the column has already been dropped.
df = df.drop(columns='dir_actors', errors='ignore')

In [11]:
# Preview the first rows to confirm the `directors` and `actors` columns.
df.head()

Unnamed: 0,movie,year,rating,directors,actors
0,The Shawshank Redemption,1994,9.2,Frank Darabont,"[Tim Robbins, Morgan Freeman]"
1,The Godfather,1972,9.1,Francis Ford Coppola,"[Marlon Brando, Al Pacino]"
2,The Godfather: Part II,1974,9.0,Francis Ford Coppola,"[Al Pacino, Robert De Niro]"
3,The Dark Knight,2008,9.0,Christopher Nolan,"[Christian Bale, Heath Ledger]"
4,12 Angry Men,1957,8.9,Sidney Lumet,"[Henry Fonda, Lee J. Cobb]"


In [12]:
# Summarize the column dtypes and non-null counts of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movie      250 non-null    object 
 1   year       250 non-null    int64  
 2   rating     250 non-null    float64
 3   directors  250 non-null    object 
 4   actors     250 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 9.9+ KB


In [13]:
# Write the table to a semicolon-separated file without the index column;
# floats are formatted with one decimal place.
# NOTE(review): `actors` holds Python lists, which to_csv presumably writes as
# their string repr — confirm that downstream readers expect that.
df.to_csv(
    'imdb.csv',
    sep=';',
    index=False,
    float_format='%.1f',
)