# Web scraping on the IMDB website using Python 

In [2]:
from bs4 import BeautifulSoup
import requests
import openpyxl
import pandas as pd

In [3]:
# Scrape the IMDB "Top 250" chart: download the chart page, read one table row per
# movie and store id, rank, name, year and rating in an Excel workbook.

excel = openpyxl.Workbook()
sheet = excel.active
sheet.title = 'top rated movies 2'

# Header row; every scraped row appended below uses the same column order.
sheet.append(['Id','Movie Rank', 'Movie Name', 'Year of Movie', 'IMDB Rating'])

try:
    # The timeout (in seconds) makes a stalled connection fail instead of
    # blocking the cell forever.
    source = requests.get('https://www.imdb.com/chart/top/?ref_=nv_mv_250', timeout=30)
    # requests.get() returns normally even for 4xx/5xx answers; raise_for_status()
    # turns those into an exception so an error page is never parsed as the chart.
    source.raise_for_status()
    soup = BeautifulSoup(source.text,'html.parser')

    # Fail with a readable message if the chart table is missing from the page,
    # rather than with an AttributeError raised on None.
    chart = soup.find('tbody', class_ = 'lister-list')
    if chart is None:
        raise ValueError("chart table ('tbody' with class 'lister-list') not found in the downloaded page")
    movies = chart.find_all('tr')

    for movie in movies:
        # Name, rank and year are all read from the same title cell, so look it up once.
        title_cell = movie.find('td', class_ = 'titleColumn')

        Id = movie.find('div', {'data-titleid' : True}).get('data-titleid')
        name = title_cell.a.text
        # Everything before the first '.' in the cell text is taken as the rank.
        rank = title_cell.get_text(strip = True).split('.')[0]
        # Drop the parentheses surrounding the year text.
        year = title_cell.span.text.strip('()')
        rating = movie.find('td', class_ = 'ratingColumn imdbRating').strong.text

        sheet.append([Id,rank, name, year, rating])

    # Only reached when every row was parsed; on any failure nothing is written.
    excel.save('./IMDB_Movie_test.xlsx')

except Exception as e:
    # Best effort: report the problem and let the notebook keep running.
    print(e)
    
        

## The country of each ranked movie is not shown on the chart page; it is listed on the movie's own page. We therefore append each movie's Id to the title URL (`https://www.imdb.com/title/<Id>`) so that we can open that page and read the movie's country.

In [4]:
# Load the workbook written by the scraping step into a DataFrame.
data = pd.read_excel(io='./IMDB_Movie_test.xlsx')

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Id,Movie Rank,Movie Name,Year of Movie,IMDB Rating
0,tt0111161,1,Les Évadés,1994,9.2
1,tt0068646,2,Le Parrain,1972,9.1
2,tt0071562,3,"Le Parrain, 2ᵉ partie",1974,9.0
3,tt0468569,4,The Dark Knight : Le Chevalier noir,2008,9.0
4,tt0050083,5,12 Hommes en colère,1957,8.9


In [8]:

# Visit every movie's own IMDB page and copy its country of origin into the
# 'country' column of `data`. A row whose lookup fails is reported and left unset.
# One Session is shared by all requests so the connection to the server can be
# reused instead of being opened again for every movie.
with requests.Session() as session:
    for i, movie_id in data['Id'].items():
        try:
            # The timeout (in seconds) keeps one stalled request from blocking the whole loop.
            source = session.get("https://www.imdb.com/title/" + str(movie_id), timeout=30)
            # requests does not raise on 4xx/5xx answers by itself; make them errors
            # so an error page is never parsed as a title page.
            source.raise_for_status()
            soup = BeautifulSoup(source.text,'html.parser')

            # Check both lookups explicitly so a changed page layout produces a
            # readable message instead of an AttributeError raised on None.
            origin = soup.find('li', attrs={"data-testid": "title-details-origin"})
            link = origin.find('a') if origin is not None else None
            if link is None:
                raise ValueError('country of origin not found on the title page')

            country = link.text
            data.at[i,'country'] = country

        except Exception as e:
            # Best effort: say which movie failed and carry on with the next one.
            print('Could not get the country for', movie_id, '-', e)
    
   

In [9]:
# Preview the first five rows again, now that the country lookup has run.
data.head(n=5)

Unnamed: 0,Id,Movie Rank,Movie Name,Year of Movie,IMDB Rating,country
0,tt0111161,1,Les Évadés,1994,9.2,United States
1,tt0068646,2,Le Parrain,1972,9.1,United States
2,tt0071562,3,"Le Parrain, 2ᵉ partie",1974,9.0,United States
3,tt0468569,4,The Dark Knight : Le Chevalier noir,2008,9.0,United States
4,tt0050083,5,12 Hommes en colère,1957,8.9,United States


In [None]:
# Export the enriched table for use outside the notebook.
# index=False leaves out the DataFrame's row index, which would otherwise be
# written as an extra, unlabeled first column in the spreadsheet.
data.to_excel("./IMDB_data.xlsx", index=False)

## Now we can import our IMDB data into Tableau and do some analysis