In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests, html
from tqdm import tqdm
import re
import json
from datetime import datetime
import logging

# Configure logging once for the whole notebook.
# Every record is printed as tab-separated: level, timestamp, calling function name, message.
logging.basicConfig(format='%(levelname)s\t%(asctime)s\t%(funcName)s\t%(message)s')

In [2]:
def all_page_link(p_start_url: str, p_log_level: str = 'INFO') -> list:
    """
    Collects the URL of every page of a paginated IMDB list by following its "next page" button.

    Each page is downloaded and searched for an element whose class attribute is
    'flat-button lister-page-next next-page'. Its href (prefixed with the IMDB host)
    becomes the next page to visit. The walk ends at the first page without such a button.

    :param p_start_url: str
        URL of the first page of the list
    :param p_log_level: str
        Log level to logging
    :return: list
        Page URLs in the order they were visited, starting with p_start_url
    """

    # The logger name is the module name padded to 30 characters (shared by the notebook's functions)
    logger = logging.getLogger(__name__.ljust(30, ' '))
    logger.setLevel(p_log_level)

    logger.info(f'Start URL: {p_start_url}')

    all_urls = []
    visited = set()  # Guards against a "next" link that points back to an earlier page
    url = p_start_url

    while url is not None:  # Loop over the pages, terminates when the last page is reached
        if url in visited:
            logger.warning(f'Page already visited, stopping to avoid an endless loop: {url}')
            break
        visited.add(url)
        all_urls.append(url)

        # A timeout keeps a stalled connection from blocking the notebook forever
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract the next page link; a missing button or a button without href means "last page"
        next_links = soup.find_all(class_='flat-button lister-page-next next-page')
        next_href = next_links[0].get('href') if next_links else None

        if next_href is None:
            logger.debug('No more pages')
            url = None
        else:
            url = "https://www.imdb.com" + next_href
            logger.debug(f'Next page: {url}')

    # Report the start URL here: the loop variable no longer holds a useful value at this point
    logger.info(f'Finished, Page links are gathered for URL: {p_start_url}')
    logger.debug(f'URLs:\n{all_urls}')

    return all_urls

In [3]:
def extract_imdb_data(p_content: str, p_log_level: str = 'INFO') -> list:
    """
    Extracts "name", "release_date", "rating", "votes" and "oscars" from the content of an IMDB title page.

    NOTE: a later cell of this notebook defines a function with the same name;
    after running all cells that later definition is the one in effect.

    :param p_content: str
        Content of the title page ( Html response; the notebook passes the raw response bytes )
    :param p_log_level: str
        Log level to logging
    :return: list
        [name, release_date, rating, votes, oscars]; release_date is 'N/A' when the page has none
    :raises ValueError:
        If the page contains no <script type="application/ld+json"> block
    :raises KeyError:
        If the embedded JSON has no "name" or no "aggregateRating" values
    """

    # The logger name is the module name padded to 30 characters (shared by the notebook's functions)
    logger = logging.getLogger(__name__.ljust(30, ' '))
    logger.setLevel(p_log_level)

    logger.info('Started')

    logger.debug(f'Current content:\n{p_content}')

    soup = BeautifulSoup(p_content, 'html.parser')

    # The movie facts are read from the JSON document embedded in the page
    script_tag = soup.find("script", type="application/ld+json")
    if script_tag is None:
        raise ValueError('No "application/ld+json" script block found in page content')

    imdb_data = json.loads(script_tag.text)

    # Names may contain html escape sequences, unescape them to get the actual name
    l_movie_name = html.unescape(imdb_data['name'])

    logger.debug(f'Data extracted from content:\n{imdb_data}')

    soup_oscars = soup.find_all("a", attrs={"class": "ipc-metadata-list-item__label ipc-metadata-list-item__label--link"})

    # Accept both "Won 1 Oscar" and "Won N Oscars", and read the number that follows "Won"
    # rather than the first number that happens to appear in the label
    oscars_won = re.compile(r'Won\s*(\d+)\s*Oscars?')

    num_of_oscars = 0
    for label in soup_oscars:
        found = oscars_won.search(label.text)
        if found:
            num_of_oscars = int(found.group(1))

            logger.debug(f'Found Oscars: {num_of_oscars} in: {label.text}')

    if 'datePublished' in imdb_data:
        release_date = imdb_data['datePublished']
    else:
        logger.warning(f'Publish date was not found for: {l_movie_name}')
        release_date = 'N/A'

    l_return = [l_movie_name,
                release_date,
                imdb_data['aggregateRating']['ratingValue'],
                imdb_data['aggregateRating']['ratingCount'],
                num_of_oscars]

    logger.info(f'Finished, Extracted data: {l_return}')

    return l_return

In [4]:
def extract_imdb_data(p_content: str, p_log_level: str = 'INFO') -> list:
    """
    Extracts data from IMDB page content: "name", "release_date", "rating", "votes", "oscars"

    This definition replaces the function of the same name defined in the previous cell.

    :param p_content: str
        Content of the page to extract from ( Html response; the notebook passes the raw response bytes )
    :param p_log_level: str
        Log level to logging
    :return: list
        List of extracted content: "name", "release_date", "rating", "votes", "oscars"
        ( "release_date" is 'N/A' when the page data has no "datePublished" field )
    :raises ValueError:
        If the page contains no <script type="application/ld+json"> block
    :raises KeyError:
        If the embedded JSON has no "name" or no "aggregateRating" values
    """

    # Initiate logging for this function.
    # The logger name is the module name padded to 30 characters, so it is shared by the notebook's functions
    logger = logging.getLogger(__name__.ljust(30, ' '))
    logger.setLevel(p_log_level)

    logger.info('Started')

    logger.debug(f'Current content:\n{p_content}')

    soup = BeautifulSoup(p_content, 'html.parser')  # Parse page content

    # Locate the script block holding the movie data; fail with a clear message when it is absent
    script_tag = soup.find("script", type="application/ld+json")
    if script_tag is None:
        raise ValueError('No "application/ld+json" script block found in page content')

    soup_data = script_tag.text

    logger.debug(f'Script data found:\n{soup_data}')

    imdb_data = json.loads(soup_data)  # Parse script data into JSON object

    # Movie names might contain html escape sequences
    # Must unescape them to get the actual name
    l_movie_name = html.unescape(imdb_data['name'])

    logger.debug(f'Data extracted from content:\n{imdb_data}')

    # Parse for data containing Awards won and Nominations info
    soup_oscars = soup.find_all("a",
                                attrs={"class": "ipc-metadata-list-item__label ipc-metadata-list-item__label--link"})

    # Matches 'Won 1 Oscar' as well as 'Won X Oscars' and captures the number that follows 'Won'
    oscars_won = re.compile(r'Won\s*(\d+)\s*Oscars?')

    num_of_oscars = 0  # Default Oscars won is zero
    for label in soup_oscars:  # If there is a section with relevant data, loop through
        found = oscars_won.search(label.text)
        if found:
            num_of_oscars = int(found.group(1))  # Extract integer of Oscars won

            logger.debug(f'Found Oscars: {num_of_oscars} in: {label.text}')

    # The date of release can be absent from the page data
    if 'datePublished' in imdb_data:
        release_date = imdb_data['datePublished']
    else:
        logger.warning(f'Publish date was not found for: {l_movie_name}')  # Log a warning and set release date as "N/A"
        release_date = 'N/A'

    l_return = [l_movie_name,
                release_date,
                imdb_data['aggregateRating']['ratingValue'],
                imdb_data['aggregateRating']['ratingCount'],
                num_of_oscars]

    logger.info(f'Finished, Extracted data: {l_return}')

    return l_return

In [5]:
def extract_imdb_top_250_data(p_log_level: str = 'INFO',
                              p_list_url: str = "https://www.imdb.com/list/ls068082370/") -> pd.DataFrame:
    """
    Scrapes an IMDB list and returns one row per title found on its pages.

    All pages of the list are visited, every link pointing to a title page is collected,
    and each title page is downloaded and parsed with extract_imdb_data.

    :param p_log_level: str
        Log level to logging ( also passed on to the helper functions )
    :param p_list_url: str
        URL of the first page of the list, defaults to the list used by this notebook
    :return: pd.DataFrame
        Columns: "name", "release_date", "rating", "votes", "oscars"
    """

    # The logger name is the module name padded to 30 characters (shared by the notebook's functions)
    logger = logging.getLogger(__name__.ljust(30, ' '))
    logger.setLevel(p_log_level)

    logger.info('Started')

    logger.debug(f'Starting on URL: {p_list_url}')

    page_urls = all_page_link(p_start_url=p_list_url, p_log_level=p_log_level)

    # Every matching href is reduced to its canonical "/title/tt<digits>/" path, so links that
    # differ only in query string or sub-path map to one title and each title is fetched once
    title_path = re.compile(r'/title/tt\d+')

    links_set = set()
    for page_url in tqdm(page_urls):
        # A timeout keeps a stalled connection from blocking the notebook forever
        soup = BeautifulSoup(requests.get(page_url, timeout=30).text, "html.parser")

        current_link_set = set()
        for anchor in soup.select('a[href]'):
            found = title_path.search(anchor['href'])
            if found:
                current_link_set.add(found.group(0) + '/')

        logger.debug(f'Adding links:\n{current_link_set}')

        links_set |= current_link_set

    logger.debug(f'Current set of links:\n{links_set}')

    imdb_top_250_data = []
    for link in sorted(links_set):  # Sorted, so the row order is the same on every run
        l_current_link = f'https://www.imdb.com{link}'

        logger.debug(f'Extracting data from URL:{l_current_link}')

        l_content = requests.get(l_current_link, timeout=30).content
        imdb_top_250_data.append(extract_imdb_data(p_content=l_content, p_log_level=p_log_level))

    index = ["name", "release_date", "rating", "votes", "oscars"]

    df = pd.DataFrame(imdb_top_250_data, columns=index)

    logger.info(f'Finished, Result dataframe:\n{df}')

    return df

In [6]:
def oscars_adjustment(p_num_of_oscars: int) -> float:
    """
    Converts the number of Oscars won into the bonus added to a movie's rating.

        0 Oscars        -> 0.0
        1 or 2 Oscars   -> 0.3
        3 to 5 Oscars   -> 0.5
        6 to 10 Oscars  -> 1.0
        more than 10    -> 1.5

    :param p_num_of_oscars: int
        Number of Oscars won, must not be negative
    :return: float
        Bonus to add to the rating
    :raises ValueError:
        If p_num_of_oscars is negative
    """
    # A negative count would otherwise fall through every tier and receive the highest bonus
    if p_num_of_oscars < 0:
        raise ValueError(f'Number of Oscars must not be negative: {p_num_of_oscars}')

    if p_num_of_oscars == 0:
        return 0.0
    if p_num_of_oscars < 3:
        return 0.3
    if p_num_of_oscars < 6:
        return 0.5
    if p_num_of_oscars < 11:
        return 1.0
    return 1.5

In [7]:
def write_imdb_data_to_csv(p_df: pd.DataFrame,
                           p_file: str,
                           p_sep: str = ';',
                           p_overwrite: bool = False,
                           p_log_level: str = 'INFO'):
    """
    Writes the dataframe to a CSV file with a header row; the index is written as column "rank".

    :param p_df: pd.DataFrame
        Data to write
    :param p_file: str
        Path of the file to write
    :param p_sep: str
        Field separator
    :param p_overwrite: bool
        When False ( default ) an already existing file is left untouched and an error is raised
    :param p_log_level: str
        Log level to logging
    :raises FileExistsError:
        If p_file already exists and p_overwrite is False
    """
    import os  # Only needed for the overwrite guard below

    # The logger name is the module name padded to 30 characters (shared by the notebook's functions)
    logger = logging.getLogger(__name__.ljust(30, ' '))
    logger.setLevel(p_log_level)

    # Honour p_overwrite: refuse to replace an existing file unless explicitly allowed.
    # Only path-like targets can be checked; anything else is passed to pandas unchanged.
    if not p_overwrite and isinstance(p_file, (str, os.PathLike)) and os.path.exists(p_file):
        raise FileExistsError(f'File already exists and p_overwrite is False: {p_file}')

    logger.info(f'Started writing file: {p_file}')

    p_df.to_csv(path_or_buf=p_file, sep=p_sep, index=True, header=True, index_label='rank')

    logger.info(f'Finished writing file: {p_file}')

In [None]:
# Run the full scrape. This issues one HTTP request per list page and one per title page, so it takes a while.
df = extract_imdb_top_250_data(p_log_level='INFO')

INFO	2022-10-23 14:42:56,566	extract_imdb_top_250_data	Started
INFO	2022-10-23 14:42:56,573	all_page_link	Start URL: https://www.imdb.com/list/ls068082370/
INFO	2022-10-23 14:43:09,711	all_page_link	Finished, Page links are gathered for URL: None
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:13<00:00,  4.37s/it]
INFO	2022-10-23 14:43:23,864	extract_imdb_data	Started
INFO	2022-10-23 14:43:23,997	extract_imdb_data	Finished, Extracted data: ['Se7en', '1996-11-07', 8.6, 1633996, 0]
INFO	2022-10-23 14:43:24,909	extract_imdb_data	Started
INFO	2022-10-23 14:43:25,037	extract_imdb_data	Finished, Extracted data: ['Indiana Jones and the Temple of Doom', '1986-07-31', 7.5, 486587, 0]
INFO	2022-10-23 14:43:25,941	extract_imdb_data	Started
INFO	2022-10-23 14:43:26,078	extract_imdb_data	Finished, Extracted data: ['El secreto de sus ojos', '2010-10-21', 8.2, 209572, 0]
INFO	2022-10-23 14:43:26,989	extract_imdb_data	Started
INFO	2022-10-23 14:43:27,

INFO	2022-10-23 14:44:16,209	extract_imdb_data	Started
INFO	2022-10-23 14:44:16,883	extract_imdb_data	Finished, Extracted data: ['The Godfather', '1982-03-25', 9.2, 1839009, 3]
INFO	2022-10-23 14:44:17,826	extract_imdb_data	Started
INFO	2022-10-23 14:44:17,953	extract_imdb_data	Finished, Extracted data: ['Some Like It Hot', '1965-08-05', 8.2, 266904, 0]
INFO	2022-10-23 14:44:18,930	extract_imdb_data	Started
INFO	2022-10-23 14:44:19,054	extract_imdb_data	Finished, Extracted data: ['Kill Bill: Vol. 1', '2003-10-16', 8.2, 1107239, 0]
INFO	2022-10-23 14:44:20,167	extract_imdb_data	Started
INFO	2022-10-23 14:44:20,276	extract_imdb_data	Finished, Extracted data: ['Annie Hall', '1980-05-22', 8, 265706, 4]
INFO	2022-10-23 14:44:21,188	extract_imdb_data	Started
INFO	2022-10-23 14:44:21,307	extract_imdb_data	Finished, Extracted data: ['Tonari no Totoro', '1988-04-16', 8.1, 337187, 0]
INFO	2022-10-23 14:44:22,286	extract_imdb_data	Started
INFO	2022-10-23 14:44:22,558	extract_imdb_data	Finished, E

INFO	2022-10-23 14:45:13,472	extract_imdb_data	Started
INFO	2022-10-23 14:45:13,791	extract_imdb_data	Finished, Extracted data: ['Once Upon a Time in America', '1989-03-30', 8.3, 350819, 0]
INFO	2022-10-23 14:45:14,762	extract_imdb_data	Started
INFO	2022-10-23 14:45:14,881	extract_imdb_data	Finished, Extracted data: ['Good Will Hunting', '1999-02-25', 8.3, 959367, 2]
INFO	2022-10-23 14:45:15,842	extract_imdb_data	Started
INFO	2022-10-23 14:45:15,957	extract_imdb_data	Finished, Extracted data: ['Gangs of Wasseypur', '2012-06-22', 8.2, 96215, 0]
INFO	2022-10-23 14:45:16,819	extract_imdb_data	Started
INFO	2022-10-23 14:45:17,064	extract_imdb_data	Finished, Extracted data: ['Scarface', '1983-12-09', 8.3, 833866, 0]
INFO	2022-10-23 14:45:17,902	extract_imdb_data	Started
INFO	2022-10-23 14:45:18,023	extract_imdb_data	Finished, Extracted data: ['3 Idiots', '2009-12-25', 8.4, 398133, 0]
INFO	2022-10-23 14:45:18,963	extract_imdb_data	Started
INFO	2022-10-23 14:45:19,082	extract_imdb_data	Finish

INFO	2022-10-23 14:46:11,091	extract_imdb_data	Started
INFO	2022-10-23 14:46:11,221	extract_imdb_data	Finished, Extracted data: ['Monsters, Inc.', '2002-02-14', 8.1, 903068, 0]
INFO	2022-10-23 14:46:12,141	extract_imdb_data	Started
INFO	2022-10-23 14:46:12,781	extract_imdb_data	Finished, Extracted data: ['Mou gaan dou', '2004-01-29', 8, 125008, 0]
INFO	2022-10-23 14:46:13,670	extract_imdb_data	Started
INFO	2022-10-23 14:46:13,801	extract_imdb_data	Finished, Extracted data: ['The Maltese Falcon', '1941-10-18', 8, 158889, 0]
INFO	2022-10-23 14:46:15,254	extract_imdb_data	Started
INFO	2022-10-23 14:46:15,381	extract_imdb_data	Finished, Extracted data: ["Le fabuleux destin d'Amélie Poulain", '2002-02-21', 8.3, 753735, 0]
INFO	2022-10-23 14:46:16,391	extract_imdb_data	Started
INFO	2022-10-23 14:46:16,508	extract_imdb_data	Finished, Extracted data: ['Intouchables', '2011-12-22', 8.5, 850590, 0]
INFO	2022-10-23 14:46:17,650	extract_imdb_data	Started
INFO	2022-10-23 14:46:17,768	extract_imdb_d

In [None]:
# Preview the first five scraped rows.
df.head(5)

In [None]:
# The 20 titles with the highest unadjusted rating.
df.sort_values('rating', ascending=False).head(20)

In [None]:
# Highest vote count among the 20 best-rated titles; used as the reference for the vote penalty.
max_votes = df.sort_values('rating', ascending=False).head(20)['votes'].max()

In [None]:
# Vote penalty: 0.1 rating points for every full 100000 votes a title is behind max_votes.
# NOTE(review): max_votes comes from the 20 best-rated titles only, so a title can have more
# votes than max_votes; floor division then yields a negative step count, i.e. a bonus - confirm intended.
df['adjusted_rating1'] = df['rating'] - 0.1 * ((max_votes - df['votes']) // 100000)

In [None]:
# Oscar bonus for every title, looked up from its number of Oscars won.
df['adjusted_rating2'] = df['oscars'].map(oscars_adjustment)

In [None]:
# Final score: vote-penalised rating plus Oscar bonus.
df['adjusted_rating'] = df['adjusted_rating1'].add(df['adjusted_rating2'])

In [None]:
# Remove the two helper columns in one step.
# errors='ignore' keeps this cell re-runnable: running it again after the columns are gone is a no-op.
df = df.drop(columns=["adjusted_rating1", "adjusted_rating2"], errors='ignore')

In [None]:
# Keep the 20 titles with the highest adjusted rating and number them from 1,
# so the index can be read as the rank.
sorted_df = (
    df.sort_values('adjusted_rating', ascending=False)
      .head(20)
      .reset_index(drop=True)
)
sorted_df.index = sorted_df.index + 1

In [None]:
# Show the final table, ranked by adjusted rating.
sorted_df

In [None]:
# Export the ranked table; the file name carries the current date and time (to the second).
write_imdb_data_to_csv(p_df=sorted_df, p_file=f'imdb_top_250_adjusted_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv')