In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests, html
from tqdm import tqdm
import re
import json
from datetime import datetime
import logging

# Tab-separated log records: level, timestamp, calling function, message.
# No level is configured here; the functions below set their logger's level
# from their p_log_level argument.
logging.basicConfig(format='%(levelname)s\t%(asctime)s\t%(funcName)s\t%(message)s')

In [2]:
def all_page_link(p_start_url: str, p_log_level:str = 'INFO') -> list:
    """Collect the URL of every page of a paginated IMDb list.

    Starting at ``p_start_url``, each page is downloaded and searched for the
    "next page" button. Its ``href`` (relative to https://www.imdb.com) becomes
    the next URL to visit. The walk ends when a page has no such button, when
    the button has no ``href``, or when the next URL was already visited.

    Args:
        p_start_url: URL of the first page of the list.
        p_log_level: Level applied to this function's logger.

    Returns:
        The visited page URLs in visiting order, starting with ``p_start_url``.

    Raises:
        requests.exceptions.RequestException: If a page cannot be downloaded,
            including when the request times out.
    """
    logger = logging.getLogger(__name__.ljust(30, ' '))
    logger.setLevel(p_log_level)

    base_url = "https://www.imdb.com"
    request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever

    all_urls = []
    url = p_start_url

    logger.info(f'Start URL: {url}')

    # Follow the "next page" button until the last page is reached.
    while url is not None:
        all_urls.append(url)
        soup = BeautifulSoup(requests.get(url, timeout=request_timeout).text, "html.parser")
        next_links = soup.find_all(class_='flat-button lister-page-next next-page')

        # No button (or a button without a target) means this is the last page.
        href = next_links[0].get('href') if next_links else None
        if not href:
            url = None
            logger.debug('No more pages')
            continue

        next_page = base_url + href
        if next_page in all_urls:
            # A next link that points back to a visited page would loop forever.
            logger.warning(f'Next page was already visited, stopping: {next_page}')
            url = None
        else:
            url = next_page
            logger.debug(f'Next page: {url}')

    # `url` is always None here, so report the start URL instead.
    logger.info(f'Finished, Page links are gathered for URL: {p_start_url}')
    logger.debug(f'URLs:\n{all_urls}')

    return all_urls

In [3]:
def extract_imdb_data(p_content: str, p_log_level:str = 'INFO') -> list:
    """Extract the key facts about one title from its IMDb page HTML.

    The name, release date and aggregate rating are read from the page's
    ``application/ld+json`` metadata block. The number of Oscars won is read
    from the awards link label (e.g. "Won 7 Oscars" or "Won 1 Oscar").

    Args:
        p_content: HTML of the title page. Anything ``BeautifulSoup`` accepts
            works, including the raw bytes of ``requests.Response.content``.
        p_log_level: Level applied to this function's logger.

    Returns:
        ``[name, release_date, rating, votes, oscars]``. ``release_date`` is
        the string ``'N/A'`` when the metadata has no ``datePublished`` entry;
        ``oscars`` is ``0`` when no "Won ... Oscar(s)" label is present.

    Raises:
        ValueError: If the page has no ``application/ld+json`` block (for
            example an error page) or the block is not valid JSON.
        KeyError: If the metadata lacks ``name`` or ``aggregateRating``.
    """
    logger = logging.getLogger(__name__.ljust(30, ' '))
    logger.setLevel(p_log_level)

    logger.info('Started')

    # Lazy %-formatting: the whole page is only rendered when debug is enabled.
    logger.debug('Current content:\n%s', p_content)

    soup = BeautifulSoup(p_content, 'html.parser')

    metadata_tag = soup.find("script", type="application/ld+json")
    if metadata_tag is None:
        raise ValueError('No application/ld+json metadata block found in page content')

    imdb_data = json.loads(metadata_tag.text)

    l_movie_name = html.unescape(imdb_data['name'])

    logger.debug('Data extracted from content:\n%s', imdb_data)

    soup_oscars = soup.find_all("a", attrs={"class":"ipc-metadata-list-item__label ipc-metadata-list-item__label--link"})

    # "Oscars?" also accepts the singular label "Won 1 Oscar"; the count is
    # captured directly instead of taking the first number found in the text.
    oscar_pattern = re.compile(r'Won\s+(\d+)\s+Oscars?')

    num_of_oscars = 0
    for label in soup_oscars:
        oscar_match = oscar_pattern.search(label.text)
        if oscar_match:
            num_of_oscars = int(oscar_match.group(1))

            logger.debug(f'Found Oscars: {num_of_oscars} in: {label.text}')

    if 'datePublished' in imdb_data:
        release_date = imdb_data['datePublished']
    else:
        logger.warning(f'Publish date was not found for: {l_movie_name}')
        release_date = 'N/A'

    l_return = [l_movie_name,
                release_date,
                imdb_data['aggregateRating']['ratingValue'],
                imdb_data['aggregateRating']['ratingCount'],
                num_of_oscars]

    logger.info(f'Finished, Extracted data: {l_return}')

    return l_return

In [4]:
def extract_imdb_top_250_data(p_log_level:str = 'INFO') -> pd.DataFrame:
    """Scrape the IMDb list ``ls068082370`` and return one row per title.

    The list pages are discovered with ``all_page_link``, every link to a title
    on those pages is reduced to its title id (``tt`` followed by digits), and
    each distinct title page is downloaded once and parsed with
    ``extract_imdb_data``.

    Note that ``all_page_link`` already downloads each list page to find the
    next link; the pages are downloaded again here to read the title links.

    Args:
        p_log_level: Level applied to this function's logger and passed on to
            the helper functions.

    Returns:
        A DataFrame with the columns ``name``, ``release_date``, ``rating``,
        ``votes`` and ``oscars``, in the order the titles were first seen.

    Raises:
        requests.exceptions.RequestException: If a page cannot be downloaded,
            including when the request times out.
    """
    logger = logging.getLogger(__name__.ljust(30, ' '))
    logger.setLevel(p_log_level)

    logger.info('Started')

    l_top_250_url = "https://www.imdb.com/list/ls068082370/"
    request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever

    logger.debug(f'Starting on URL: {l_top_250_url}')

    # The same title is linked several times per page (and possibly with
    # different query strings or sub-paths), so links are keyed by title id.
    title_id_pattern = re.compile(r'title/(tt\d+)')

    # A dict is used as an insertion-ordered set to keep the output order stable.
    title_ids = {}
    for page_url in tqdm(all_page_link(p_start_url=l_top_250_url, p_log_level=p_log_level)):
        soup = BeautifulSoup(requests.get(page_url, timeout=request_timeout).text, "html.parser")
        hrefs = [a['href'] for a in soup.select('a[href]')]
        id_matches = (title_id_pattern.search(href) for href in hrefs)
        page_title_ids = list(dict.fromkeys(m.group(1) for m in id_matches if m))

        logger.debug(f'Adding links:\n{page_title_ids}')

        title_ids.update(dict.fromkeys(page_title_ids))

    logger.debug(f'Current set of links:\n{list(title_ids)}')

    imdb_top_250_data = []
    for title_id in title_ids:
        l_current_link = f'https://www.imdb.com/title/{title_id}/'

        logger.debug(f'Extracting data from URL:{l_current_link}')

        l_content = requests.get(l_current_link, timeout=request_timeout).content
        imdb_top_250_data.append(extract_imdb_data(p_content=l_content, p_log_level=p_log_level))

    index = ["name", "release_date", "rating", "votes", "oscars"]

    df = pd.DataFrame(imdb_top_250_data, columns=index)

    logger.info(f'Finished, Result dataframe:\n{df}')

    return df

In [5]:
def oscars_adjustment(p_num_of_oscars: int) -> float:
    """Return the rating bonus earned for a given number of Oscars.

    Bonus tiers:
        0 Oscars      -> 0
        1 or 2        -> 0.3
        3 to 5        -> 0.5
        6 to 10       -> 1
        anything else -> 1.5 (this includes 11 or more, and also negative input)

    Args:
        p_num_of_oscars: Number of Oscars won.

    Returns:
        The bonus to add to the rating.
    """
    if p_num_of_oscars == 0:
        return 0

    # (exclusive upper bound, bonus) pairs, checked in ascending order.
    bonus_tiers = ((3, 0.3), (6, 0.5), (11, 1))
    if p_num_of_oscars > 0:
        for upper_bound, bonus in bonus_tiers:
            if p_num_of_oscars < upper_bound:
                return bonus

    return 1.5

In [6]:
def write_imdb_data_to_csv(p_df:pd.DataFrame, 
                           p_file:str, 
                           p_sep:str = ';', 
                           p_overwrite:bool = False, 
                           p_log_level:str = 'INFO'):
    """Write a DataFrame to a CSV file with its index stored as a ``rank`` column.

    Args:
        p_df: Data to write. The index is written as the first column under
            the header ``rank``.
        p_file: Path of the target file.
        p_sep: Field separator.
        p_overwrite: When ``False`` (the default) an existing file at
            ``p_file`` is left untouched and ``FileExistsError`` is raised.
            Pass ``True`` to replace it.
        p_log_level: Level applied to this function's logger.

    Raises:
        FileExistsError: If ``p_file`` already exists and ``p_overwrite`` is
            ``False``.
    """
    import os  # local import: only needed for the overwrite check

    logger = logging.getLogger(__name__.ljust(30, ' '))
    logger.setLevel(p_log_level)

    # Only paths can be checked for existence; other targets are passed
    # straight through to pandas as before.
    if not p_overwrite and isinstance(p_file, (str, os.PathLike)) and os.path.exists(p_file):
        raise FileExistsError(f'File already exists and p_overwrite is False: {p_file}')

    logger.info(f'Started writing file: {p_file}')

    p_df.to_csv(path_or_buf=p_file, sep=p_sep, index=True, header=True, index_label='rank')

    logger.info(f'Finished writing file: {p_file}')

In [None]:
# Scrape the list: one HTTP request per list page and per title page, so this
# cell needs network access and takes a while to finish.
df = extract_imdb_top_250_data(p_log_level='INFO')

INFO	2022-10-23 11:39:01,616	extract_imdb_top_250_data	Started
INFO	2022-10-23 11:39:01,622	all_page_link	Start URL: https://www.imdb.com/list/ls068082370/
INFO	2022-10-23 11:39:17,235	all_page_link	Finished, Page links are gathered for URL: None
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:12<00:00,  4.08s/it]
INFO	2022-10-23 11:39:30,890	extract_imdb_data	Started
INFO	2022-10-23 11:39:31,445	extract_imdb_data	Finished, Extracted data: ['M - Eine Stadt sucht einen Mörder', '1931-08-31', 8.3, 157942, 0]
INFO	2022-10-23 11:39:32,421	extract_imdb_data	Started
INFO	2022-10-23 11:39:33,037	extract_imdb_data	Finished, Extracted data: ['The Prestige', '2007-01-04', 8.5, 1322013, 0]
INFO	2022-10-23 11:39:34,386	extract_imdb_data	Started
INFO	2022-10-23 11:39:34,507	extract_imdb_data	Finished, Extracted data: ['Modern Times', '1936-10-07', 8.5, 241850, 0]
INFO	2022-10-23 11:39:36,018	extract_imdb_data	Started
INFO	2022-10-23 11:39:36,155	ex

In [None]:
# Preview the first rows of the scraped data.
df.head(5)

In [None]:
# The 20 best titles by raw IMDb rating, before any adjustment is applied.
df.sort_values('rating', ascending=False).head(20)

In [None]:
# Highest vote count among the 20 best-rated titles; it is the reference
# point for the vote-based penalty. The column is selected before reducing so
# only `votes` is aggregated, not every column of the frame.
max_votes = df.sort_values('rating', ascending=False).head(20)['votes'].max()

In [None]:
# Vote penalty: subtract 0.1 rating points for every full 100,000 votes a
# title trails max_votes.
df['adjusted_rating1'] = df['rating'] - ((max_votes - df['votes']) // 100000) * 0.1

In [None]:
# Oscar bonus for each title, looked up from its number of Oscars.
df['adjusted_rating2'] = list(map(oscars_adjustment, df['oscars']))

In [None]:
# Final score: the vote-penalised rating plus the Oscar bonus.
df['adjusted_rating'] = df['adjusted_rating1'] + df['adjusted_rating2']

In [None]:
# The two intermediate columns are no longer needed once the final score exists.
df = df.drop(columns=["adjusted_rating1", "adjusted_rating2"])

In [None]:
# Top 20 titles by adjusted rating, renumbered so the index is the rank (1-20).
# The former argument-less .reindex() call only made a copy and is dropped.
sorted_df = (
    df.sort_values('adjusted_rating', ascending=False)
      .head(20)
      .reset_index(drop=True)
)
sorted_df.index += 1

In [None]:
# Display the final ranking.
sorted_df

In [None]:
# The file name carries the current timestamp, so each run writes a new file.
write_imdb_data_to_csv(p_df=sorted_df, p_file=f'imdb_top_250_adjusted_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv')