In [None]:
from bs4 import BeautifulSoup
import pandas as pd 
import seaborn as sns
import requests
import re


In [None]:
# Download the IMDb Top 250 chart page and parse it into a BeautifulSoup tree.
# A browser-like User-Agent header is sent along with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'}
# The timeout keeps the cell from hanging forever if the server never answers.
title_page = requests.get('https://www.imdb.com/chart/top/?ref_=nv_mv_250', headers=headers, timeout=30)
# Stop here on an HTTP error status instead of silently parsing an error page.
title_page.raise_for_status()
tp_soup = BeautifulSoup(title_page.content, 'html.parser')

In [None]:
# One <li> per chart entry. NOTE(review): the class string contains what look
# like build-generated tokens ("sc-1364e729-0 caNpAE"); if the site markup
# changes this returns an empty list -- confirm against the live page.
top_movies = tp_soup.find_all('li', attrs={'class': 'ipc-metadata-list-summary-item sc-1364e729-0 caNpAE cli-parent'})

# Strip the numeric prefix (digits, a period, optional whitespace) from the
# start of each heading. The pattern is anchored and limited to a single
# substitution so that a "digits + period" sequence occurring later inside a
# title is left intact.
rank_prefix = re.compile(r'^\d+\.\s*')
titles = [
    rank_prefix.sub('', str(item.find('h3', attrs={'class': 'ipc-title__text'}).contents[0]), count=1)
    for item in top_movies
]
display(titles)


In [None]:
# Locate every IMDb rating span on the page and flatten each one to its text.
rating_spans = tp_soup.find_all(
    'span',
    class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating',
)

# Combined text of each span; the parenthesised part is parsed separately
# further down the notebook, so the combined strings are kept around.
user_rating_comb = []
for span in rating_spans:
    user_rating_comb.append(span.get_text(strip=True))

# Drop the parenthesised part, leaving only the text in front of it.
parenthesised = re.compile(r'\(.+\)')
user_rating = [parenthesised.sub('', text) for text in user_rating_comb]
user_rating

In [None]:
# Multiplier applied for each magnitude suffix; an absent suffix means the
# number is already a plain count.
_VOTE_MULTIPLIERS = {'': 1, 'K': 1_000, 'M': 1_000_000}


def _parse_vote_count(text):
    """Return the parenthesised vote count found in ``text`` as a float.

    The count is the number directly after the first opening parenthesis that
    is followed by digits, optionally with a decimal part and a ``K`` or ``M``
    suffix (for example ``"(2.9M)"`` -> ``2900000.0``, ``"(135K)"`` ->
    ``135000.0``, ``"(950)"`` -> ``950.0``).

    Any number of decimal digits is accepted, and a count without a suffix is
    still converted to a float, so the resulting list is uniformly numeric.

    Raises:
        ValueError: if ``text`` contains no parenthesised number.
    """
    match = re.search(r'\((\d+(?:\.\d+)?)([KM]?)', text)
    if match is None:
        raise ValueError(f'no vote count found in {text!r}')
    number, suffix = match.groups()
    return float(number) * _VOTE_MULTIPLIERS[suffix]


# One numeric vote count per entry of user_rating_comb, in the same order.
rating_count = [_parse_vote_count(item) for item in user_rating_comb]

len(rating_count)

First Analysis: Year the movies came out.

Note: The first feature-length film with synchronized dialogue (*The Jazz Singer*) was released in 1927.

Second Analysis: Place in table vs number of ratings

In [None]:
# Find the metadata <div> of every chart entry, then collect the <span> items
# inside each one. The result is a list (one element per div) of span lists.
ratings = tp_soup.find_all('div', class_='sc-be6f1408-7 iUtHEN cli-title-metadata')

rating_items = []
for metadata_row in ratings:
    spans = metadata_row.find_all('span', class_='sc-be6f1408-8 fcCUPU cli-title-metadata-item')
    rating_items.append(spans)

display(rating_items)

In [None]:
# Chart position is the 1-based index of each scraped title. Deriving it from
# the number of titles (instead of assuming exactly 250) keeps this cell
# working when the page yields a different number of entries.
place = list(range(1, len(titles) + 1))

years_list_best = []
length_list_best = []
rating_list_best = []

# Each element of rating_items holds the metadata spans of one movie:
# index 0 is parsed as the year, index 1 is kept as the length text, and
# index 2 (when present) is the content rating.
for spans in rating_items:
    years_list_best.append(int(spans[0].contents[0]))
    # str() turns the BeautifulSoup string objects into plain Python strings
    # so the DataFrame does not hold references into the parse tree.
    length_list_best.append(str(spans[1].contents[0]))
    if len(spans) > 2:
        rating_list_best.append(str(spans[2].contents[0]))
    else:
        # No third span: record the movie as not rated.
        rating_list_best.append('Not Rated')

# All scraped columns must line up row-for-row; report which one is off
# rather than letting the DataFrame constructor fail with a generic message.
column_lengths = {
    'titles': len(titles),
    'rating_count': len(rating_count),
    'user_rating': len(user_rating),
    'rating_items': len(rating_items),
}
if len(set(column_lengths.values())) > 1:
    raise ValueError(f'scraped columns have different lengths: {column_lengths}')

movie_df = pd.DataFrame({
    'Title': titles,
    'Place': place,
    'Number_of_ratings': rating_count,
    'Average_rating': user_rating,
    'Years': years_list_best,
    'Length': length_list_best,
    'Rating': rating_list_best,
})

# Average_rating was scraped as text; convert it for the numeric analyses.
movie_df['Average_rating'] = pd.to_numeric(movie_df['Average_rating'])
movie_df

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt

# Plain scatter of average rating against number of ratings, drawn on its own
# explicit axes so the title lands on the intended figure.
fig_scatter, ax_scatter = plt.subplots()
sns.scatterplot(data=movie_df, x='Average_rating', y='Number_of_ratings', ax=ax_scatter)
ax_scatter.set_title('Average Rating vs number of reviews')

# Same data with a fitted regression line, on a second figure.
fig_reg, ax_reg = plt.subplots()
sns.regplot(data=movie_df, x='Average_rating', y='Number_of_ratings', ax=ax_reg)
ax_reg.set_title('Average Rating vs number of reviews')

# Pearson correlation between the two plotted columns.
print('all movies: ', stats.pearsonr(x=movie_df['Average_rating'], y=movie_df['Number_of_ratings']))

# Render both figures; no extra empty figure is created at the end of the cell.
plt.show()

In [None]:
# Films that are not rated were not made in the US
print(movie_df['Rating'].unique())
print(movie_df['Rating'].value_counts().keys())

# Collapse the individual content ratings into three coarse categories.
# Ratings missing from this mapping are left unchanged by .replace().
mapping = {'R':'Restricted', 'X':'Restricted', '18+': 'Restricted', 'TV-MA':'Restricted', 'G':'Unrestricted', 
           'PG':'Unrestricted', 'TV-PG':'Unrestricted', 'PG-13':'Unrestricted', 'Unrated':'Other', 
           'Passed':'Other', 'Approved':'Other', 'GP':'Other', 'Not Rated':'Other'}
movie_df['Rating_cat'] = movie_df['Rating'].replace(mapping)

# How many movies fall into each category.
sns.countplot(data=movie_df, x='Rating_cat', hue='Rating_cat')
plt.title('Rating category counts')

# Density of the average rating per category, overlaid on a new figure.
plt.figure()
for category in ('Restricted', 'Unrestricted', 'Other'):
    sns.kdeplot(data=movie_df[movie_df['Rating_cat'] == category], x='Average_rating', label=category)
plt.legend()
plt.title('Rating category densities')

# Two-sample t-test: age-restricted vs. unrestricted movies. The
# 'Unrestricted' sample must not be overwritten with the 'Other' category,
# otherwise the test silently compares Restricted against Other.
age_restricted = movie_df.loc[movie_df['Rating_cat'] == 'Restricted', 'Average_rating']
not_age_restricted = movie_df.loc[movie_df['Rating_cat'] == 'Unrestricted', 'Average_rating']
stats.ttest_ind(age_restricted, not_age_restricted, nan_policy='omit')