# Project 3: Munging and analyzing data from the web

In [None]:
from bs4 import BeautifulSoup
import requests
import seaborn as sns
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.simplefilter("ignore", UserWarning)


## Getting data and cleaning

In [None]:
# Browser-like User-Agent sent with both requests.
# NOTE(review): presumably required because the site rejects the default
# python-requests agent -- confirm.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'}

# A timeout bounds how long a stalled connection can hang the notebook, and
# raise_for_status() turns an HTTP 4xx/5xx answer into an exception instead of
# letting an error page be parsed as if it were the chart.
html_best = requests.get('https://www.imdb.com/chart/top/?ref_=nv_mv_250', headers=headers, timeout=30)
html_best.raise_for_status()
html_worst = requests.get('https://m.imdb.com/chart/bottom/', headers=headers, timeout=30)
html_worst.raise_for_status()

# Parse both pages with the stdlib-backed HTML parser.
soup_best = BeautifulSoup(html_best.text, 'html.parser')
soup_worst = BeautifulSoup(html_worst.text, 'html.parser')

# Shows the whole parsed bottom-chart page (large output).
display(soup_worst)

In [None]:
# Class string identifying one chart row (<li>) on both chart pages.
# NOTE(review): the 'sc-...' / 'caNpAE' parts look auto-generated and may change
# when the site is redeployed -- confirm before relying on them.
chart_row_class = 'ipc-metadata-list-summary-item sc-1364e729-0 caNpAE cli-parent'


def _chart_titles(rows):
    """Return the title text of every chart row with its leading rank removed.

    Each row's <h3 class="ipc-title__text"> holds text such as "12. Some Title";
    only the "<rank>. " prefix is stripped.
    """
    titles = []
    for row in rows:
        heading = row.find('h3', attrs={'class': 'ipc-title__text'})
        # Anchored with ^ so that only the rank prefix is removed; an unanchored
        # pattern would also delete any "<digits>." that occurs inside a title.
        titles.append(re.sub(r'^\d+\.\s*', '', str(heading.contents[0])))
    return titles


top_movies = soup_best.find_all('li', attrs={'class': chart_row_class})
titles_best = _chart_titles(top_movies)
display(titles_best)

bottom_movies = soup_worst.find_all('li', attrs={'class': chart_row_class})
titles_worst = _chart_titles(bottom_movies)
display(titles_worst)

In [None]:
# Selectors for the per-movie metadata container and the <span> items inside it.
# Later cells read the spans positionally: [0] year, [1] runtime, [2] certificate
# (the third one may be absent).
metadata_div_attrs = {'class': 'sc-be6f1408-7 iUtHEN cli-title-metadata'}
metadata_span_attrs = {'class': 'sc-be6f1408-8 fcCUPU cli-title-metadata-item'}

# Top chart: one list of <span> tags per movie.
ratings = soup_best.find_all('div', attrs=metadata_div_attrs)
rating_items = []
for metadata_div in ratings:
    rating_items.append(metadata_div.find_all('span', attrs=metadata_span_attrs))
display(rating_items)

# Bottom chart: same extraction.
ratings_worst = soup_worst.find_all('div', attrs=metadata_div_attrs)
rating_items_worst = []
for metadata_div in ratings_worst:
    rating_items_worst.append(metadata_div.find_all('span', attrs=metadata_span_attrs))
display(rating_items_worst)

In [None]:
# Kept so the notebook's warning behaviour stays unchanged; the code in this
# cell does not rely on chained assignment.
pd.set_option('mode.chained_assignment', None)

# Collect year, runtime text and certificate for every top-chart movie.
years_list_best = []
length_list_best = []
rating_list_best = []

for spans in rating_items:
    years_list_best.append(int(spans[0].contents[0]))
    length_list_best.append(str(spans[1].contents[0]))
    # The certificate span is optional; fall back to a placeholder label.
    rating_list_best.append(str(spans[2].contents[0]) if len(spans) > 2 else 'Not Rated')

df = pd.DataFrame({
    'Titles': titles_best,
    'Years': years_list_best,
    'Length': length_list_best,
    'Rating': rating_list_best,
})

# Convert runtimes written as "2h 22m", "2h" or "45m" to minutes in one
# vectorised pass. The previous per-row `df['Length'][i] = ...` was chained
# assignment, which writes to a temporary object under pandas Copy-on-Write and
# leaves the column unparsed.
runtime_parts = (
    df['Length']
    .astype(str)
    .str.strip()
    .str.extract(r'^(?:(?P<hours>\d+)h)?\s*(?:(?P<minutes>\d+)m)?$')
)
runtime_hours = pd.to_numeric(runtime_parts['hours'])
runtime_minutes = pd.to_numeric(runtime_parts['minutes'])

# A row with neither an hour nor a minute component could not be parsed.
unparsed_runtime = runtime_hours.isna() & runtime_minutes.isna()
if unparsed_runtime.any():
    raise ValueError(
        f"Unrecognised runtime value(s): {df.loc[unparsed_runtime, 'Length'].unique().tolist()}"
    )

df['Length'] = (runtime_hours.fillna(0) * 60 + runtime_minutes.fillna(0)).astype(float)
df

In [None]:
# Collect year, runtime text and certificate for every bottom-chart movie.
years_list_worst = []
length_list_worst = []
rating_list_worst = []

for spans in rating_items_worst:
    years_list_worst.append(int(spans[0].contents[0]))
    length_list_worst.append(str(spans[1].contents[0]))
    # The certificate span is optional; fall back to a placeholder label.
    rating_list_worst.append(str(spans[2].contents[0]) if len(spans) > 2 else 'Not Rated')

df_worst = pd.DataFrame({
    'Titles': titles_worst,
    'Years': years_list_worst,
    'Length': length_list_worst,
    'Rating': rating_list_worst,
})

# Convert runtimes written as "2h 22m", "2h" or "45m" to minutes in one
# vectorised pass. The previous per-row `df_worst['Length'][i] = ...` was
# chained assignment, which writes to a temporary object under pandas
# Copy-on-Write and leaves the column unparsed.
worst_runtime_parts = (
    df_worst['Length']
    .astype(str)
    .str.strip()
    .str.extract(r'^(?:(?P<hours>\d+)h)?\s*(?:(?P<minutes>\d+)m)?$')
)
worst_runtime_hours = pd.to_numeric(worst_runtime_parts['hours'])
worst_runtime_minutes = pd.to_numeric(worst_runtime_parts['minutes'])

# A row with neither an hour nor a minute component could not be parsed.
worst_unparsed_runtime = worst_runtime_hours.isna() & worst_runtime_minutes.isna()
if worst_unparsed_runtime.any():
    raise ValueError(
        f"Unrecognised runtime value(s): {df_worst.loc[worst_unparsed_runtime, 'Length'].unique().tolist()}"
    )

df_worst['Length'] = (
    worst_runtime_hours.fillna(0) * 60 + worst_runtime_minutes.fillna(0)
).astype(float)
df_worst

In [None]:
# Selector for the star-rating <span>; its stripped text carries the score
# followed by a parenthesised vote count, which later cells parse separately.
star_rating_attrs = {'class': 'ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating'}

# Top chart: keep the combined text, then drop the "(...)" part to get the score.
user_rating_comb_best = [
    tag.get_text(strip=True)
    for tag in soup_best.find_all('span', attrs=star_rating_attrs)
]
user_rating_best = []
for combined_text in user_rating_comb_best:
    user_rating_best.append(float(re.sub(r'\(.+\)', '', combined_text)))

df['ReviewRating'] = user_rating_best
display(df)

# Bottom chart: same extraction.
user_rating_comb_worst = [
    tag.get_text(strip=True)
    for tag in soup_worst.find_all('span', attrs=star_rating_attrs)
]
user_rating_worst = []
for combined_text in user_rating_comb_worst:
    user_rating_worst.append(float(re.sub(r'\(.+\)', '', combined_text)))

df_worst['ReviewRating'] = user_rating_worst
display(df_worst)

In [None]:
def _review_count(rating_text):
    """Return the parenthesised vote count in ``rating_text`` as a float.

    The count is the number right after the opening parenthesis, optionally
    followed by ``K`` (thousands) or ``M`` (millions); a count without a suffix
    is taken literally.

    Raises:
        ValueError: if no parenthesised number is present.
    """
    match = re.search(r'\((\d+(?:\.\d+)?)([KM])?', rating_text)
    if match is None:
        raise ValueError(f'No vote count found in {rating_text!r}')
    number, suffix = match.groups()
    if suffix == 'M':
        return float(number) * 1000000
    if suffix == 'K':
        return float(number) * 1000
    # No suffix: previously such values were left as strings, producing a
    # mixed-type column that breaks the numeric analysis further down.
    return float(number)


rating_count_best = [_review_count(text) for text in user_rating_comb_best]
rating_count_worst = [_review_count(text) for text in user_rating_comb_worst]

df['numReviews'] = rating_count_best
df_worst['numReviews'] = rating_count_worst
display(df)
display(df_worst)

## Review rating vs. movie length

In [None]:
# Scatter plus linear fit of runtime against review score for the top chart.
p_best = sns.regplot(data=df, x='Length', y='ReviewRating')

p_best.set(xlabel='Movie Length (minutes)', ylabel='Review Rating (1-10)')
p_best.set_title('Top 250 Movie Length vs. Review Rating')

In [None]:
# Scatter plus linear fit of runtime against review score for the bottom chart.
p_worst = sns.regplot(data=df_worst, x='Length', y='ReviewRating')

p_worst.set(xlabel='Movie Length (minutes)', ylabel='Review Rating (1-10)')
p_worst.set_title('100 Lowest Rated Movie Length vs. Review Rating')

In [None]:
# Pearson correlation between runtime and review score, one line per chart.
for chart_label, chart_frame in (("Best:", df), ("Worst:", df_worst)):
    print(chart_label, stats.pearsonr(chart_frame.Length, chart_frame.ReviewRating))

## Average movie length and review rating for top 250 vs bottom 100

In [None]:
# Arithmetic means computed by hand (built-in sum divided by the row count),
# shown as rating first and runtime second for each chart.
print("Best")
best_scores, best_runtimes = df['ReviewRating'], df['Length']
best_average_rating = sum(best_scores) / len(best_scores)
display(best_average_rating)
best_average_length = sum(best_runtimes) / len(best_runtimes)
display(best_average_length)

print("Worst")
worst_scores, worst_runtimes = df_worst['ReviewRating'], df_worst['Length']
worst_average_rating = sum(worst_scores) / len(worst_scores)
display(worst_average_rating)
worst_average_length = sum(worst_runtimes) / len(worst_runtimes)
display(worst_average_length)

## Mean movie length and review rating for top 250 vs bottom 100

In [None]:
# Same two statistics via pandas' Series.mean(): rating first, runtime second.
for chart_label, chart_frame in (("Best", df), ("Worst", df_worst)):
    print(chart_label)
    for column in ('ReviewRating', 'Length'):
        display(chart_frame[column].mean())

In [None]:
# Independent two-sample t-test on runtimes: top chart against bottom chart.
length_ttest = stats.ttest_ind(df['Length'], df_worst['Length'])
display(length_ttest)

In [None]:
# Overlay the runtime densities of both charts on a single set of axes.
fig, ax = plt.subplots()
for chart_frame, legend_label in ((df, 'Best Movies'), (df_worst, 'Worst Movies')):
    sns.kdeplot(chart_frame['Length'], label=legend_label, ax=ax)
fig.suptitle('Density of Movie lengths for Best and Worst Movies')
ax.legend()

## Number of reviews vs. average rating

In [None]:
# Plain scatter of review score against number of reviews. Explicit axes keep
# each title attached to the intended figure; previously this plot had none.
fig_scatter, ax_scatter = plt.subplots()
sns.scatterplot(data=df, x='ReviewRating', y='numReviews', ax=ax_scatter)
ax_scatter.set_title('Average Rating vs number of reviews')

# Same data with a fitted regression line.
fig_reg, ax_reg = plt.subplots()
sns.regplot(data=df, x='ReviewRating', y='numReviews', ax=ax_reg)
ax_reg.set_title('Average Rating vs number of reviews')

# `df` holds only the top-chart movies, so the label says so. The stray
# trailing plt.figure(), which rendered an empty figure, is gone.
print('top 250 movies: ', stats.pearsonr(x=df['ReviewRating'], y=df['numReviews']))

## Does MPAA rating affect the reviews?

In [None]:
# Inspect the raw certificate labels before grouping them.
# NOTE(review): an earlier note claimed unrated films were not made in the US;
# nothing in this notebook verifies that -- treat it as an assumption.
print(df['Rating'].unique())
print(df['Rating'].value_counts().keys())

# Collapse the individual certificates into three coarse groups.
mapping = {'R':'Restricted', 'X':'Restricted', '18+': 'Restricted', 'TV-MA':'Restricted', 'G':'Unrestricted', 
           'PG':'Unrestricted', 'TV-PG':'Unrestricted', 'PG-13':'Unrestricted', 'Unrated':'Other', 
           'Passed':'Other', 'Approved':'Other', 'GP':'Other', 'Not Rated':'Other'}
# map() + fillna(): a certificate that is missing from `mapping` lands in
# 'Other' instead of leaking through as its own ad-hoc category.
df['Rating_cat'] = df['Rating'].map(mapping).fillna('Other')

# How many movies fall into each group.
sns.countplot(data=df, x='Rating_cat', hue='Rating_cat')
plt.title('Rating category counts')

# Review-score density per group, overlaid on one new figure.
plt.figure()
for category in ('Restricted', 'Unrestricted', 'Other'):
    sns.kdeplot(data=df[df['Rating_cat'] == category], x='ReviewRating', label=category)
plt.legend()
plt.title('Rating category densities')

age_restricted = df[df['Rating_cat'] == 'Restricted']['ReviewRating']
not_age_restricted = df[df['Rating_cat'] == 'Unrestricted']['ReviewRating']
# Kept under its own name: it used to overwrite `not_age_restricted`, so the
# test below silently compared Restricted against Other.
other_rated = df[df['Rating_cat'] == 'Other']['ReviewRating']

# Restricted vs. Unrestricted review scores.
stats.ttest_ind(age_restricted, not_age_restricted, nan_policy='omit')