In [39]:
import os
import pandas as pd
import requests
import numpy as np
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC


In [40]:
# Read in list of movies with low Tomatometer score
rotten_df = pd.read_csv ('rotten_df.csv',index_col=[0])

# Display full column width (for the links)
pd.set_option('display.max_colwidth', None)

In [41]:
# Remove non-numeric info from Year and N_Tomatometer columns
rotten_df['Year'] = rotten_df['Year'].str[:4]
rotten_df['N_Tomatometer'] = rotten_df['N_Tomatometer'].str[:-8]
rotten_df

Unnamed: 0,Tomatoes URL,Title,Tomatometer,Year,N_Tomatometer
0,https://www.rottentomatoes.com/m/the_second,The Second,63%,2018,8
1,https://www.rottentomatoes.com/m/needle_in_a_timestack,Needle in a Timestack,34%,2021,48
2,https://www.rottentomatoes.com/m/adventures_of_a_mathematician,Adventures of a Mathematician,50%,2020,10
3,https://www.rottentomatoes.com/m/de_gaulle,De Gaulle,40%,2020,10
4,https://www.rottentomatoes.com/m/the_estate_2021,The Estate,33%,2020,9
...,...,...,...,...,...
9493,https://www.rottentomatoes.com/m/maniac_cop,Maniac Cop,50%,1988,15
9494,https://www.rottentomatoes.com/m/devils_own,The Devil's Own,35%,1997,40
9495,https://www.rottentomatoes.com/m/single_white_female,Single White Female,53%,1992,49
9496,https://www.rottentomatoes.com/m/high_school_high,High School High,19%,1996,16


In [42]:
rotten_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9498 entries, 0 to 9497
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tomatoes URL   9498 non-null   object
 1   Title          9498 non-null   object
 2   Tomatometer    9498 non-null   object
 3   Year           9498 non-null   object
 4   N_Tomatometer  9498 non-null   object
dtypes: object(5)
memory usage: 445.2+ KB


In [43]:
# Datasets found on IMDB site contain data https://www.imdb.com/interfaces/

# Read in tsv file to match movie titles to IMDB unique ID (tconst)
basics_tsv = pd.read_csv('title_basics.tsv', delimiter="\t")

# Data on IMDB ratings / number of votes
ratings_tsv = pd.read_csv('title_ratings.tsv', delimiter="\t")

# # Data on movie directors / writers
# crew_tsv = pd.read_csv('title_crew.tsv', delimiter="\t")

# # Data on movie cast / crew
# principals_tsv = pd.read_csv('title_principals.tsv', delimiter="\t")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [44]:
basics_tsv['titleType'].value_counts()

tvEpisode       6298023
short            844347
movie            595347
video            255075
tvSeries         217091
tvMovie          133837
tvMiniSeries      40886
tvSpecial         35033
videoGame         29806
tvShort           10382
tvPilot               2
Name: titleType, dtype: int64

In [45]:
## Clean up basics_tsv file

# Select for only movies and videos
basics_tsv = basics_tsv[(basics_tsv['titleType']=='movie') | (basics_tsv['titleType']=='video')]
# Drop unnecessary columns
IMDB_ID_df = basics_tsv.drop(columns=['titleType','originalTitle', 'endYear'])
# Rename title and year column
IMDB_ID_df.rename(columns={"primaryTitle": "Title", "startYear": "Year"},inplace=True)

# Convert year to int
# IMDB_ID_df['startYear']=pd.to_numeric(IMDB_ID_df['startYear'], errors='coerce')
# IMDB_ID_df['startYear'] = IMDB_ID_df['startYear'].astype(str).astype(int,errors='ignore')

IMDB_ID_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 850422 entries, 498 to 8459779
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          850422 non-null  object
 1   Title           850422 non-null  object
 2   isAdult         850422 non-null  object
 3   Year            850422 non-null  object
 4   runtimeMinutes  850422 non-null  object
 5   genres          850422 non-null  object
dtypes: object(6)
memory usage: 45.4+ MB


In [73]:
ratings_tsv.sample(10)

Unnamed: 0,tconst,averageRating,numVotes
220807,tt0398438,8.0,138
848414,tt2988442,8.0,187
484865,tt1079781,6.0,5
797715,tt2319530,5.2,21
231396,tt0424053,7.0,66
1038419,tt6126480,7.7,6
1189650,tt9861480,5.8,99
309360,tt0602303,7.9,124
234530,tt0430949,8.1,10
109624,tt0158962,6.8,358


In [48]:
# Merge Rotten Tomatoes df with IMDB_ID df on two columns: Year and Title
merged_df = rotten_df.merge(IMDB_ID_df, on=["Year", "Title"])

In [49]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8136 entries, 0 to 8135
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Tomatoes URL    8136 non-null   object
 1   Title           8136 non-null   object
 2   Tomatometer     8136 non-null   object
 3   Year            8136 non-null   object
 4   N_Tomatometer   8136 non-null   object
 5   tconst          8136 non-null   object
 6   isAdult         8136 non-null   object
 7   runtimeMinutes  8136 non-null   object
 8   genres          8136 non-null   object
dtypes: object(9)
memory usage: 635.6+ KB


In [51]:
# Merge Rotten Tomatoes df with ratings_tsv df on tconst column
merged_df = merged_df.merge(ratings_tsv, on=["tconst"])

In [52]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7989 entries, 0 to 7988
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Tomatoes URL    7989 non-null   object 
 1   Title           7989 non-null   object 
 2   Tomatometer     7989 non-null   object 
 3   Year            7989 non-null   object 
 4   N_Tomatometer   7989 non-null   object 
 5   tconst          7989 non-null   object 
 6   isAdult         7989 non-null   object 
 7   runtimeMinutes  7989 non-null   object 
 8   genres          7989 non-null   object 
 9   averageRating   7989 non-null   float64
 10  numVotes        7989 non-null   int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 749.0+ KB


In [53]:
merged_df.head(50)

Unnamed: 0,Tomatoes URL,Title,Tomatometer,Year,N_Tomatometer,tconst,isAdult,runtimeMinutes,genres,averageRating,numVotes
0,https://www.rottentomatoes.com/m/the_second,The Second,63%,2018,8,tt5636922,0,94,"Drama,Mystery,Thriller",4.9,317
1,https://www.rottentomatoes.com/m/needle_in_a_timestack,Needle in a Timestack,34%,2021,48,tt7099280,0,111,"Drama,Fantasy,Romance",4.3,550
2,https://www.rottentomatoes.com/m/adventures_of_a_mathematician,Adventures of a Mathematician,50%,2020,10,tt6875374,0,102,"Biography,Drama",5.5,238
3,https://www.rottentomatoes.com/m/de_gaulle,De Gaulle,40%,2020,10,tt10880402,0,108,"Biography,Drama,History",6.0,1207
4,https://www.rottentomatoes.com/m/the_estate_2021,The Estate,33%,2020,9,tt10740928,0,85,"Comedy,Thriller",4.4,204
5,https://www.rottentomatoes.com/m/the_addams_family_2,The Addams Family 2,30%,2021,101,tt11125620,0,93,"Adventure,Animation,Comedy",5.4,7135
6,https://www.rottentomatoes.com/m/mayday_2021,Mayday,50%,2021,62,tt11271800,0,100,"Drama,Fantasy,Mystery",4.2,662
7,https://www.rottentomatoes.com/m/the_blazing_world_2021,The Blazing World,57%,2021,47,tt11378876,0,101,"Adventure,Comedy,Fantasy",4.5,391
8,https://www.rottentomatoes.com/m/seance_2021,Seance,47%,2021,62,tt11188624,0,92,"Horror,Mystery",5.3,2291
9,https://www.rottentomatoes.com/m/mark_mary_and_some_other_people,"Mark, Mary & Some Other People",56%,2021,25,tt11832624,0,90,Comedy,5.6,142


In [54]:
merged_df.to_csv('merged_df.csv')

### Search and scrape Box Office Mojo for gross revenue values

In [55]:
merged_df=pd.read_csv('merged_df.csv')

In [56]:
# Create new column with full Box Office Mojo url
merged_df['BOM url'] = "https://www.boxofficemojo.com/title/" + merged_df['tconst']+"/"
merged_df

Unnamed: 0.1,Unnamed: 0,Tomatoes URL,Title,Tomatometer,Year,N_Tomatometer,tconst,isAdult,runtimeMinutes,genres,averageRating,numVotes,BOM url
0,0,https://www.rottentomatoes.com/m/the_second,The Second,63%,2018,8,tt5636922,0,94,"Drama,Mystery,Thriller",4.9,317,https://www.boxofficemojo.com/title/tt5636922/
1,1,https://www.rottentomatoes.com/m/needle_in_a_timestack,Needle in a Timestack,34%,2021,48,tt7099280,0,111,"Drama,Fantasy,Romance",4.3,550,https://www.boxofficemojo.com/title/tt7099280/
2,2,https://www.rottentomatoes.com/m/adventures_of_a_mathematician,Adventures of a Mathematician,50%,2020,10,tt6875374,0,102,"Biography,Drama",5.5,238,https://www.boxofficemojo.com/title/tt6875374/
3,3,https://www.rottentomatoes.com/m/de_gaulle,De Gaulle,40%,2020,10,tt10880402,0,108,"Biography,Drama,History",6.0,1207,https://www.boxofficemojo.com/title/tt10880402/
4,4,https://www.rottentomatoes.com/m/the_estate_2021,The Estate,33%,2020,9,tt10740928,0,85,"Comedy,Thriller",4.4,204,https://www.boxofficemojo.com/title/tt10740928/
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7984,7984,https://www.rottentomatoes.com/m/maniac_cop,Maniac Cop,50%,1988,15,tt0095583,0,85,"Action,Crime,Horror",6.1,16065,https://www.boxofficemojo.com/title/tt0095583/
7985,7985,https://www.rottentomatoes.com/m/devils_own,The Devil's Own,35%,1997,40,tt0118972,0,111,"Action,Crime,Drama",6.2,63336,https://www.boxofficemojo.com/title/tt0118972/
7986,7986,https://www.rottentomatoes.com/m/single_white_female,Single White Female,53%,1992,49,tt0105414,0,107,"Drama,Thriller",6.4,35357,https://www.boxofficemojo.com/title/tt0105414/
7987,7987,https://www.rottentomatoes.com/m/high_school_high,High School High,19%,1996,16,tt0116531,0,86,"Comedy,Crime,Romance",5.5,13201,https://www.boxofficemojo.com/title/tt0116531/


In [57]:
# Prep for incoming IMDB data
merged_df['domestic'] = None
merged_df['international'] = None
merged_df['worldwide'] = None
merged_df['domestic_distributor'] = None
merged_df['domestic_opening'] = None
merged_df['mpaa'] = None
merged_df['Genres'] = None

merged_df

Unnamed: 0.1,Unnamed: 0,Tomatoes URL,Title,Tomatometer,Year,N_Tomatometer,tconst,isAdult,runtimeMinutes,genres,averageRating,numVotes,BOM url,domestic,international,worldwide,domestic_distributor,domestic_opening,mpaa,Genres
0,0,https://www.rottentomatoes.com/m/the_second,The Second,63%,2018,8,tt5636922,0,94,"Drama,Mystery,Thriller",4.9,317,https://www.boxofficemojo.com/title/tt5636922/,,,,,,,
1,1,https://www.rottentomatoes.com/m/needle_in_a_timestack,Needle in a Timestack,34%,2021,48,tt7099280,0,111,"Drama,Fantasy,Romance",4.3,550,https://www.boxofficemojo.com/title/tt7099280/,,,,,,,
2,2,https://www.rottentomatoes.com/m/adventures_of_a_mathematician,Adventures of a Mathematician,50%,2020,10,tt6875374,0,102,"Biography,Drama",5.5,238,https://www.boxofficemojo.com/title/tt6875374/,,,,,,,
3,3,https://www.rottentomatoes.com/m/de_gaulle,De Gaulle,40%,2020,10,tt10880402,0,108,"Biography,Drama,History",6.0,1207,https://www.boxofficemojo.com/title/tt10880402/,,,,,,,
4,4,https://www.rottentomatoes.com/m/the_estate_2021,The Estate,33%,2020,9,tt10740928,0,85,"Comedy,Thriller",4.4,204,https://www.boxofficemojo.com/title/tt10740928/,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7984,7984,https://www.rottentomatoes.com/m/maniac_cop,Maniac Cop,50%,1988,15,tt0095583,0,85,"Action,Crime,Horror",6.1,16065,https://www.boxofficemojo.com/title/tt0095583/,,,,,,,
7985,7985,https://www.rottentomatoes.com/m/devils_own,The Devil's Own,35%,1997,40,tt0118972,0,111,"Action,Crime,Drama",6.2,63336,https://www.boxofficemojo.com/title/tt0118972/,,,,,,,
7986,7986,https://www.rottentomatoes.com/m/single_white_female,Single White Female,53%,1992,49,tt0105414,0,107,"Drama,Thriller",6.4,35357,https://www.boxofficemojo.com/title/tt0105414/,,,,,,,
7987,7987,https://www.rottentomatoes.com/m/high_school_high,High School High,19%,1996,16,tt0116531,0,86,"Comedy,Crime,Romance",5.5,13201,https://www.boxofficemojo.com/title/tt0116531/,,,,,,,


In [70]:
short_df = merged_df[7000:]
short_df.reset_index(inplace=True,drop=True)
for index, row in short_df.iterrows():
    url = short_df.loc[index,'BOM url'] # Retrieves the IMDB url to search for
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml') # Imports the HTML of the webpage into python 
    time.sleep(3) 
    
    revenue_box = soup.find('div',class_='a-section a-spacing-none mojo-performance-summary-table')
    
    try:
        revenue_box.find_all('span',class_=['money','percent zero'])
        domestic=revenue_box.find_all('span',class_=['money','percent zero'])[0].text      
        international=revenue_box.find_all('span',class_=['money','percent zero'])[1].text
        worldwide=revenue_box.find_all('span',class_=['money','percent zero'])[2].text
    except:
        domestic = np.nan
        international = np.nan
        worldwide = np.nan
    short_df['domestic'][index]=domestic
    short_df['international'][index]=international
    short_df['worldwide'][index]=worldwide
    try:
        domestic_distributor = soup.find(string = 'Domestic Distributor').findNext().text
    except:
        domestic_distributor = np.nan
    short_df['domestic_distributor'][index]=domestic_distributor
    try:
        domestic_opening = soup.find(string = 'Domestic Opening').findNext().text
    except:
        domestic_opening = np.nan
    short_df['domestic_opening'][index]=domestic_opening
    try:
        mpaa = soup.find(string = 'MPAA').findNext().text
    except:
        mpaa = np.nan
    short_df['mpaa'][index]=mpaa
    try:
        genres = soup.find(string = 'Genres').findNext().text
    except:
        genres = np.nan
    short_df['Genres'][index]=genres


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  short_df['domestic'][index]=domestic
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  short_df['international'][index]=international
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  short_df['worldwide'][index]=worldwide
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  short_df['domestic_distributor'][index]=domes

In [71]:
short_df.to_csv('full_7000_7990.csv')

In [66]:
short_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            2000 non-null   int64  
 1   Tomatoes URL          2000 non-null   object 
 2   Title                 2000 non-null   object 
 3   Tomatometer           2000 non-null   object 
 4   Year                  2000 non-null   int64  
 5   N_Tomatometer         2000 non-null   int64  
 6   tconst                2000 non-null   object 
 7   isAdult               2000 non-null   int64  
 8   runtimeMinutes        2000 non-null   object 
 9   genres                2000 non-null   object 
 10  averageRating         2000 non-null   float64
 11  numVotes              2000 non-null   int64  
 12  BOM url               2000 non-null   object 
 13  domestic              1673 non-null   object 
 14  international         1673 non-null   object 
 15  worldwide            

In [28]:
short_df = merged_df[3000:4000]
short_df.reset_index(inplace=True,drop=True)
short_df.iloc[633]

IndexError: single positional indexer is out-of-bounds