In [1]:
import io
import time

import numpy as np
import pandas as pd
import requests

from config import omdb_key
from config import tmdb_key


In [2]:
def tmdb_request(movie_id, timeout=10):
    """Fetch one movie's details from The Movie Database (TMDB) as parsed JSON.

    Parameters
    ----------
    movie_id : str or int
        Identifier placed in the ``/3/movie/<id>`` path. Callers in this
        notebook pass the 'imdbID' value taken from an OMDb response.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so a stalled connection cannot hang a long scrape.

    Returns
    -------
    dict
        The decoded JSON body of the response.
    """
    url = "https://api.themoviedb.org/3/movie/" + str(movie_id)
    # Let requests build and escape the query string instead of gluing the
    # API key onto the URL by hand.
    query = {"language": "en-US", "api_key": tmdb_key}
    response = requests.get(url, params=query, timeout=timeout)

    # The HTTP status is deliberately not raised here: df_with_ratings treats
    # a payload that contains a "status_code" key as a failed lookup, so the
    # JSON body must be returned even for error responses.
    return response.json()
    
    
    

In [3]:
def omdb_request(movie_title, timeout=10):
    """Look a movie up by title on OMDb and return the parsed JSON response.

    Parameters
    ----------
    movie_title : str
        Title to search for. Runs of whitespace are collapsed to one space.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error.

    Returns
    -------
    dict
        The decoded JSON body of the response.
    """
    normalized_title = ' '.join(movie_title.split())
    # Passing the title through `params` URL-encodes it, so characters such as
    # '&', '#' or '?' inside a title no longer cut the query short. Spaces are
    # still sent as '+', exactly like the hand-built URL did.
    query = {'t': normalized_title, 'apikey': omdb_key}
    r = requests.get('http://www.omdbapi.com/', params=query, timeout=timeout)
    return r.json()

In [1]:
def _gross_to_int(value):
    """Convert a single money cell such as '$1,234.5' or '$500k' to an int.

    Rules (unchanged from the original single-decimal behaviour, but now
    correct for any number of decimal digits):

    * float cells (including NaN and numpy floats) and None -> 0
    * a trailing 'k' means thousands: '$1.5k' -> 1500, '$500k' -> 500000
    * a value with a decimal point and no 'k' is read as millions:
      '$1,234.5' -> 1234500000
    * a plain whole number without 'k' is taken literally: '$1,234' -> 1234
    """
    # isinstance (rather than an exact type comparison) also matches
    # numpy.float64, which is what an all-float DataFrame hands back.
    if value is None or isinstance(value, float):
        return 0

    text = str(value).split('$')[-1].replace(',', '').strip()
    if not text:
        return 0

    if text.endswith('k'):
        scale_digits = 3
        text = text[:-1]
    elif '.' in text:
        scale_digits = 6
    else:
        scale_digits = 0

    whole, _, fraction = text.partition('.')
    # Shift the decimal point by padding/truncating the fractional digits to
    # the scale; this stays in exact integer arithmetic (no float rounding).
    fraction = fraction.ljust(scale_digits, '0')[:scale_digits]
    return int((whole or '0') + fraction)


def make_int(df_name, column_name):
    """Replace a money-formatted column with whole-number integers, in place.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    column_name : str
        Name of the column holding strings like '$1,234.5' or '$500k'.

    Returns
    -------
    None
        The frame is mutated; nothing is returned.

    Raises
    ------
    ValueError
        If a non-empty string cell cannot be parsed as a number.
    """
    df_name[column_name] = [_gross_to_int(cell) for cell in df_name[column_name]]


In [5]:
def bom_db(year, timeout=30):
    """Scrape the Box Office Mojo worldwide yearly chart into a DataFrame.

    Parameters
    ----------
    year : int or str
        Year inserted into the chart URL and stored in the 'Year' column.
    timeout : float, optional
        Seconds to wait for the page before ``requests`` raises a timeout
        error.

    Returns
    -------
    pandas.DataFrame
        Columns 'Title', 'Studio', 'Worldwide_Gross', 'Domestic_Gross',
        'Domestic%', 'Overseas_Gross', 'Overseas%' and 'Year', with the three
        gross columns converted to integers by ``make_int``.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    """
    url = 'https://www.boxofficemojo.com/yearly/chart/?view2=worldwide&yr=' + str(year)+'&p=.htm'
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing table-parsing error
    # when the site answers with an error page.
    response.raise_for_status()

    # read_html expects a path, URL or file-like object; wrapping the bytes
    # keeps this working on pandas versions that reject literal HTML input.
    tables = pd.read_html(io.BytesIO(response.content))

    # The chart is taken to be the third table on the page (hard-coded; this
    # depends on the page layout).
    df = tables[2]

    df.columns = ['Rank', 'Title', 'Studio', 'Worldwide_Gross', 'Domestic_Gross',
                  'Domestic%', 'Overseas_Gross', 'Overseas%']
    df['Year'] = year

    # Rows 0 and 1 are discarded (presumably header rows — verify against the
    # page), the index is renumbered from 0 and the rank column is dropped.
    df = df.drop([0, 1]).reset_index(drop=True).drop(columns=['Rank'])

    for gross_column in ('Worldwide_Gross', 'Domestic_Gross', 'Overseas_Gross'):
        make_int(df, gross_column)
    return df

In [6]:
def _parse_ratings(ratings):
    """Map OMDb's 'Ratings' list to the three numeric score columns."""
    scores = {
        'imdb_rating': np.nan,
        'rotten_tomatoes_rating': np.nan,
        'meta_score': np.nan,
    }
    # source name -> (output column, separator that ends the numeric part)
    layout = {
        'Internet Movie Database': ('imdb_rating', '/'),
        'Rotten Tomatoes': ('rotten_tomatoes_rating', '%'),
        'Metacritic': ('meta_score', '/'),
    }
    for rating in ratings:
        if rating['Source'] not in layout:
            continue
        column, separator = layout[rating['Source']]
        try:
            scores[column] = float(rating['Value'].split(separator)[0])
        except ValueError:
            # Non-numeric value: leave this score as missing.
            scores[column] = np.nan
    return scores


def _release_day_month(released):
    """Return the first two tokens of OMDb's 'Released' text, or NaN if fewer."""
    tokens = released.split()
    if len(tokens) < 2:
        return np.nan
    return tokens[0] + ' ' + tokens[1]


def _lookup_budget(imdb_id):
    """Return the TMDB budget for an IMDb id, or NaN when TMDB reports an error."""
    movie_tmdb_info = tmdb_request(imdb_id)
    if "status_code" in movie_tmdb_info:
        return np.nan
    return int(movie_tmdb_info['budget'])


def _fill_movie_record(record, title):
    """Fill `record` in place with OMDb/TMDB data for `title`."""
    title_data = omdb_request(title)
    if title_data['Response'] == 'False':
        # OMDb does not know the title: every field stays NaN.
        return

    record.update(_parse_ratings(title_data.get('Ratings', [])))
    record['actors'] = title_data['Actors']
    record['rating'] = title_data['Rated']
    record['genre'] = title_data['Genre']
    record['director'] = title_data['Director']
    record['release_date'] = _release_day_month(title_data['Released'])
    # The id is always read from *this* movie's response, so a budget can
    # never be looked up with an id left over from a previous movie.
    record['budget'] = _lookup_budget(title_data['imdbID'])


def df_with_ratings(year):
    """Build the yearly box-office frame and add ratings, credits and budget.

    For every title returned by ``bom_db(year)`` this queries OMDb (ratings,
    cast, certificate, genre, director, release date) and then TMDB (budget).
    Any value that is unavailable is stored as NaN, regardless of how many
    rating sources OMDb returned. A failure while processing one title is
    reported and the loop continues with the next title; whatever was
    collected for that title before the failure is kept.

    Parameters
    ----------
    year : int or str
        Year passed on to ``bom_db``.

    Returns
    -------
    pandas.DataFrame
        The ``bom_db`` frame with the extra columns 'imdb_rating',
        'rotten_tomatoes_rating', 'meta_score', 'actors', 'rating', 'genre',
        'director', 'release_date' and 'budget'. All three score columns are
        numeric.
    """
    extra_columns = ['imdb_rating', 'rotten_tomatoes_rating', 'meta_score',
                     'actors', 'rating', 'genre', 'director', 'release_date',
                     'budget']
    delay_seconds = 0.25  # pause between titles to stay gentle on the APIs

    df = bom_db(year)  # Creates dataframe using previous function
    titles = df['Title'].tolist()

    records = []
    for i, title in enumerate(titles):
        print(title)
        record = dict.fromkeys(extra_columns, np.nan)
        try:
            _fill_movie_record(record, title)
        except Exception as err:
            # Best effort: one bad title must not abort the whole year, but
            # the failure is reported instead of being silently swallowed.
            # KeyboardInterrupt is not caught, so the run can still be stopped.
            print('Lookup failed for %r: %r' % (title, err))
        records.append(record)

        time.sleep(delay_seconds)
        # Rough number of seconds of sleeping still ahead for this year.
        print((0.25*len(titles))-(i*0.25))

    # Assign whole columns at once rather than growing the frame cell by cell.
    for column in extra_columns:
        df[column] = [record[column] for record in records]
    return df

In [7]:
def super_df(startyear, endyear):
    """Combine the rated box-office frames for every year in a range.

    Before the slow part starts, the total sleep time (0.25 s per title) is
    estimated by fetching each year's chart once and printed. Each year is
    then processed with ``df_with_ratings`` and its number printed when done.

    Parameters
    ----------
    startyear : int
        First year to include.
    endyear : int
        Last year to include (inclusive).

    Returns
    -------
    pandas.DataFrame
        All yearly frames stacked with a fresh 0..n-1 index; an empty frame
        when the range contains no years.
    """
    years = range(startyear, endyear + 1)

    # Named so it does not shadow the imported `time` module.
    estimated_seconds = 0
    for year in years:
        estimated_seconds += 0.25 * len(bom_db(year))
    print(estimated_seconds)

    yearly_frames = []
    for year in years:
        yearly_frames.append(df_with_ratings(year))
        print(year)

    # pd.concat raises on an empty list, so keep returning an empty frame.
    if not yearly_frames:
        return pd.DataFrame()

    # One concat at the end replaces the removed, quadratic DataFrame.append.
    return pd.concat(yearly_frames).reset_index(drop=True)