# Enhancing our data with the OMDb API

In [18]:
# Let's first import all the necessary libraries 

# Standard library
import os
import re
import warnings

# For data manipulation
import numpy as np
import pandas as pd
import scipy

# For API
import requests

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress the DeprecationWarning for bar graph hues
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [6]:
# Loading our CMU movies pickle (the path is relative to the notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project.
df_cmu_movies = pd.read_pickle("../../pickles/cmu_imdb_tmdb_merged.pkl")

# Displaying the first few rows
df_cmu_movies.head()

Unnamed: 0,Wikipedia Movie ID,Freebase Movie ID,Movie Name,Release Date,Box Office Revenue,Runtime,Language Freebase ID,Language Name,Country Freebase ID,Country Name,...,vote_count,revenue,budget,popularity,production_companies,director,writers,producers,imdb_rating,imdb_votes
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[/m/02h40lc],[English Language],[/m/09c7w0],[United States of America],...,1071.0,14010832.0,28000000.0,13.048,"Animationwerks, Screen Gems, Storm King Produc...",John Carpenter,"John Carpenter, Larry Sulkis",Sandy King,4.9,58900.0
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[/m/02h40lc],[English Language],[/m/09c7w0],[United States of America],...,,,,,,,,,,
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,[/m/05f_3],[Norwegian Language],[/m/05b4w],[Norway],...,1.0,0.0,0.0,1.372,"Filmeffekt AS, Norsk Film",Sølve Skagen,"Gunnar Staalesen, Sølve Skagen",Dag Alveberg,5.6,42.0
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,[/m/02h40lc],[English Language],[/m/07ssc],[United Kingdom],...,68.0,0.0,0.0,8.048,Mrs. White's Productions,Donald Cammell,"China Kong, Laurence Klavan, Donald Cammell, A...","Sue Baden-Powell, Elliott Kastner, Brad Wyman,...",6.1,3090.0
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,[/m/04306rv],[German Language],[/m/0345h],[Germany],...,13.0,0.0,0.0,2.454,Dieter Geissler Filmproduktion,Robert van Ackeren,"Robert van Ackeren, Catharina Zwerenz",Robert van Ackeren,5.9,647.0


In [9]:
# Listing every column of the loaded frame; the enrichment function below
# reads 'IMDb_ID', 'Box Office Revenue', 'Runtime' and 'vote_average'.
df_cmu_movies.columns

Index(['Wikipedia Movie ID', 'Freebase Movie ID', 'Movie Name', 'Release Date',
       'Box Office Revenue', 'Runtime', 'Language Freebase ID',
       'Language Name', 'Country Freebase ID', 'Country Name',
       'Genre Freebase ID', 'Genre Name', 'IMDb_ID', 'vote_average',
       'vote_count', 'revenue', 'budget', 'popularity', 'production_companies',
       'director', 'writers', 'producers', 'imdb_rating', 'imdb_votes'],
      dtype='object')

In [12]:
# Peeking at the revenue column to see how many of the first rows are missing a value
df_cmu_movies['Box Office Revenue'].head()

0    14010832.0
1           NaN
2           NaN
3           NaN
4           NaN
Name: Box Office Revenue, dtype: float64

In [None]:
# Defining the OMDb API URL and API Key
OMDB_API_URL = "https://www.omdbapi.com/"

# Prefer a key supplied through the OMDB_API_KEY environment variable so the
# credential does not have to live in the notebook. An unset or empty variable
# falls back to the literal that was previously hardcoded here.
# SECURITY: that fallback is committed together with the notebook; it is kept
# only so existing runs keep working. Rotate the key and drop the fallback.
API_KEY = os.environ.get("OMDB_API_KEY") or "dc87251a"

# Cleaning Box Office values
def clean_box_office(value):
    """Convert a box-office string such as ``"$8,709,640"`` to a float.

    Parameters
    ----------
    value : str or any
        A string with optional ``$`` signs and thousands separators, or any
        non-string value.

    Returns
    -------
    float or any
        The parsed amount for strings; ``np.nan`` when the string is empty,
        whitespace only, or the ``"N/A"`` missing-value marker. Non-string
        inputs are returned unchanged.

    Raises
    ------
    ValueError
        If the string is not empty / ``"N/A"`` but still cannot be parsed as
        a number after removing ``$`` and ``,``.
    """
    if not isinstance(value, str):
        return value

    cleaned = value.replace('$', '').replace(',', '').strip()
    # Treat the missing-value marker and blank strings as "no data" instead of
    # letting float() raise on them.
    if not cleaned or cleaned.upper() == 'N/A':
        return np.nan
    return float(cleaned)

# Updating movie data from OMDb API
def _parse_runtime_minutes(runtime):
    """Convert a runtime string to whole minutes, or ``np.nan`` if it has no number.

    Understands the plain ``"98 min"`` form as well as hour/minute forms such
    as ``"1 h 38 min"``; thousands separators are ignored.
    """
    text = str(runtime).replace(',', '').lower()
    hours = re.search(r'(\d+)\s*h', text)
    minutes = re.search(r'(\d+)\s*m', text)
    if hours or minutes:
        total = 0
        if hours:
            total += 60 * int(hours.group(1))
        if minutes:
            total += int(minutes.group(1))
        return total
    # No unit found: fall back to the first bare number, if any.
    bare = re.search(r'\d+', text)
    return int(bare.group()) if bare else np.nan


def _parse_or_nan(parser, raw, field, imdb_id):
    """Apply ``parser`` to ``raw``; on a parse error, report it and return ``np.nan``."""
    try:
        return parser(raw)
    except (TypeError, ValueError):
        print(f"Could not parse {field} value {raw!r} for {imdb_id}. Leaving it empty.")
        return np.nan


def update_movie_data(df, timeout=10):
    """Fill missing revenue, runtime and rating values from the OMDb API.

    For every row that has an ``IMDb_ID`` and is missing at least one of
    ``Box Office Revenue``, ``Runtime`` or ``vote_average``, one request is
    sent to ``OMDB_API_URL`` and only the missing fields are filled. Values
    already present are never overwritten.

    The frame is modified **in place** (and also returned), so an interrupted
    run keeps whatever was filled before the interruption.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``IMDb_ID``, ``Box Office Revenue``,
        ``Runtime`` and ``vote_average``.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        whole run. Defaults to 10 seconds.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Request failures, invalid JSON, unsuccessful API responses and values that
    cannot be parsed are reported with ``print`` and the row (or field) is
    skipped; none of them stops the loop.
    """
    total_rows_counter = 0  # stays 0 for an empty frame so the summary still prints

    for total_rows_counter, (index, row) in enumerate(df.iterrows(), start=1):
        if total_rows_counter % 500 == 0:
            print(f"Processed {total_rows_counter} rows so far")

        # Skipping rows where IMDb_ID is missing
        imdb_id = row['IMDb_ID']
        if pd.isna(imdb_id):
            continue

        # Working out which fields still need a value
        missing_revenue = pd.isna(row['Box Office Revenue'])
        missing_runtime = pd.isna(row['Runtime'])
        missing_rating = pd.isna(row['vote_average'])

        # Skipping if all fields are already populated (no request is made)
        if not (missing_revenue or missing_runtime or missing_rating):
            continue

        try:
            # Making the API request; the timeout bounds how long one row can block
            response = requests.get(
                OMDB_API_URL,
                params={'apikey': API_KEY, 'i': imdb_id},
                timeout=timeout,
            )

            # Checking if the response is valid JSON and not empty
            try:
                data = response.json()
            except ValueError:
                print(f"Row {total_rows_counter}: Invalid JSON response for {imdb_id}. Skipping.")
                continue

            # Checking if the response indicates success; a payload that is not
            # a JSON object is treated the same as an unsuccessful one
            if not isinstance(data, dict) or data.get('Response') != 'True':
                print(f"Row {total_rows_counter}: No valid data for {imdb_id}. Skipping.")
                continue

            # Updating Box Office revenue
            if missing_revenue:
                box_office = data.get('BoxOffice')
                if box_office and box_office != "N/A":
                    df.at[index, 'Box Office Revenue'] = _parse_or_nan(
                        clean_box_office, box_office, 'BoxOffice', imdb_id
                    )

            # Updating Runtime
            if missing_runtime:
                runtime = data.get('Runtime')
                if runtime and runtime != "N/A":
                    df.at[index, 'Runtime'] = _parse_runtime_minutes(runtime)

            # Updating vote_average
            # NOTE(review): this fills 'vote_average' from OMDb's 'imdbRating'
            # field; confirm that mixing the two rating sources is intended.
            if missing_rating:
                imdb_rating = data.get('imdbRating')
                if imdb_rating and imdb_rating != "N/A":
                    df.at[index, 'vote_average'] = _parse_or_nan(
                        float, imdb_rating, 'imdbRating', imdb_id
                    )

        except requests.exceptions.RequestException as e:
            print(f"Row {total_rows_counter}: Request failed for {imdb_id}: {e}")
            continue  # Skipping to the next movie

    print(f"Processing complete: Total rows processed = {total_rows_counter}")
    return df

In [46]:
# Smoke-test fixture: a single known movie whose three enrichable fields are
# all blank, so the API pass can be checked before it runs on the big table.
test_data = {
    'Movie Name': ['Ghosts of Mars'],
    'IMDb_ID': ['tt0228333'],
    **{column: [np.nan] for column in ('Box Office Revenue', 'Runtime', 'vote_average')},
}
df_test = pd.DataFrame(test_data)

# Show the fixture before any enrichment
print("Initial DataFrame:", df_test, sep="\n")

Initial DataFrame:
       Movie Name    IMDb_ID  Box Office Revenue  Runtime  vote_average
0  Ghosts of Mars  tt0228333                 NaN      NaN           NaN


In [47]:
# Run the enrichment on the one-row fixture and show what the API filled in
df_test = update_movie_data(df_test)
print("Updated DataFrame:", df_test, sep="\n")

Updated DataFrame:
       Movie Name    IMDb_ID  Box Office Revenue  Runtime  vote_average
0  Ghosts of Mars  tt0228333           8709640.0     98.0           4.9


In [48]:
# Running the API enrichment over the full CMU movies table.
# update_movie_data writes into the frame it is given, so if this cell is
# interrupted the rows handled so far are still updated in df_cmu_movies.
# NOTE(review): the saved output under this cell ("Done 500",
# "No valid response for ...") does not match the messages the current
# update_movie_data prints ("Processed N rows so far") — it appears to come
# from an earlier version of the function; re-run to refresh it.
df_cmu_movies = update_movie_data(df_cmu_movies)

Done 500
Done 1000
Done 1500
Done 2000
Done 2500
No valid response for tt7982466: Error getting data.
Done 3000
Done 3500
Done 4000
Done 4500
Done 5000
No valid response for tt7825630: Error getting data.
Done 5500
Done 6000
Done 6500
Done 7000
Done 7500
Done 8000
Done 8500
Done 9000
Done 9500
Done 10000
Done 10500
No valid response for tt0832278: Error getting data.
Done 11000
Done 11500
Done 12000
Done 12500
Done 13000
Done 13500
No valid response for tt1067580: Error getting data.
Done 14000
Done 14500
Done 15000
Done 15500
Done 16000
Done 16500
Done 17000
Done 17500
No valid response for tt33812162: Error getting data.
Done 18000
Done 18500
Done 19000
Done 19500
Done 20000
Done 20500
Done 21000
Done 21500
Request failed for nm1016169: Expecting ',' delimiter: line 1 column 54 (char 53)
Done 22000
Done 22500
No valid response for tt26768638: Error getting data.
Done 23000
Done 23500
Done 24000
Done 24500
Done 25000
Done 25500
Done 26000
Done 26500


KeyboardInterrupt: 

In [50]:
# Counting the movies in the enriched frame that still have no box-office value
null_box_office_count = df_cmu_movies['Box Office Revenue'].isna().sum()
print(null_box_office_count)

71106


In [49]:
# Baseline for comparison: reload the untouched pickle and count how many
# movies had no box-office value before the API pass.
# NOTE(review): this reuses the name null_box_office_count, which the cell
# counting nulls in df_cmu_movies also assigns, and the execution counts show
# this cell ran before that one — a fresh top-to-bottom run will leave the
# variable holding the baseline count instead.
df_cmu_test = pd.read_pickle("../../pickles/cmu_imdb_tmdb_merged.pkl")
null_box_office_count = df_cmu_test['Box Office Revenue'].isna().sum()
print(null_box_office_count)

73452


In [23]:
# Previewing the title alongside the three columns update_movie_data can fill
print(df_cmu_movies[['Movie Name', 'Box Office Revenue', 'Runtime', 'vote_average']].head())

                                          Movie Name  Box Office Revenue  \
0                                     Ghosts of Mars          14010832.0   
1  Getting Away with Murder: The JonBenét Ramsey ...                 NaN   
2                                        Brun bitter                 NaN   
3                                   White Of The Eye            225132.0   
4                                  A Woman in Flames                 NaN   

   Runtime  vote_average  
0     98.0           5.1  
1     95.0           NaN  
2     83.0           7.0  
3    110.0           5.8  
4    106.0           5.3  
