In [31]:



def get_movie_with_rating(movie_id):
    """Adapted from source = https://github.com/celiao/tmdbsimple"""
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    
    # save the .info .releases dictionaries
    info = movie.info()
    
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if c['iso_3166_1'] == 'US':
        ## save a 'certification' key in info with the certification
        info['certification'] = c['certification']
        
        
    return info
    


In [32]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)


In [33]:
def read_and_fix_json(JSON_FILE):
    """Attempts to read in json file of records and fixes the final character
    to end with a ] if it errors.
    
    Args:
        JSON_FILE (str): filepath of JSON file
        
    Returns:
        DataFrame: the corrected data from the bad json file
    """
    try: 
        previous_df =  pd.read_json(JSON_FILE)
    
    ## If read_json throws an error
    except:
        
        ## manually open the json file
        with open(JSON_FILE,'r+') as f:
            ## Read in the file as a STRING
            bad_json = f.read()
            
            ## if the final character doesn't match first, select the right bracket
            first_char = bad_json[0]
            final_brackets = {'[':']', 
                           "{":"}"}
            ## Select expected final brakcet
            final_char = final_brackets[first_char]
            
            ## if the last character in file doen't match the first char, add it
            if bad_json[-1] != final_char:
                good_json = bad_json[:-1]
                good_json+=final_char
            else:
                raise Exception('ERROR is not due to mismatched final bracket.')
            
            ## Rewind to start of file and write new good_json to disk
            f.seek(0)
            f.write(good_json)
           
        ## Load the json file again now that its fixed
        previous_df =  pd.read_json(JSON_FILE)
        
    return previous_df
	
	

In [5]:
import pandas as pd
import numpy as np

# example making new folder with os
import os, time,json
import tmdbsimple as tmdb 
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus


In [30]:
# Load the three files into respective dataframes
url_akas = "https://datasets.imdbws.com/title.akas.tsv.gz"
url_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
url_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"

akas_df = pd.read_csv(url_akas, compression='gzip', sep='\t', low_memory=False)
basics_df = pd.read_csv(url_basics, compression='gzip', sep='\t', low_memory=False)
ratings_df = pd.read_csv(url_ratings, compression='gzip', sep='\t', low_memory=False)

# Display the first few rows of each dataframe to understand their structure
akas_df.head(), basics_df.head(), ratings_df.head()


(     titleId  ordering                      title region language  \
 0  tt0000001         1                 Карменсіта     UA       \N   
 1  tt0000001         2                 Carmencita     DE       \N   
 2  tt0000001         3  Carmencita - spanyol tánc     HU       \N   
 3  tt0000001         4                 Καρμενσίτα     GR       \N   
 4  tt0000001         5                 Карменсита     RU       \N   
 
          types     attributes isOriginalTitle  
 0  imdbDisplay             \N               0  
 1           \N  literal title               0  
 2  imdbDisplay             \N               0  
 3  imdbDisplay             \N               0  
 4  imdbDisplay             \N               0  ,
       tconst titleType            primaryTitle           originalTitle  \
 0  tt0000001     short              Carmencita              Carmencita   
 1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
 2  tt0000003     short          Pauvre Pierrot          P

In [None]:
akas_df = akas_df[(akas_df['region'] == 'US')]

In [None]:
akas_df.replace({'\\N': np.nan}, inplace=True)

# Display the first few rows of the processed dataframe
akas_df.head()



In [None]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers1 =basics_df['tconst'].isin(akas_df['titleId'])
keepers1



In [None]:
basics_df.replace({'\\N': np.nan}, inplace=True)

# Display the first few rows of the processed dataframe
basics_df.head()
basics_df.info()


In [None]:
basics_df = basics_df[keepers1]
basics_df



In [None]:
basics_df = basics_df[basics_df['runtimeMinutes'].notna()]

basics_df = basics_df[basics_df['genres'].notna()]

basics_df = basics_df[basics_df.titleType == 'movie']

basics_df = basics_df[basics_df['startYear'].notna()]

basics_df['startYear'] = basics_df['startYear'].astype(float)

basics_df.dtypes

In [None]:
basics_df

In [None]:
# Filtering the basics dataframe using startYear column to keep movies between 2000 and 2021 inclusive
basics_df = basics_df[(basics_df['startYear'] >= 2000) & (basics_df['startYear'] <= 2021)]
basics_df.info()

In [None]:
# Exclude movies that are included in the documentary category.
is_documentary = basics_df['genres'].str.contains('documentary',case=False)
basics_df = basics_df[~is_documentary]
basics_df.info()

In [None]:
ratings_df.replace({'\\N': np.nan}, inplace=True)

In [None]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers2 =ratings_df['tconst'].isin(basics_df['tconst'])
keepers2


In [None]:
ratings_df = ratings_df[keepers2]
ratings_df

In [None]:
# Save Dataframe
akas_df.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)

# Open saved file
akas_df = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
akas_df.head()
akas_df.info() # resubmit added


In [None]:
basics_df.to_csv("Data/title_basics.csv.gz", compression='gzip' , index=False)

#open saved file
basics_df = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics_df.head()
basics_df.info() #resubmit added


In [None]:
ratings_df.to_csv("Data/title_ratings.csv.gz", compression='gzip' , index=False)
#open saved file
ratings_df = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
ratings_df.head()
ratings_df.info() #resubmit added

## Using API

In [None]:
import json
with open('/Users/corycates/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()



In [None]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']



In [None]:
movie.info()

In [None]:
info['budget']


In [None]:
info['revenue']


In [None]:
info['imdb_id']



In [None]:
# TEST FUNCTION FOR avengers
test = get_movie_with_rating("tt0848228") #put your function name here
test





In [None]:
# TEST FUNCTION FOR NOTEBOOK
test = get_movie_with_rating("tt0332280") #put your function name here
test

In [None]:
YEARS_TO_GET = [2000,2001]

YEARS_TO_GET

In [None]:
# Error list to reference later after the loops
errors = [ ]



In [None]:
from tqdm.notebook import tqdm_notebook
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # Some code to execute for each YEAR
    print(YEAR)


In [None]:
#Defining the JSON file to store results for year
JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'



In [None]:
# Check if file exists
file_exists = os.path.isfile(JSON_FILE)



In [None]:
# If it does not exist: create it
if file_exists == False:
# save an empty dict with just "imdb_id" to the new json file.
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id':0}],f)



In [None]:
#Saving new year as the current df
current_df = basics_df.loc[ basics_df['startYear']==YEAR].copy()
# saving movie ids to list
movie_ids = df['tconst'].copy()

current_df.head(5)




In [None]:
# Load existing data from json into a dataframe called "previous_df"
previous_df = pd.read_json(JSON_FILE)



In [None]:
# filter out any ids that are already in the JSON_FILE
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]



## Start of the Inner Loop

In [None]:
    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve then data for the movie id
            temp = get_movie_with_rating(movie_id)  
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])



## After the Loop

In [None]:
final_year_df = pd.read_json(JSON_FILE)
final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)



In [None]:
print(f"- Total errors: {len(errors)}")



In [None]:
# Instead of previous_df=pd.read_json:
previous_df = read_and_fix_json(JSON_FILE)



# Part 3 Observation:  All years are appended to one file (2000 and 2001).  To get both years, the for loop needs to be altered to reflect both.  

In [None]:
# Create connection string using credentials following this format
# connection = "dialect+driver://username:password@host:port/database"
Movies = "mysql+pymysql://root:password@localhost:3306/Movies"

In [None]:
engine = create_engine(Movies)

engine

In [None]:
# Check if the database exists. If not, create it.
if database_exists(Movies) == False:
  create_database(Movies)
else:
  print('The database already exists')

In [None]:
# Read the files
tmdb_data = pd.read_csv("Data/final_tmdb_data_2001.csv.gz", compression='gzip')
title_ratings = pd.read_csv("Data/title.ratings.tsv.gz",  compression='gzip', sep=',', low_memory=False)
title_basics = pd.read_csv("Data/title.basics.csv.gz", compression='gzip')

tmdb_data.head(), title_ratings.head(), title_basics.head()

In [None]:
# Step 1: Getting a List of Unique Genres

# Convert genre strings into lists in a new 'genres_split' column for basics_df
title_basics['genres_split'] = title_basics['genres'].str.split(',')

# Explode the lists into new rows
exploded_genres_basics = title_basics.explode('genres_split')

# Identify and save the unique genres, sorted alphabetically
unique_genres_basics = sorted(exploded_genres_basics['genres_split'].unique())



In [None]:
# Save just the tconst and genres_split as new df
title_genres = exploded_genres[['tconst', 'genres_split']].copy()
title_genres.head()

In [None]:
## Making the genre mapper dictionary
genre_ints = range(len(unique_genres_basics))
genre_map = dict(zip(unique_genres_basics, genre_ints))
genre_map



In [None]:
## make new integer genre_id and drop string genres
title_genres['genre_id'] = title_genres['genres_split'].map(genre_map)
title_genres = title_genres.drop(columns='genres_split')



In [None]:
# Create a DataFrame from the genre_map_basics dictionary
genres = pd.DataFrame({'genre_name': list(genre_map_basics.keys()), 'genre_id': list(genre_map_basics.values())})

genres.head()


In [None]:
# Drop the unnecessary columns from title_basics
title_basics = title_basics.drop(columns=['originalTitle', 'isAdult', 'titleType', 'genres'])

title_basics.head()


### II) Saving the MySQL tables with tconst as the primary key.

In [None]:
## Set the dataframe index and use index=True 
title_genres.set_index('genre_id').to_sql('title_genres',engine,index=True)


In [None]:
genres.set_index('genre_id').to_sql('genres',engine,index=True)

In [None]:
# Getting BLob error...will try an alternative
#tmdb_data[['imdb_id', 'revenue', 'budget', 'certification']].set_index('imdb_id').to_sql('tmdb_data', engine, if_exists='replace', index=True)
# Define the data types

import sqlalchemy

# Define the data types
data_types = {
    'imdb_id': sqlalchemy.types.VARCHAR(length=255),
    'revenue': sqlalchemy.types.Float,
    'budget': sqlalchemy.types.Float,
    'certification': sqlalchemy.types.VARCHAR(length=255)  # Specify a length here
}

# Use the to_sql function with the dtype parameter
tmdb_data[['imdb_id', 'revenue', 'budget', 'certification']].set_index('imdb_id').to_sql(
    'tmdb_data', 
    engine, 
    if_exists='replace', 
    index=True, 
    dtype=data_types
)


In [None]:
## get max string length
max_str_len = title_basics['tconst'].fillna('').map(len).max()



In [None]:

from sqlalchemy.types import *
## Calculate max string lengths for object columns
key_len = title_basics['tconst'].fillna('').map(len).max()
title_len = title_basics['primaryTitle'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
df_schema = {
    "tconst": String(key_len+1), 
    "primaryTitle": Text(title_len+1),
    'startYear':Float(),
    'runtimeMinutes':Integer()}



In [None]:
# Save to sql with dtype and index=False
title_basics.to_sql('title_basics',engine,dtype=df_schema,if_exists='replace',index=False)


In [None]:
engine.execute('ALTER TABLE title_basics ADD PRIMARY KEY (`tconst`);')



In [None]:
## get max string length
max_str_len = title_ratings['tconst'].fillna('').map(len).max()


In [None]:

from sqlalchemy.types import *
## Calculate max string lengths for object columns
key_len = title_ratings['tconst'].fillna('').map(len).max()
#title_len = title_ratings['primaryTitle'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
df_schema = {
    "tconst": String(key_len+1), 
    'numVotes':Float(),
    'averageRating':Integer()}
    


In [None]:
# Save to sql with dtype and index=False
title_ratings.to_sql('title_ratings',engine,dtype=df_schema,if_exists='replace',index=False)


In [None]:
engine.execute('ALTER TABLE title_ratings ADD PRIMARY KEY (`tconst`);')

In [None]:

genres.head(5)

In [None]:
title_basics.head(5)

In [None]:
title_genres.head(5)

In [None]:
title_ratings.head(5)


In [None]:
tmdb_data.head()

In [None]:
q = """SHOW TABLES;"""
pd.read_sql(q, engine)


# Part 4

In [39]:
import json
with open('/Users/corycates/.secret/tmdb_api.json', 'r') as file:
    login = json.load(file)
## Display the keys of the loaded dict
login.keys()



dict_keys(['api-key'])

In [40]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']


In [41]:
## make a movie object using the .Movies function from tmdb
movie = tmdb.Movies(603)


In [42]:
## movie objects have a .info dictionary 
info = movie.info()
info



{'adult': False,
 'backdrop_path': '/oMsxZEvz9a708d49b6UdZK1KAo5.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 73.051,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 372,
   'logo_path': None,
   'name': 'Groucho II Film

## Data Analysis on Movie Revenue

### Hypothesis 1: 
#### **Question**: 
Does the popularity of a movie affect how much revenue the movie generates?

- **Null Hypothesis (H0)**: The popularity of a movie does not have any effect on the revenue it generates.
- **Alternative Hypothesis (Ha)**: The popularity of a movie does affect the revenue it generates.

**Assumptions**:
- Revenue data is accurately reported and has no missing values.
- Movies with higher popularity scores are not necessarily more popular; they just have different content guidelines.
- The sample is representative of the general trend in the movie industry over the selected period.

---

### Hypothesis 2 (Modified based on available data):
#### **Question**: 
Do movies with higher vote averages earn more revenue?

- **Null Hypothesis (H0)**: The vote average of a movie does not have any effect on the revenue it generates.
- **Alternative Hypothesis (Ha)**: Movies with higher vote averages earn different revenue than movies with lower vote averages.

**Assumptions**:
- Vote average is accurately reported.
- Movies with higher vote averages do not necessarily have better content; they're just rated higher.
- The sample represents movies from various genres, production companies, and target audiences.

---

### Hypothesis 3:
#### **Question**: 
Do movies released in 2020 earn less revenue than movies released in 2018?

- **Null Hypothesis (H0)**: Release year (whether 2020 or 2018) does not affect the revenue of movies.
- **Alternative Hypothesis (Ha)**: Movies released in 2020 earn different revenue than movies released in 2018.

**Assumptions**:
- Release year is accurately reported.
- Movies from both years had similar opportunities for marketing and promotions.
- External factors, such as global events or market dynamics, are considered when comparing revenues.


In [43]:
import pandas as pd

# Define the path to the JSON file
json_file_path = '/Users/corycates/Documents/GitHub/Data_Enrichment/Data1/tmdb_api_proj4_2000 - 2022.json'

# Load the JSON data into a Pandas DataFrame
try:
    all_movies_df = pd.read_json(json_file_path)
    print("Data loaded successfully.")
except Exception as e:
    print(f"An error occurred while loading the data: {str(e)}")


Data loaded successfully.


In [45]:
import requests
import csv
import os

# Replace with your API key
API_KEY = "ffb1d4f5fc2d9aeede7ea943b2b70db0"

# Base URL for fetching movie data
BASE_URL = "https://api.themoviedb.org/3/discover/movie"

# Directory to store the CSV files
DIRECTORY = "/Users/corycates/Documents/GitHub/Data_Enrichment/Data"

# Specify the fields you want from the API
# You can customize this list based on the columns you need
FIELDS = [
    "title",
    "overview",
    "release_date",
    "popularity",
    "revenue",  # Include revenue field
    "vote_average",  # Include ratings field (vote_average)
]

# Function to fetch movies for a given year and save them as CSV with selected fields
def fetch_movies_for_year(year):
    params = {
        "api_key": API_KEY,
        "primary_release_year": year,
        "sort_by": "popularity.desc",
        "fields": ",".join(FIELDS),  # Convert the list of fields to a comma-separated string
    }

    response = requests.get(BASE_URL, params=params)

    if response.status_code == 200:
        movies = response.json().get("results", [])
        csv_filename = os.path.join(DIRECTORY, f"movies_{year}.csv")

        with open(csv_filename, "w", newline="", encoding="utf-8") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=FIELDS)
            writer.writeheader()
            for movie in movies:
                writer.writerow({field: movie.get(field, "") for field in FIELDS})

        print(f"Movies for {year} fetched and saved as CSV.")
    else:
        print(f"Error fetching movies for {year}: {response.text}")

# Create the directory if it doesn't exist
os.makedirs(DIRECTORY, exist_ok=True)

# Fetch movies for the years 2000 to 2022 with selected fields and save as CSV
for year in range(2000, 2023):
    fetch_movies_for_year(year)



Movies for 2000 fetched and saved as CSV.
Movies for 2001 fetched and saved as CSV.
Movies for 2002 fetched and saved as CSV.
Movies for 2003 fetched and saved as CSV.
Movies for 2004 fetched and saved as CSV.
Movies for 2005 fetched and saved as CSV.
Movies for 2006 fetched and saved as CSV.
Movies for 2007 fetched and saved as CSV.
Movies for 2008 fetched and saved as CSV.
Movies for 2009 fetched and saved as CSV.
Movies for 2010 fetched and saved as CSV.
Movies for 2011 fetched and saved as CSV.
Movies for 2012 fetched and saved as CSV.
Movies for 2013 fetched and saved as CSV.
Movies for 2014 fetched and saved as CSV.
Movies for 2015 fetched and saved as CSV.
Movies for 2016 fetched and saved as CSV.
Movies for 2017 fetched and saved as CSV.
Movies for 2018 fetched and saved as CSV.
Movies for 2019 fetched and saved as CSV.
Movies for 2020 fetched and saved as CSV.
Movies for 2021 fetched and saved as CSV.
Movies for 2022 fetched and saved as CSV.


In [25]:

# Step 1: Using glob to get a list of JSON files
json_files = glob.glob("/Users/corycates/Documents/GitHub/Data_Enrichment/Data/movies_*.json")

# Step 2: Load the JSON data into DataFrames
dataframes = [pd.read_json(json_file) for json_file in json_files]

# Step 3: Merge the DataFrames into one comprehensive DataFrame
all_movies_df = pd.concat(dataframes, ignore_index=True)

# Display the first few rows and column names
print("First few rows of the DataFrame:")
print(all_movies_df.head())
print("\nColumn names:")
print(all_movies_df.columns)

# Now you have a single DataFrame 'all_movies_df' containing data from 2000 to 2022.


First few rows of the DataFrame:
   adult                     backdrop_path                    genre_ids  \
0  False  /2u7zbn8EudG6kLlBzUYqP8RyFU4.jpg                 [12, 14, 28]   
1  False  /7yxjg8pvp3JuqguUaJPYTma6Z7t.jpg                 [12, 14, 28]   
2  False  /1Q1tAM49hoT3Hsj2kpx8O34kG01.jpg  [10751, 16, 12, 35, 14, 28]   
3  False  /605ueaRtnDz4Lj3CUluW9wbbh4x.jpg                  [16, 10751]   
4  False  /8Id5xQr54BCdVRDQM9i0s8P9BUw.jpg                  [10749, 18]   

      id original_language                                     original_title  \
0    122                en      The Lord of the Rings: The Return of the King   
1     22                en  Pirates of the Caribbean: The Curse of the Bla...   
2  14411                en                   Sinbad: Legend of the Seven Seas   
3     12                en                                       Finding Nemo   
4  75432                ko                                      맛있는 섹스 그리고 사랑   

                             

## New heading

In [50]:
# Merge the movies and ratings dataframes on the common column 'imdb_id'
merged_df = movies_df.merge(ratings_df, on='imdb_id', how='left')

# Now you can visualize the distribution of revenue based on MPAA rating
plt.figure(figsize=(12, 6))
sns.boxplot(data=merged_df, x='mpaa_rating', y='revenue')
plt.title('Revenue vs. MPAA Rating')
plt.xlabel('MPAA Rating')
plt.ylabel('Revenue')
plt.show()


NameError: name 'movies_df' is not defined

In [37]:
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1993
1,tt0000002,5.8,267
2,tt0000003,6.5,1875
3,tt0000004,5.5,177
4,tt0000005,6.2,2658
