# Project 3: Part 1


In [1]:
import os

# Make sure the local data folder exists (no error if it is already there),
# then list its contents to see which files are currently available.
data_folder = 'Data/'
os.makedirs(data_folder, exist_ok=True)
os.listdir(data_folder)

['.ipynb_checkpoints',
 'ratings_filtered.csv.gz',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz',
 'title_basics.csv.gz',
 'title_basics_filtered.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_2000.csv.gz',
 'tmdb_results_2001.csv.gz']

In [None]:
import pandas as pd
import numpy as np

# Build the AKAs table in one chain:
#   1. read the local CSV,
#   2. keep only the rows whose region is 'US',
#   3. turn the literal "\N" placeholder into a real missing value (np.nan).
akas_df = (
    pd.read_csv('Data/title-akas-us-only.csv')
    .loc[lambda frame: frame['region'] == 'US']
    .replace({'\\N': np.nan})
)

In [None]:
# Download locations of the IMDb title-basics and title-ratings files
# (gzip-compressed .tsv files)
basics_url, ratings_url = (
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz',
)

In [None]:
# Read the basics and ratings tables straight from their URLs, in that order.
# Both files are tab-separated. The US-only AKAs data is not part of this
# download; it is read from a local file instead.
basics, ratings = (
    pd.read_csv(url, sep='\t', low_memory=False)
    for url in (basics_url, ratings_url)
)

In [None]:
# IDs of every title that appears in the US-only AKAs table
us_movie_ids = akas_df['titleId'].unique()

# Build the filtered title-basics table as a single chain. Every step returns
# a new frame, so the `basics` frame itself is never modified.
title_basics_df = (
    basics
    # keep only titles listed in the AKAs table
    .loc[basics['tconst'].isin(us_movie_ids)]
    # turn the "\N" placeholder into real missing values
    .replace({'\\N': np.nan})
    # drop titles that have no runtime or no genres
    .dropna(subset=['runtimeMinutes', 'genres'])
    # keep only rows whose titleType is 'movie'
    .loc[lambda frame: frame['titleType'] == 'movie']
    # startYear as float so it can be compared numerically
    .assign(startYear=lambda frame: frame['startYear'].astype(float))
    # keep start years from 2000 through 2021, both ends included
    .loc[lambda frame: (frame['startYear'] >= 2000) & (frame['startYear'] <= 2021)]
    # remove every title whose genres mention "Documentary"
    .loc[lambda frame: ~frame['genres'].str.contains("Documentary")]
)

In [None]:
# Ratings are only kept for titles present in the final title-basics table
final_movie_ids = title_basics_df['tconst'].unique()

# Select the matching rating rows, then swap the "\N" placeholder for np.nan.
# Both steps return new frames, so the `ratings` frame is left as it was.
ratings_df = (
    ratings
    .loc[ratings['tconst'].isin(final_movie_ids)]
    .replace({'\\N': np.nan})
)

In [None]:
# Boolean mask: True for each row whose tconst appears among the AKAs titleIds
keepers = title_basics_df['tconst'].isin(akas_df['titleId'])

# Keep only the rows flagged by the mask
title_basics_df = title_basics_df.loc[keepers]

In [None]:
# `title_basics_df` was already filtered with the `keepers` mask, so the mask
# is not applied a second time: it was computed against the frame as it was
# before that filtering and selects nothing further. Take an independent copy
# instead, and preview only the first rows rather than the whole frame.
basics = title_basics_df.copy()
basics.head()

In [None]:
# Save filtered dataframes as gzip-compressed CSV files, without the index
title_basics_df.to_csv("Data/title_basics_filtered.csv.gz", compression='gzip', index=False)
ratings_df.to_csv("Data/ratings_filtered.csv.gz", compression='gzip', index=False)

# Display info summary.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each
# report.
print("Title Basics Info:")
title_basics_df.info()
print("\nRatings Info:")
ratings_df.info()

In [None]:
# Re-open the filtered title-basics file that this notebook saves
# ("Data/title_basics_filtered.csv.gz") and preview its first rows.
basics = pd.read_csv("Data/title_basics_filtered.csv.gz", low_memory=False)
basics.head()

In [None]:
import pandas as pd
# `plt` and `sns` are used below, so they are imported here to keep the cell
# runnable on a fresh kernel.
import matplotlib.pyplot as plt
import seaborn as sns

# Merge basics and ratings dataframes on 'tconst'
merged_df = pd.merge(basics, ratings, on='tconst')

# TODO: data cleaning and formatting steps go here.

# The box plot needs both of these columns; stop with a clear message if the
# merged frame does not provide them.
# NOTE(review): none of the basics/ratings columns used elsewhere in this
# notebook are named 'MPAA Rating' or 'Revenue' -- presumably they come from
# another dataset (e.g. the TMDB results files); confirm and merge it in.
required_columns = ['MPAA Rating', 'Revenue']
missing_columns = [col for col in required_columns if col not in merged_df.columns]
if missing_columns:
    raise KeyError(
        f"merged_df is missing the column(s) needed for the plot: {missing_columns}"
    )

# MPAA Rating and Revenue Analysis
plt.figure(figsize=(10, 6))
sns.boxplot(x='MPAA Rating', y='Revenue', data=merged_df)
plt.title('MPAA Rating vs. Revenue')
plt.xlabel('MPAA Rating')
plt.ylabel('Revenue')
plt.show()