# Project 3: Part 1


In [1]:
import os
os.makedirs('Data/',exist_ok=True) # Confirm folder created
os.listdir("Data/")

['.ipynb_checkpoints',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz',
 'title_basics.csv.gz']

In [2]:
import pandas as pd
import numpy as np

# Load AKAs data
akas_df = pd.read_csv('Data/title-akas-us-only.csv')

# Keep only US movies
akas_df = akas_df[akas_df['region'] == 'US']

# Replace "\N" with np.nan
akas_df.replace({'\\N': np.nan}, inplace=True)

  akas_df = pd.read_csv('Data/title-akas-us-only.csv')


In [5]:
# Website Urls
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [6]:
# Reading the Website Urls, not including us data
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [8]:
# Load in the data
title_basics_df = basics

# Keep only US movies using AKAs table
us_movie_ids = akas_df['titleId'].unique()
title_basics_df = title_basics_df[title_basics_df['tconst'].isin(us_movie_ids)]

# Replace "\N" with np.nan (create a copy to avoid warning message)
title_basics_df = title_basics_df.copy()
title_basics_df.replace({'\\N': np.nan}, inplace=True)

# Eliminate movies with null values (using .loc indexer)
title_basics_df.dropna(subset=['runtimeMinutes', 'genres'], inplace=True)

# Keep only titleType == 'Movie'
title_basics_df = title_basics_df[title_basics_df['titleType'] == 'movie']

# Convert startYear to float data type
title_basics_df['startYear'] = title_basics_df['startYear'].astype(float)

# Filter dataframe using startYear
title_basics_df = title_basics_df[(title_basics_df['startYear'] >= 2000) & (title_basics_df['startYear'] <= 2021)]

# Eliminate movies with "Documentary" genre
title_basics_df = title_basics_df[~title_basics_df['genres'].str.contains("Documentary")]

In [9]:
# Load Ratings data
ratings_df = ratings.copy()

# Keep only movies in final title basics dataframe
final_movie_ids = title_basics_df['tconst'].unique()
ratings_df = ratings_df[ratings_df['tconst'].isin(final_movie_ids)]

# Replace "\N" with np.nan (create a copy to avoid the warning)
ratings_df.replace({'\\N': np.nan}, inplace=True)

In [10]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = title_basics_df['tconst'].isin(akas_df['titleId'])
title_basics_df = title_basics_df[keepers]

In [11]:
basics = title_basics_df[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61112,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67486,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,,90,Drama
67664,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86791,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
10102949,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
10103343,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019.0,,97,"Comedy,Drama,Fantasy"
10103483,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
10103492,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [12]:
# Save filtered dataframes
title_basics_df.to_csv("Data/title_basics_filtered.csv.gz", compression='gzip', index=False)
ratings_df.to_csv("Data/ratings_filtered.csv.gz", compression='gzip', index=False)

# Display info summary
print("Title Basics Info:")
print(title_basics_df.info())
print("\nRatings Info:")
print(ratings_df.info())

Title Basics Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 81889 entries, 34800 to 10103576
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81889 non-null  object 
 1   titleType       81889 non-null  object 
 2   primaryTitle    81889 non-null  object 
 3   originalTitle   81889 non-null  object 
 4   isAdult         81889 non-null  object 
 5   startYear       81889 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  81889 non-null  object 
 8   genres          81889 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.2+ MB
None

Ratings Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 68046 entries, 17882 to 1340938
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         68046 non-null  object 
 1   averageRating  68046 non-null  float64
 2   numV

In [None]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()