In [1]:
import pandas as pd
import numpy as np
import os, time,json
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

import warnings
warnings.filterwarnings('ignore')

In [2]:
pip install --upgrade pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
# example making new folder with os
os.makedirs('Data/',exist_ok=True) # Confirm folder created
os.listdir("Data/")

['.ipynb_checkpoints',
 'basics_compressed.csv.gz',
 'basics_compressed_cleaned.csv.gz',
 'ratings_compressed.csv.gz',
 'ratings_compressed_cleaned.csv.gz',
 'title-akas-us-only.csv',
 'title.basics (1).tsv.gz',
 'title.ratings (1).tsv.gz']

# Load Data

In [4]:
#Import title basics dataset
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)


In [5]:
# Import akas dataset
akas_url = r"C:\Users\admin\Documents\GitHub\IMDB-Movies-Analysis\Data\title-akas-us-only.csv"


akas = pd.read_csv(akas_url, low_memory= False)

In [6]:
# Import ratings dataset
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

ratings = pd.read_csv(ratings_url, sep="\t", low_memory= False)

In [8]:
# Create a copy of each dataset
basics2 = basics.copy()
akas2 = akas.copy()
ratings2 = ratings.copy()


# Explore Data

In [9]:
basics.head(15)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,\N,1,"Documentary,Short"
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,\N,1,"Documentary,Short"


In [10]:
ratings.head(30)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1990
1,tt0000002,5.8,265
2,tt0000003,6.5,1868
3,tt0000004,5.5,177
4,tt0000005,6.2,2655
5,tt0000006,5.0,182
6,tt0000007,5.4,830
7,tt0000008,5.4,2132
8,tt0000009,5.3,206
9,tt0000010,6.9,7265


In [11]:
akas.head(15)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
5,tt0000005,7,Blacksmithing,US,\N,\N,informal alternative title,0
6,tt0000006,3,Chinese Opium Den,US,\N,imdbDisplay,\N,0
7,tt0000007,1,Corbett and Courtney Before the Kinetograph,US,\N,imdbDisplay,\N,0
8,tt0000007,5,The Corbett-Courtney Fight,US,\N,alternative,\N,0
9,tt0000007,6,Jim Corbett vs. Peter Courtney,US,\N,alternative,\N,0


In [12]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10125398 entries, 0 to 10125397
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 695.3+ MB


In [13]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452562 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [14]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1344397 entries, 0 to 1344396
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1344397 non-null  object 
 1   averageRating  1344397 non-null  float64
 2   numVotes       1344397 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.8+ MB


# Preprocessing Data for Analysis

In [15]:
# Replace "\N" with np.nan
akas.replace("\\N", np.nan, inplace=True)
basics.replace("\\N", np.nan, inplace=True)
ratings.replace("\\N", np.nan, inplace=True)

In [16]:
# View null values for akas
akas.isna().sum().sum()

3326324

In [17]:
# View columns with null values
akas.isna().sum()

titleId                  0
ordering                 0
title                    2
region                   0
language           1448546
types               470886
attributes         1405548
isOriginalTitle       1342
dtype: int64

In [18]:
# View null values for basics
basics.isna().sum().sum()

18924719

In [19]:
# View columns with null values
basics.isna().sum()

tconst                   0
titleType                0
primaryTitle            17
originalTitle           17
isAdult                  1
startYear          1360165
endYear           10013536
runtimeMinutes     7095152
genres              455831
dtype: int64

In [21]:
# View null values for ratings
ratings.isna().sum().sum()

0

In [22]:
# View columns with null values
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [23]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = basics['tconst'].isin(akas['titleId'])
keepers

0            True
1            True
2           False
3           False
4            True
            ...  
10125393    False
10125394    False
10125395    False
10125396    False
10125397    False
Name: tconst, Length: 10125398, dtype: bool

In [24]:
# Filter the basics DataFrame to keep only titles also listed in the US filtered akas
basics = basics[keepers]

In [25]:
# View change in basics dataset
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1365255 entries, 0 to 10125353
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   tconst          1365255 non-null  object
 1   titleType       1365255 non-null  object
 2   primaryTitle    1365253 non-null  object
 3   originalTitle   1365253 non-null  object
 4   isAdult         1365255 non-null  object
 5   startYear       1267566 non-null  object
 6   endYear         37423 non-null    object
 7   runtimeMinutes  863498 non-null   object
 8   genres          1336849 non-null  object
dtypes: object(9)
memory usage: 104.2+ MB


In [26]:
# Drop NAs in runtimeMinutes, genres, startYear columns
basics = basics.dropna(subset=['runtimeMinutes', 'genres', 'startYear'])

In [27]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 845768 entries, 0 to 10125259
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          845768 non-null  object
 1   titleType       845768 non-null  object
 2   primaryTitle    845767 non-null  object
 3   originalTitle   845767 non-null  object
 4   isAdult         845768 non-null  object
 5   startYear       845768 non-null  object
 6   endYear         20432 non-null   object
 7   runtimeMinutes  845768 non-null  object
 8   genres          845768 non-null  object
dtypes: object(9)
memory usage: 64.5+ MB


In [28]:
#Keep only movies where titleType==Movie
basics = basics.loc[basics['titleType'] == 'movie']

In [29]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200301 entries, 8 to 10125164
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          200301 non-null  object
 1   titleType       200301 non-null  object
 2   primaryTitle    200300 non-null  object
 3   originalTitle   200300 non-null  object
 4   isAdult         200301 non-null  object
 5   startYear       200301 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  200301 non-null  object
 8   genres          200301 non-null  object
dtypes: object(9)
memory usage: 15.3+ MB


In [30]:
# Replace startYear '\\N' with NaN
basics['startYear'] = pd.to_numeric(basics['startYear'], errors='coerce')


In [31]:
#Convert the startYear column to float data type
basics['startYear'] = basics['startYear'].astype(float)

In [32]:
#Filter the dataframe using startYear. Keep years between 2000-2021
basics = basics[basics["startYear"]>= 2000]

In [33]:
#Filter the dataframe using startYear. Keep years between 2000-2021
basics = basics[basics["startYear"]<= 2021]

In [34]:
#Eliminate movies that include "Documentary" in the genre
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]


In [35]:
basics.info()


<class 'pandas.core.frame.DataFrame'>
Index: 81882 entries, 34800 to 10125164
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81882 non-null  object 
 1   titleType       81882 non-null  object 
 2   primaryTitle    81881 non-null  object 
 3   originalTitle   81881 non-null  object 
 4   isAdult         81882 non-null  object 
 5   startYear       81882 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  81882 non-null  object 
 8   genres          81882 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.2+ MB


In [36]:
# Include the US by using the filter akas dataframe
ratings_keepers =ratings['tconst'].isin(basics['tconst'])
ratings_keepers


0          False
1          False
2          False
3          False
4          False
           ...  
1344392    False
1344393    False
1344394    False
1344395    False
1344396    False
Name: tconst, Length: 1344397, dtype: bool

In [38]:
# apply filter to ratings data
ratings = ratings[ratings_keepers]

ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68056 entries, 17893 to 1344368
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         68056 non-null  object 
 1   averageRating  68056 non-null  float64
 2   numVotes       68056 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.1+ MB


# Modified Datasets

In [39]:
# View modified basics dataset
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81882 entries, 34800 to 10125164
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81882 non-null  object 
 1   titleType       81882 non-null  object 
 2   primaryTitle    81881 non-null  object 
 3   originalTitle   81881 non-null  object 
 4   isAdult         81882 non-null  object 
 5   startYear       81882 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  81882 non-null  object 
 8   genres          81882 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.2+ MB


In [40]:
# Calculate null value sums and row counts for original dataset
null_counts_basics2 = basics2.isna().sum().sum()
row_count_basics2 = len(basics2)

# Calculate null value sums and row counts for modified dataset
null_counts_basics = basics.isna().sum().sum()
row_count_basics = len(basics)

# Calculate the differences
null_count_diff = null_counts_basics2 - null_counts_basics
row_count_diff = row_count_basics2 - row_count_basics

# Print out the differences
print(f"There were {null_count_diff} null values modified and {row_count_diff} rows eliminated from the original dataset.")

There were -81833 null values modified and 10043516 rows eliminated from the original dataset.


In [41]:
# View modified akas dataset
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452562 non-null  object
 3   region           1452564 non-null  object
 4   language         4018 non-null     object
 5   types            981678 non-null   object
 6   attributes       47016 non-null    object
 7   isOriginalTitle  1451222 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [42]:
# Calculate null value sums and row counts for original dataset
null_counts_akas2 = akas2.isna().sum().sum()
row_count_akas2 = len(akas2)

# Calculate null value sums and row counts for modified dataset
null_counts_akas = akas.isna().sum().sum()
row_count_akas = len(akas)

# Calculate the differences
null_count_diff_akas = null_counts_akas2 - null_counts_akas
row_count_diff_akas = row_count_akas2 - row_count_akas

# Print out the differences
print("There were {} null values modified and {} rows eliminated from the original dataset.".format(
    null_count_diff_akas.sum(), row_count_diff_akas))

There were -3326322 null values modified and 0 rows eliminated from the original dataset.


In [43]:
# View modified ratings dataset
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68056 entries, 17893 to 1344368
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         68056 non-null  object 
 1   averageRating  68056 non-null  float64
 2   numVotes       68056 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.1+ MB


In [44]:
# Calculate null value sums and row counts for original dataset
null_counts_ratings2 = ratings2.isna().sum().sum()
row_count_ratings2 = len(ratings2)

# Calculate null value sums and row counts for modified dataset
null_counts_ratings = ratings.isna().sum().sum()
row_count_ratings = len(ratings)

# Calculate the differences
null_count_diff_ratings = null_counts_ratings2 - null_counts_ratings
row_count_diff_ratings = row_count_ratings2 - row_count_ratings

# Print out the differences
print("There were {} null values modified and {} rows eliminated from the original dataset.".format(
    null_count_diff_ratings.sum(), row_count_diff_ratings))


There were 0 null values modified and 1276341 rows eliminated from the original dataset.


# Save Cleaned Datasets

In [45]:
# Define the path to the "Data/" folder inside your repository
data_folder = "Data/"

# Save each DataFrame to a compressed CSV file
basics.to_csv(data_folder + "basics_compressed_cleaned.csv.gz", compression="gzip", index=False)
akas.to_csv(data_folder + "akas_compressed_cleaned.csv.gz", compression="gzip", index=False)
ratings.to_csv(data_folder + "ratings_compressed_cleaned.csv.gz", compression="gzip", index=False)

print("DataFrames saved to compressed CSV files.")

DataFrames saved to compressed CSV files.
