In [1]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) # Confirm folder created
os.listdir("Data/")

['.ipynb_checkpoints',
 'akas_compressed.csv.gz',
 'basics_compressed.csv.gz',
 'ratings_compressed.csv.gz',
 'title-akas-us-only.csv',
 'title.basics (1).tsv.gz',
 'title.ratings (1).tsv.gz']

In [2]:
import pandas as pd
import numpy as np

In [3]:
pip install --upgrade pandas

Note: you may need to restart the kernel to use updated packages.


In [4]:
#Import title basics dataset
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)


In [5]:
# Import akas dataset
akas_url = r"C:\Users\admin\Documents\GitHub\IMDB-Movies-Analysis\Data\title-akas-us-only.csv"


akas = pd.read_csv(akas_url, low_memory= False)

In [6]:
# Import ratings dataset
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

ratings = pd.read_csv(ratings_url, sep="\t", low_memory= False)

In [7]:
# Create a copy of each dataset
basics2 = basics.copy()
akas2 = akas.copy()
ratings2 = ratings.copy()


In [8]:
basics.head(15)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,\N,1,"Documentary,Short"
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,\N,1,"Documentary,Short"


In [9]:
ratings.head(30)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1990
1,tt0000002,5.8,264
2,tt0000003,6.5,1863
3,tt0000004,5.5,177
4,tt0000005,6.2,2647
5,tt0000006,5.0,182
6,tt0000007,5.4,829
7,tt0000008,5.4,2131
8,tt0000009,5.3,205
9,tt0000010,6.9,7257


In [10]:
akas.head(15)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
5,tt0000005,7,Blacksmithing,US,\N,\N,informal alternative title,0
6,tt0000006,3,Chinese Opium Den,US,\N,imdbDisplay,\N,0
7,tt0000007,1,Corbett and Courtney Before the Kinetograph,US,\N,imdbDisplay,\N,0
8,tt0000007,5,The Corbett-Courtney Fight,US,\N,alternative,\N,0
9,tt0000007,6,Jim Corbett vs. Peter Courtney,US,\N,alternative,\N,0


In [11]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10100795 entries, 0 to 10100794
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 693.6+ MB


In [12]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452562 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [13]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340671 entries, 0 to 1340670
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1340671 non-null  object 
 1   averageRating  1340671 non-null  float64
 2   numVotes       1340671 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.7+ MB


In [14]:
# Replace "\N" with np.nan
akas.replace("\\N", "np.nan", inplace=True)
basics.replace("\\N", "np.nan", inplace=True)
ratings.replace("\\N", "np.nan", inplace=True)

In [15]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers



0            True
1            True
2           False
3           False
4            True
            ...  
10100790    False
10100791    False
10100792    False
10100793    False
10100794    False
Name: tconst, Length: 10100795, dtype: bool

In [16]:
# Filter the basics DataFrame to keep only titles also listed in the US filtered akas
basics = basics[keepers]

In [17]:
# Drop NAs in runtimeMinutes, genres, startYear columns
basics = basics.dropna(subset=['runtimeMinutes', 'genres', 'startYear'])

In [18]:
#Keep only movies where titleType==Movie
basics = basics.loc[basics['titleType'] == 'movie']

In [19]:
# Replace startYear '\\N' with NaN
basics['startYear'] = pd.to_numeric(basics['startYear'], errors='coerce')


In [20]:
#Convert the startYear column to float data type
basics['startYear'] = basics['startYear'].astype(float)

In [21]:
#Filter the dataframe using startYear. Keep years between 2000-2021
basics = basics[basics["startYear"]>= 2000]

In [22]:
#Filter the dataframe using startYear. Keep years between 2000-2021
basics = basics[basics["startYear"]<= 2021]

In [23]:
#Eliminate movies that include "Documentary" in the genre
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]


In [24]:
# Include the US by using the filter akas dataframe
ratings_keepers =ratings['tconst'].isin(akas['titleId'])
ratings = ratings[ratings_keepers]

# View modified dataset info

In [25]:
# View modified basics dataset
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97499 entries, 34800 to 10100593
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          97499 non-null  object 
 1   titleType       97499 non-null  object 
 2   primaryTitle    97497 non-null  object 
 3   originalTitle   97497 non-null  object 
 4   isAdult         97499 non-null  object 
 5   startYear       97499 non-null  float64
 6   endYear         97499 non-null  object 
 7   runtimeMinutes  97499 non-null  object 
 8   genres          97499 non-null  object 
dtypes: float64(1), object(8)
memory usage: 7.4+ MB


In [26]:
# Calculate null value sums and row counts for original dataset
null_counts_basics2 = basics2.isna().sum()
row_count_basics2 = len(basics2)

# Calculate null value sums and row counts for modified dataset
null_counts_basics = basics.isna().sum()
row_count_basics = len(basics)

# Calculate the differences
null_count_diff = null_counts_basics2 - null_counts_basics
row_count_diff = row_count_basics2 - row_count_basics

# Print out the differences
print("There were {} null values modified and {} rows eliminated from the original dataset.".format(
    null_count_diff.sum(), row_count_diff))

There were 47 null values modified and 10003296 rows eliminated from the original dataset.


In [27]:
# View modified akas dataset
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452562 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [28]:
# Calculate null value sums and row counts for original dataset
null_counts_akas2 = akas2.isna().sum()
row_count_akas2 = len(akas2)

# Calculate null value sums and row counts for modified dataset
null_counts_akas = akas.isna().sum()
row_count_akas = len(akas)

# Calculate the differences
null_count_diff_akas = null_counts_akas - null_counts_akas2
row_count_diff_akas = row_count_akas2 - row_count_akas

# Print out the differences
print("There were {} null values modified and {} rows eliminated from the original dataset.".format(
    null_count_diff_akas.sum(), row_count_diff_akas))

There were 0 null values modified and 0 rows eliminated from the original dataset.


In [29]:
# View modified ratings dataset
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 505173 entries, 0 to 1340646
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         505173 non-null  object 
 1   averageRating  505173 non-null  float64
 2   numVotes       505173 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.4+ MB


In [30]:
# Calculate null value sums and row counts for original dataset
null_counts_ratings2 = ratings2.isna().sum()
row_count_ratings2 = len(ratings2)

# Calculate null value sums and row counts for modified dataset
null_counts_ratings = ratings.isna().sum()
row_count_ratings = len(ratings)

# Calculate the differences
null_count_diff_ratings = null_counts_ratings - null_counts_ratings2
row_count_diff_ratings = row_count_ratings2 - row_count_ratings

# Print out the differences
print("There were {} null values modified and {} rows eliminated from the original dataset.".format(
    null_count_diff_ratings.sum(), row_count_diff_ratings))


There were 0 null values modified and 835498 rows eliminated from the original dataset.


# Save each dataset

In [31]:
# Define the path to the "Data/" folder inside your repository
data_folder = "Data/"

# Save each DataFrame to a compressed CSV file
basics.to_csv(data_folder + "basics_compressed.csv.gz", compression="gzip", index=False)
akas.to_csv(data_folder + "akas_compressed.csv.gz", compression="gzip", index=False)
ratings.to_csv(data_folder + "ratings_compressed.csv.gz", compression="gzip", index=False)

print("DataFrames saved to compressed CSV files.")

DataFrames saved to compressed CSV files.
