In [None]:
# Week 2: Cleaning IMDb Dataset (Handling Missing Values, Outliers, and Duplicates)

# This notebook contains all Python-based cleaning steps derived from insights gathered during the OpenRefine exploration.
# Please review the OpenRefine documentation beforehand, as it thoroughly explains the visual findings that guide our cleaning process.
# OpenRefine was used to visually identify inconsistencies, formatting issues, and potential data discrepancies.
# Python is used here to carry out precise and reproducible data cleaning for the IMDb dataset.
# A separate notebook contains the TMDB data cleaning process, the two datasets were divided for clarity and ease of visibility.

In [20]:
#import functions

import requests
import pandas as pd
from datetime import datetime
import json
import os
import hashlib
from pathlib import Path
import numpy as np
import gzip
import io


In [21]:
#confirm raw dataset is read in properly

imdb = pd.read_csv('imdb_top_1000.csv')

In [22]:
imdb.head()

Unnamed: 0,title,director,release_year,runtime,genre,rating,metascore,gross
0,The Shawshank Redemption,Frank Darabont,(1994),142 min,Drama,9.3,82,$28.34M
1,The Godfather,Francis Ford Coppola,(1972),175 min,"Crime, Drama",9.2,100,$134.97M
2,The Dark Knight,Christopher Nolan,(2008),152 min,"Action, Crime, Drama",9.0,84,$534.86M
3,Schindler's List,Steven Spielberg,(1993),195 min,"Biography, Drama, History",9.0,95,$96.90M
4,12 Angry Men,Sidney Lumet,(1957),96 min,"Crime, Drama",9.0,97,$4.36M


In [24]:
#Now that the dataset is loaded in, I will proceed to clean the imdb data

# Check data types, missing values, and overall info
imdb.info()

# Look for duplicate rows (just to confirm none slipped in)
imdb.duplicated().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   director      1000 non-null   object 
 2   release_year  1000 non-null   object 
 3   runtime       1000 non-null   object 
 4   genre         1000 non-null   object 
 5   rating        1000 non-null   float64
 6   metascore     1000 non-null   int64  
 7   gross         1000 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


0

In [25]:
# Check for repeated titles to confirm what OpenRefine found
duplicate_titles = imdb[imdb.duplicated(subset='title', keep=False)].sort_values('title')
duplicate_titles

Unnamed: 0,title,director,release_year,runtime,genre,rating,metascore,gross
283,All Quiet on the Western Front,Lewis Milestone,(1930),152 min,"Drama, War",8.1,91,$3.27M
609,All Quiet on the Western Front,Edward Berger,(2022),148 min,"Action, Drama, War",7.8,76,0
377,Beauty and the Beast,Gary Trousdale,(1991),84 min,"Animation, Family, Fantasy",8.0,95,$218.97M
591,Beauty and the Beast,Jean Cocteau,(1946),93 min,"Drama, Fantasy, Romance",7.9,92,$0.30M
138,Drishyam,Jeethu Joseph,(2013),160 min,"Crime, Drama, Thriller",8.3,0,0
188,Drishyam,Nishikant Kamat,(2015),163 min,"Crime, Drama, Mystery",8.2,0,$0.74M
89,Drishyam 2,Jeethu Joseph,(2021),152 min,"Crime, Drama, Thriller",8.4,0,0
178,Drishyam 2,Abhishek Pathak,(2022),140 min,"Crime, Drama, Mystery",8.2,0,0
94,Scarface,Brian De Palma,(1983),170 min,"Crime, Drama",8.3,65,$45.60M
898,Scarface,Howard Hawks,(1932),93 min,"Action, Crime, Drama",7.7,90,0


In [26]:
# Strip whitespace from director names to avoid any hidden inconsistencies
imdb['director'] = imdb['director'].str.strip()

# Check unique director count and sample a few names to confirm theres no odd whitespaces
print("Unique directors:", imdb['director'].nunique())
imdb['director'].sample(5)

Unique directors: 560


451    Nuri Bilge Ceylan
30        Hayao Miyazaki
403          Woody Allen
487      Stephen Chbosky
684       Clint Eastwood
Name: director, dtype: object

In [27]:
# Remove parentheses and Roman numeral suffixes like (I), (II), (III)
imdb['release_year'] = imdb['release_year'].str.replace(r'\(|\)', '', regex=True)
imdb['release_year'] = imdb['release_year'].str.replace(r'(I+)$', '', regex=True)

# Strip any trailing whitespaces to confirm its read in correctly 
imdb['release_year'] = imdb['release_year'].str.strip()

# Convert to numeric (integer)
imdb['release_year'] = pd.to_numeric(imdb['release_year'], errors='coerce').astype('Int64')

# Mark empty or invalid release years as NaN
imdb['release_year'] = pd.to_numeric(imdb['release_year'], errors='coerce')

# Drop rows where release_year is missing
imdb = imdb.dropna(subset=['release_year'])

In [28]:
# Remove the "min" text, strip spaces, convert to integer
imdb['runtime_in_minutes'] = imdb['runtime'].str.replace('min', '', regex=False).str.strip()
imdb['runtime_in_minutes'] = pd.to_numeric(imdb['runtime_in_minutes'], errors='coerce').astype('Int64')

# Drop the old runtime column
imdb.drop(columns=['runtime'], inplace=True)

In [29]:
# Strip whitespace and check unique values to confirm everything is read in correctly 
imdb['genre'] = imdb['genre'].str.strip()
imdb['genre'].value_counts().head(10)

genre
Drama                           88
Drama, Romance                  37
Comedy, Drama                   34
Crime, Drama, Mystery           30
Comedy, Drama, Romance          30
Action, Crime, Drama            28
Animation, Adventure, Comedy    28
Biography, Drama, History       26
Crime, Drama, Thriller          25
Crime, Drama                    23
Name: count, dtype: int64

In [30]:
# Convert rating from string to float for ease of analysis
imdb['rating'] = pd.to_numeric(imdb['rating'], errors='coerce')

In [31]:
# Convert to numeric values to ensure it is ready for integration and analysis
imdb['metascore'] = pd.to_numeric(imdb['metascore'], errors='coerce')

# Replace 0 with NaN (missing data), it makes it easier to drop these values, as they are missing data points that could skew our data
imdb['metascore'].replace(0, np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb['metascore'].replace(0, np.nan, inplace=True)


In [32]:
# Remove "$" and "M", strip spaces, and convert the variables to numeric values 
imdb['gross_in_millions'] = imdb['gross'].str.replace('$', '', regex=False).str.replace('M', '', regex=False).str.strip()
imdb['gross_in_millions'] = pd.to_numeric(imdb['gross_in_millions'], errors='coerce')

# Replace 0s with NaN
imdb['gross_in_millions'].replace(0, np.nan, inplace=True)

# Drop old column name 
imdb.drop(columns=['gross'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb['gross_in_millions'].replace(0, np.nan, inplace=True)


In [34]:
# Drop rows where critical variables are missing to avoid bias/skews in data
imdb_clean = imdb.dropna(subset=['metascore', 'gross_in_millions'])

# Confirm cleaning results
imdb_clean.info()
imdb_clean.head()

<class 'pandas.core.frame.DataFrame'>
Index: 705 entries, 0 to 997
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               705 non-null    object 
 1   director            705 non-null    object 
 2   release_year        705 non-null    Int64  
 3   genre               705 non-null    object 
 4   rating              705 non-null    float64
 5   metascore           705 non-null    float64
 6   runtime_in_minutes  705 non-null    Int64  
 7   gross_in_millions   705 non-null    float64
dtypes: Int64(2), float64(3), object(3)
memory usage: 50.9+ KB


Unnamed: 0,title,director,release_year,genre,rating,metascore,runtime_in_minutes,gross_in_millions
0,The Shawshank Redemption,Frank Darabont,1994,Drama,9.3,82.0,142,28.34
1,The Godfather,Francis Ford Coppola,1972,"Crime, Drama",9.2,100.0,175,134.97
2,The Dark Knight,Christopher Nolan,2008,"Action, Crime, Drama",9.0,84.0,152,534.86
3,Schindler's List,Steven Spielberg,1993,"Biography, Drama, History",9.0,95.0,195,96.9
4,12 Angry Men,Sidney Lumet,1957,"Crime, Drama",9.0,97.0,96,4.36


In [35]:
#Make finalized clean data file and confirm everything is up to par
imdb_clean.to_csv('imdb_cleaned.csv', index=False)
imdb_clean

Unnamed: 0,title,director,release_year,genre,rating,metascore,runtime_in_minutes,gross_in_millions
0,The Shawshank Redemption,Frank Darabont,1994,Drama,9.3,82.0,142,28.34
1,The Godfather,Francis Ford Coppola,1972,"Crime, Drama",9.2,100.0,175,134.97
2,The Dark Knight,Christopher Nolan,2008,"Action, Crime, Drama",9.0,84.0,152,534.86
3,Schindler's List,Steven Spielberg,1993,"Biography, Drama, History",9.0,95.0,195,96.90
4,12 Angry Men,Sidney Lumet,1957,"Crime, Drama",9.0,97.0,96,4.36
...,...,...,...,...,...,...,...,...
993,The Taking of Pelham One Two Three,Joseph Sargent,1974,"Action, Crime, Thriller",7.6,68.0,104,2.49
994,Control,Anton Corbijn,2007,"Biography, Drama, Music",7.6,78.0,122,0.87
995,A Very Long Engagement,Jean-Pierre Jeunet,2004,"Drama, Mystery, Romance",7.6,76.0,133,6.17
996,Shine,Scott Hicks,1996,"Biography, Drama, Music",7.6,87.0,105,35.81


In [36]:
#Summary of Python Cleaning
#This notebook implements all data transformations derived from OpenRefine insights:
#Removed formatting inconsistencies from `release_year` and `runtime`
#Verified uniqueness of titles and consistency of director names
#Converted textual numeric columns (`rating`, `metascore`, `gross`) into numeric types for easier analysis
#Replaced zeros with NA for missing numeric data
#Dropped incomplete rows to ensure data accuracy and interpretability
#Analyzed dataset to ensure the objectives above were fulfilled, and to confrim the data is complete and accurate

#By using both OpenRefine and Python, we wereable to provide a transparent, systematic approach.
#OpenRefine was used for rapid exploration and issue detection  
#Python was used for reproducible, programmatic data cleaning 