# DATA CLEANING

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
# Load the raw IMDB top-1000 export into the working DataFrame.
raw_csv_path = './1_data/imdb_top_1000.csv'
df = pd.read_csv(raw_csv_path)

In [4]:
# Peek at a single record to confirm the file parsed into the expected columns.
df.head(n=1)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469


In [5]:
# Print the frame summary (column names, non-null counts, dtypes) to see which
# columns have missing values and which numeric-looking columns are stored as text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


In [12]:
# Discard the columns that are not needed for the analysis (poster URL and plot overview).
df = df.drop(columns=['Poster_Link', 'Overview'])

In [13]:
# Normalise every column name to lowercase so later cells can use one naming style.
df.columns = df.columns.map(str.lower)

In [16]:
# Strip the ' min' unit from 'runtime', leaving only the digits (still text at this point).
runtime_text = df['runtime']
df['runtime'] = runtime_text.str.replace(' min', '', regex=False)

In [18]:
# Strip the thousands separators from 'gross', leaving only the digits (still text at this point).
gross_text = df['gross']
df['gross'] = gross_text.str.replace(',', '', regex=False)

In [26]:
# Converting data types
# These three columns are still stored as text here. Parse them with
# pd.to_numeric(errors='coerce') so that any entry that is not a number becomes
# <NA> instead of raising, then store them as nullable 'Int64' so the integer
# dtype survives the missing values. Assigning column by column guarantees each
# column actually takes the new dtype.
for column in ['released_year', 'runtime', 'gross']:
    df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')

In [28]:
# Tally the missing entries per column; the heading is printed first and the
# resulting Series is shown as the cell output.
print('Total number of missing values by column:')
null_counts = df.isna().sum()
null_counts

Total number of missing values by column:


series_title       0
released_year      1
certificate      101
runtime            0
genre              0
imdb_rating        0
meta_score       157
director           0
star1              0
star2              0
star3              0
star4              0
no_of_votes        0
gross            169
dtype: int64

In [29]:
# Flag rows that are exact repeats of an earlier row, then count them.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [31]:
# Use the film title as the row index.
# NOTE(review): titles are not checked for uniqueness here — confirm before relying on label lookups.
df = df.set_index('series_title')

In [32]:
# Display the frame after the cleaning steps as a final visual check.
df

Unnamed: 0_level_0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross
series_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Sleuth,1972,PG,138,"Mystery, Thriller",8.0,,Joseph L. Mankiewicz,Laurence Olivier,Michael Caine,Alec Cawthorne,John Matthews,44748,4081254
Nuovo Cinema Paradiso,1988,U,155,"Drama, Romance",8.5,80.0,Giuseppe Tornatore,Philippe Noiret,Enzo Cannavale,Antonella Attili,Isa Danieli,230763,11990401
Happiness,1998,,134,"Comedy, Drama",7.7,81.0,Todd Solondz,Jane Adams,Jon Lovitz,Philip Seymour Hoffman,Dylan Baker,66408,2807390
Shadow of a Doubt,1943,PG,108,"Film-Noir, Thriller",7.8,94.0,Alfred Hitchcock,Teresa Wright,Joseph Cotten,Macdonald Carey,Henry Travers,59556,
"Planes, Trains & Automobiles",1987,U,93,"Comedy, Drama",7.6,72.0,John Hughes,Steve Martin,John Candy,Laila Robins,Michael McKean,124773,49530280
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Un long dimanche de fiançailles,2004,U,133,"Drama, Mystery, Romance",7.6,76.0,Jean-Pierre Jeunet,Audrey Tautou,Gaspard Ulliel,Jodie Foster,Dominique Pinon,70925,6167817
Groundhog Day,1993,U,101,"Comedy, Fantasy, Romance",8.0,72.0,Harold Ramis,Bill Murray,Andie MacDowell,Chris Elliott,Stephen Tobolowsky,577991,70906973
The Man Who Would Be King,1975,PG,129,"Adventure, History, War",7.8,91.0,John Huston,Sean Connery,Michael Caine,Christopher Plummer,Saeed Jaffrey,44917,
Short Cuts,1993,R,188,"Comedy, Drama",7.7,79.0,Robert Altman,Andie MacDowell,Julianne Moore,Tim Robbins,Bruce Davison,42275,6110979


In [33]:
# Write the cleaned frame (title index included) next to the raw file.
clean_csv_path = './1_data/imdb_top_1000_clean.csv'
df.to_csv(clean_csv_path)