In [1]:
import pandas as pd

In [2]:
#Loading the csv file
df1 = pd.read_csv(r"C:\Users\govin\Downloads\Elevate Labs\Day 1\netflix_titles.csv")
df1

#Made a copy so as to not make any changes in the original file
df=df1.copy()

In [3]:
#Shows the first few rows and also gives the summary of the dataset
print(df.head())
print(df.info())

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [4]:
#Checks for any null values
print(df.isnull().sum())

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [5]:
#Drops those null values
df = df.dropna(subset=['country','cast','rating','duration','date_added'])
print(df.isnull().sum())

show_id            0
type               0
title              0
director        1958
cast               0
country            0
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64


In [6]:
#Fills the empty cells in the director column with Unknown
df['director'] = df['director'].fillna('Unknown')

In [7]:
#Confirms for any further null values
print(df.isnull().sum())

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


In [8]:
#Checks for any duplicate rows
print("Duplicate rows:", df.duplicated().sum())

Duplicate rows: 0


In [9]:
#Takes the country count by converting comma-separated country names into lists and then removing the spaces from before and after the country names
#and finally taking the count of each country
df['country'] = df['country'].str.split(',')
df = df.explode('country') 
df['country'] = df['country'].str.strip()
print(df['country'].value_counts())

country
United States     3274
India             1007
United Kingdom     708
Canada             414
France             361
                  ... 
Albania              1
Georgia              1
Slovakia             1
Bermuda              1
Montenegro           1
Name: count, Length: 116, dtype: int64


In [10]:
#Standarised values in the column named country and converted to upper case
text_cols = ['country']
for col in text_cols:
    df[col] = df[col].astype(str).str.strip().str.upper()

In [11]:
#Converting to datetime datatype and converting to date/month/year
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
df['date_added'] = df['date_added'].dt.strftime('%d %B, %Y')

In [12]:
#Checking the datatypes of each column
df['release_year'] = df['release_year'].astype(int)
print(df.dtypes)

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int32
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [13]:
#Renaming column headers to be clean
df.columns = [col.replace('_', ' ').title() for col in df.columns]
df

Unnamed: 0,Show Id,Type,Title,Director,Cast,Country,Date Added,Release Year,Rating,Duration,Listed In,Description
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",SOUTH AFRICA,"24 September, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",INDIA,"24 September, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...",UNITED STATES,"24 September, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...",GHANA,"24 September, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...",BURKINA FASO,"24 September, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
...,...,...,...,...,...,...,...,...,...,...,...,...
8801,s8802,Movie,Zinzana,Majid Al Ansari,"Ali Suliman, Saleh Bakri, Yasa, Ali Al-Jabri, ...",JORDAN,"09 March, 2016",2015,TV-MA,96 min,"Dramas, International Movies, Thrillers",Recovering alcoholic Talal wakes up inside a s...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",UNITED STATES,"20 November, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",UNITED STATES,"01 November, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",UNITED STATES,"11 January, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [14]:
#Grouping values by movie and tv show(if needed)
df = df.sort_values(by='Type')

In [15]:
#Save to a new csv file 
df.to_csv("netflix_titles_cleaned.csv", index=False)