In [45]:
import pandas as pd

In [46]:
# Import data and check the dataset structure
df = pd.read_csv('netflix_titles.csv')

### **Data Cleaning**

**Treat the Nulls - Null Value Assessment**

In [47]:
# Check for missing values
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

**Drop rows with empty values**

In [48]:
# Delete the rows where are empty values
df.dropna(subset=['director', 'cast', 'country', 'date_added', 'rating', 'duration'], inplace=True)

**Split the columns**

In [49]:
# Separate the cast column into actor_1, actor_2, and actor_3
df['actor_1'] = df['cast'].str.split(', ').str[0]
df['actor_2'] = df['cast'].str.split(', ').str[1]
df['actor_3'] = df['cast'].str.split(', ').str[2]

# Separate the date_added column into month_added and year_added
df['month_added']=df['date_added'].str.split(' ').str[0]
df['year_added']=df['date_added'].str.split(', ').str[1]

**Populate the missing rows**

In [50]:
# Replace Null values with the word 'Empty'
df.fillna('Empty', inplace=True)

**Drop unneeded columns - Column Pruning**

In [51]:
# Drop columns that are unneeded or do not provide relevant information
df.drop(columns=['show_id', 'cast', 'date_added', 'description'], inplace=True)

### **Issues to Address**

**Data Type Conversion**

In [52]:
# Check the data types of the columns. All of them are strings except for release_year
df.dtypes

# Convert the column to str.
# df['release_year'] = df['release_year'].astype(str)

type            object
title           object
director        object
country         object
release_year     int64
rating          object
duration        object
listed_in       object
actor_1         object
actor_2         object
actor_3         object
month_added     object
year_added      object
dtype: object

**Data Transformation**

In [53]:
# Separate the listed_in column into the first listed and added to a new colummn
df['first_listed_in'] = df['listed_in'].str.split(', ').str[0]
df.drop(columns=['listed_in'], inplace=True)

**Data Export**

In [54]:
df['country'] = df['country'].str.split(', ').str[0]
df['release_year'] = df['release_year'].astype(str)
df['year_added'] = df['year_added'].astype(str)

df.dtypes

type               object
title              object
director           object
country            object
release_year       object
rating             object
duration           object
actor_1            object
actor_2            object
actor_3            object
month_added        object
year_added         object
first_listed_in    object
dtype: object

In [55]:
df.head(5)

Unnamed: 0,type,title,director,country,release_year,rating,duration,actor_1,actor_2,actor_3,month_added,year_added,first_listed_in
7,Movie,Sankofa,Haile Gerima,United States,1993,TV-MA,125 min,Kofi Ghanaba,Oyafunmike Ogunlano,Alexandra Duah,September,2021,Dramas
8,TV Show,The Great British Baking Show,Andy Devonshire,United Kingdom,2021,TV-14,9 Seasons,Mel Giedroyc,Sue Perkins,Mary Berry,September,2021,British TV Shows
9,Movie,The Starling,Theodore Melfi,United States,2021,PG-13,104 min,Melissa McCarthy,Chris O'Dowd,Kevin Kline,September,2021,Comedies
12,Movie,Je Suis Karl,Christian Schwochow,Germany,2021,TV-MA,127 min,Luna Wedler,Jannis Niewöhner,Milan Peschel,September,2021,Dramas
24,Movie,Jeans,S. Shankar,India,1998,TV-14,166 min,Prashanth,Aishwarya Rai Bachchan,Sri Lakshmi,September,2021,Comedies


In [56]:
df_titles = pd.DataFrame()
df_titles['type'] = df['type'].unique()
df_titles['count'] = df['type'].value_counts().values
df_titles

Unnamed: 0,type,count
0,Movie,5185
1,TV Show,147


In [57]:
df_countries = pd.DataFrame()
df_countries['country'] = df['country'].unique()
df_countries['counter'] = df['country'].value_counts().values
df_countries

Unnamed: 0,country,counter
0,United States,2130
1,United Kingdom,910
2,Germany,346
3,India,166
4,China,145
...,...,...
74,Slovenia,1
75,Guatemala,1
76,Jamaica,1
77,Somalia,1


In [58]:
df.to_csv('netflix_titles_cleaned.csv', index=False)
df_titles.to_csv('netflix_titles_type.csv', index=False)
df_countries.to_csv('netflix_titles_country.csv', index=False)