In [180]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
plt.style.use('ggplot')
from datetime import datetime

In [181]:
# Read the Netflix titles CSV (expected next to the notebook) into a DataFrame.
netflix_df = pd.read_csv('netflix_titles.csv')
# Bare expression as the last line so the notebook renders the frame.
netflix_df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [182]:
# Print each column's dtype and non-null count to see which columns have gaps.
netflix_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [183]:
# Function to compute the percentage of missing values in each column

def missing_rows_percentage(df):
    """Return the percentage of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column label; each value is the share of null entries in
        that column on a 0-100 scale.
    """
    # Work on the argument, not on a notebook-level variable, so the helper
    # gives the right answer for whichever frame is passed in.
    # The mean of a boolean mask is the fraction of True (i.e. missing) cells.
    missing_rows = df.isnull().mean() * 100
    return missing_rows
# Show the per-column missing-value percentages for the loaded frame.
missing_rows_percentage(netflix_df)

show_id          0.000000
type             0.000000
title            0.000000
director        29.908028
cast             9.367549
country          9.435676
date_added       0.113546
release_year     0.000000
rating           0.045418
duration         0.034064
listed_in        0.000000
description      0.000000
dtype: float64

In [184]:
# Rename three columns to more descriptive labels.
netflix_df = netflix_df.rename(
    columns={
        'date_added': 'date_added_netflix',
        'release_year': 'actual_release_year',
        'listed_in': 'genres',
    }
)

In [185]:
# Inspect the current column labels.
netflix_df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country',
       'date_added_netflix', 'actual_release_year', 'rating', 'duration',
       'genres', 'description'],
      dtype='object')

In [186]:
# Convert date_added_netflix from strings to datetime values.
# The dates displayed for this dataset are written like "September 25, 2021",
# so the format is given explicitly instead of being inferred: recent pandas
# infers the format from the first value and applies it strictly to the rest.
# NOTE(review): .str.strip() guards against entries padded with whitespace;
# the explicit format assumes every non-null entry is "Month D, YYYY" -
# confirm against the raw file.
netflix_df['date_added_netflix'] = pd.to_datetime(
    netflix_df['date_added_netflix'].str.strip(),
    format='%B %d, %Y',
)
netflix_df['date_added_netflix'].dtype

dtype('<M8[ns]')

In [187]:
# Split the Netflix added-on date into separate year, month and day columns,
# then drop the original datetime column.
# The nullable 'Int64' dtype keeps the parts as integers even where the date is missing.
netflix_df = (
    netflix_df
    .assign(
        netflix_year=lambda frame: frame['date_added_netflix'].dt.year.astype('Int64'),
        netflix_month=lambda frame: frame['date_added_netflix'].dt.month.astype('Int64'),
        netflix_day=lambda frame: frame['date_added_netflix'].dt.day.astype('Int64'),
    )
    .drop(columns='date_added_netflix')
)

In [188]:
# Inspect the current column labels.
netflix_df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country',
       'actual_release_year', 'rating', 'duration', 'genres', 'description',
       'netflix_year', 'netflix_month', 'netflix_day'],
      dtype='object')

In [189]:
# Add a column with the number of years between a title's original release
# and the year it was added to Netflix.
netflix_df = netflix_df.assign(
    differenceinyears=lambda frame: frame['netflix_year'] - frame['actual_release_year']
)
netflix_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,actual_release_year,rating,duration,genres,description,netflix_year,netflix_month,netflix_day,differenceinyears
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021,9,25,1
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021,9,24,0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021,9,24,0
3,s4,TV Show,Jailbirds New Orleans,,,,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021,9,24,0
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021,9,24,0


In [190]:
# Split the duration text at its first space into a count and a unit
# ("90 min" -> "90" / "min", "2 Seasons" -> "2" / "Seasons"), then drop the
# original column. The count is left as a string here, not converted to a number.
duration_parts = netflix_df['duration'].str.split(' ', n=1, expand=True)
netflix_df = (
    netflix_df
    .assign(
        totalduration=duration_parts[0],
        mins_or_seasons=duration_parts[1],
    )
    .drop(columns='duration')
)
netflix_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,actual_release_year,rating,genres,description,netflix_year,netflix_month,netflix_day,differenceinyears,totalduration,mins_or_seasons
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2020,PG-13,Documentaries,"As her father nears the end of his life, filmm...",2021,9,25,1,90,min
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021,9,24,0,2,Seasons
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021,9,24,0,1,Season
3,s4,TV Show,Jailbirds New Orleans,,,,2021,TV-MA,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021,9,24,0,1,Season
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021,TV-MA,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021,9,24,0,2,Seasons


In [None]:
# TODO (next session):
# - fill the missing values in the director and country columns
# - drop the rows where a duration in minutes appears in the rating column
# - split the genres and country columns, working on a copy of the original data frame
# - check whether the title column contains any duplicates
