# Requirements

1. Problem Statement
2. DataSet
3. Sources & Access of the Data
4. Understand the Data 

# STEP_01 :- Data Gathering and Reading

In [1]:
 # import libraries
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("netflix_titles.csv")

In [3]:
# REMEMBER : Do not print the entire data because it is a time taking process
df.sample(5) # You can view a random 5 rows in the dataset

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
6394,s6395,Movie,Bushwick,"Cary Murnion, Jonathan Milott","Dave Bautista, Brittany Snow, Angelic Zambrana...",United States,"November 24, 2017",2017,TV-MA,94 min,"Action & Adventure, Independent Movies",When their diverse neighborhood is besieged by...
5195,s5196,TV Show,Together,,"Jeanette Aw, Dai Xiangyu, Elvin Ng, Eelyn Kok,...",Singapore,"November 1, 2017",2009,TV-14,1 Season,"International TV Shows, TV Dramas","Spanning 30 years, a group of dedicated and di..."
682,s683,TV Show,Why Are You Like This,,"Naomi Higgins, Olivia Junkeer, Wil King",Australia,"June 19, 2021",2021,TV-MA,1 Season,"International TV Shows, TV Comedies","Three best friends negotiate work, fun, identi..."
1177,s1178,Movie,Hospital,Brody Chu,"Lin Po-Hung, Tai Bo, Jacqueline Zhu, Lei Hong,...",Taiwan,"March 20, 2021",2020,TV-MA,90 min,"Horror Movies, International Movies","In an abandoned hospital in Tainan, visitors s..."
2852,s2853,Movie,W.E.,Madonna,"Abbie Cornish, Andrea Riseborough, James D'Arc...",United Kingdom,"March 3, 2020",2011,R,119 min,"Dramas, Romantic Movies",This glossy ensemble drama juxtaposes the live...


In [4]:
df.drop(["show_id","description"] , axis=1,inplace=True)

In [5]:
df.sample(5)

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in
593,Movie,Snow Day,Chris Koch,"Chris Elliott, Mark Webber, Jean Smart, Schuyl...",United States,"July 1, 2021",2000,PG,89 min,"Children & Family Movies, Comedies"
8462,Movie,The Power of Grayskull: The Definitive History...,"Randall Lobb, Robert McCallum",,"Canada, United States","August 24, 2018",2017,TV-14,96 min,Documentaries
3478,Movie,Mo Gilligan: Momentum,Chris Howe,Mo Gilligan,United Kingdom,"September 30, 2019",2019,TV-MA,64 min,Stand-Up Comedy
861,Movie,Little Singham Future mein Satakli,Prakash Satam,"Anamaya Verma, Ganesh Divekar, Jigna Bharadhwa...",,"May 20, 2021",2021,TV-Y7,47 min,"Children & Family Movies, Comedies"
3332,Movie,Christmas Survival,James Dearden,"Julian Ovenden, Gemma Whelan, Joely Richardson...",United Kingdom,"November 1, 2019",2018,TV-MA,101 min,Comedies


In [6]:
# To know the no.of rows and no.of columns
df.shape

(8807, 10)

In [7]:
# To know the no.of cells present in the  
df.size

88070

In [8]:
# To know the index values of the data where it starts and where it ends
df.index

RangeIndex(start=0, stop=8807, step=1)

In [9]:
# To know the names of the columns present in the dataset
df.columns

Index(['type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in'],
      dtype='object')

# STEP_2  :-  Data Cleaning

In [10]:
# if you do data analysis on the non clean data then your analysis might go wrong
# Maintaining the data quality is the very important metrics

df.info() # it tells us how many non-null values are there

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8807 non-null   object
 1   title         8807 non-null   object
 2   director      6173 non-null   object
 3   cast          7982 non-null   object
 4   country       7976 non-null   object
 5   date_added    8797 non-null   object
 6   release_year  8807 non-null   int64 
 7   rating        8803 non-null   object
 8   duration      8804 non-null   object
 9   listed_in     8807 non-null   object
dtypes: int64(1), object(9)
memory usage: 688.2+ KB


In [11]:
# To know how many null values are there in each column
df.isna().sum()

type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
dtype: int64

In [12]:
# To know most occuring value in the data set in a particular column
# it is helpful to replace the null values
df["country"].value_counts()

United States                             2818
India                                      972
United Kingdom                             419
Japan                                      245
South Korea                                199
                                          ... 
Romania, Bulgaria, Hungary                   1
Uruguay, Guatemala                           1
France, Senegal, Belgium                     1
Mexico, United States, Spain, Colombia       1
United Arab Emirates, Jordan                 1
Name: country, Length: 748, dtype: int64

In [13]:
df["country"].value_counts().head(3)

United States     2818
India              972
United Kingdom     419
Name: country, dtype: int64

In [14]:
mode_country = df["country"].mode()[0]
mode_country
df["country"]=df["country"].fillna(mode_country)

In [15]:
df["country"].isna().sum() # Always check weather the column has replaced the null values or not

0

In [16]:
df["date_added"].value_counts()

January 1, 2020      109
November 1, 2019      89
March 1, 2018         75
December 31, 2019     74
October 1, 2018       71
                    ... 
December 4, 2016       1
November 21, 2016      1
November 19, 2016      1
November 17, 2016      1
January 11, 2020       1
Name: date_added, Length: 1767, dtype: int64

In [17]:
df["date_added"].value_counts().head(5)

January 1, 2020      109
November 1, 2019      89
March 1, 2018         75
December 31, 2019     74
October 1, 2018       71
Name: date_added, dtype: int64

In [18]:
mode_date = df["date_added"].mode()[0]
df["date_added"] = df["date_added"].fillna(mode_date)
df["date_added"].isna().sum() # Always check weather the column has replaced the null values or not

0

In [19]:
df.isna().sum()

type               0
title              0
director        2634
cast             825
country            0
date_added         0
release_year       0
rating             4
duration           3
listed_in          0
dtype: int64

In [20]:
mode_rating = df["rating"].mode()[0]
df["rating"] = df["rating"].fillna(mode_rating)
df["rating"].isna().sum() # Always check weather the column has replaced the null values or not

0

In [21]:
mode_duration = df["duration"].mode()[0]
df["duration"] = df["duration"].fillna(mode_duration)
df["duration"].isna().sum() # Always check weather the column has replaced the null values or not

0

In [22]:
df.isna().sum()

type               0
title              0
director        2634
cast             825
country            0
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
dtype: int64

In [23]:
# here we left the two columns director and cast because we want replace them with any string
df["director"]=df["director"].fillna("Not Menctioned")
df["director"].isna().sum()
df["cast"]=df["cast"].fillna("Not Menctioned")
df["cast"].isna().sum()

0

In [24]:
df.isna().sum() # We are done with replacing all null values 
# null value eradication of the data cleaning part is done

type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
dtype: int64

In [25]:
df["rating"].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR',
       'TV-Y7-FV', 'UR'], dtype=object)

In [26]:
df = df[~df["rating"].str.contains("min")] # removes the values in coumn "rating" which contain "min" string key word

In [27]:
df["rating"].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', 'NR', 'TV-Y7-FV', 'UR'], dtype=object)

In [28]:
df["date_added"].unique()

array(['September 25, 2021', 'September 24, 2021', 'September 23, 2021',
       ..., 'December 6, 2018', 'March 9, 2016', 'January 11, 2020'],
      dtype=object)

In [29]:
df["date_added"] = df["date_added"].str.replace(",","") # replace comma with empty space

In [30]:
df["date_added"].unique()

array(['September 25 2021', 'September 24 2021', 'September 23 2021', ...,
       'December 6 2018', 'March 9 2016', 'January 11 2020'], dtype=object)

In [31]:
df["date_added"] = pd.to_datetime(df["date_added"])

In [32]:
df["date_added"].unique()

array(['2021-09-25T00:00:00.000000000', '2021-09-24T00:00:00.000000000',
       '2021-09-23T00:00:00.000000000', ...,
       '2018-12-06T00:00:00.000000000', '2016-03-09T00:00:00.000000000',
       '2020-01-11T00:00:00.000000000'], dtype='datetime64[ns]')

In [33]:
# Create new columns year , Month , Date from date_added
df["year"] = df["date_added"].dt.year
df["month"] = df["date_added"].dt.month
df["day"] = df["date_added"].dt.day

In [34]:
df.sample(5)

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,year,month,day
2065,Movie,Children of the Sea,Ayumu Watanabe,"Mana Ashida, Hiiro Ishibashi, Seishu Uragami, ...","Japan, United States",2020-09-01,2019,TV-PG,112 min,"Anime Features, International Movies",2020,9,1
7782,TV Show,Power Rangers Time Force,Not Menctioned,"Jason Faunt, Erin Cahill, Kevin Kleinberg, Deb...",United States,2016-01-01,2001,TV-Y7,1 Season,Kids' TV,2016,1,1
5905,Movie,Ralphie May: Imperfectly Yours,Anthony Pierce,Ralphie May,United States,2015-06-29,2015,TV-MA,69 min,Stand-Up Comedy,2015,6,29
2114,Movie,Little Singham: Kaal Ka Badla,Prakash Satam,"Anamaya Verma, Arushi Talwar, Ganesh Divekar, ...",United States,2020-08-19,2020,TV-Y7,70 min,"Children & Family Movies, Comedies",2020,8,19
4252,Movie,A Twelve Year Night,Álvaro Brechner,"Antonio de la Torre, Chino Darín, Alfonso Tort...","Uruguay, Argentina, Spain",2018-12-28,2018,TV-MA,122 min,"Dramas, International Movies",2018,12,28


In [35]:
df["genre"] = df["listed_in"].str.split(",").str[0] # extacts the genre from the existing list

In [36]:
df["genre"].value_counts()

Dramas                          1600
Comedies                        1210
Action & Adventure               859
Documentaries                    829
International TV Shows           774
Children & Family Movies         605
Crime TV Shows                   399
Kids' TV                         388
Stand-Up Comedy                  334
Horror Movies                    275
British TV Shows                 253
Docuseries                       221
Anime Series                     176
International Movies             128
TV Comedies                      120
Reality TV                       120
Classic Movies                    80
TV Dramas                         67
Thrillers                         65
Movies                            54
TV Action & Adventure             40
Stand-Up Comedy & Talk Shows      34
Romantic TV Shows                 32
Classic & Cult TV                 22
Anime Features                    21
Independent Movies                20
Music & Musicals                  18
T

In [37]:
# similarly in column cast lead actors names are sufficient 
df.sample(5)

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,year,month,day,genre
6119,Movie,All Light Will End,Chris Blake,"Ashley Pereira, Alexandra Harris, Ted Welch, S...",United States,2019-02-02,2018,TV-MA,84 min,"Horror Movies, Thrillers",2019,2,2,Horror Movies
8727,Movie,When Hari Got Married,"Ritu Sarin, Tenzing Sonam",Not Menctioned,"United Kingdom, India, United States",2016-12-25,2013,NR,75 min,"Documentaries, International Movies",2016,12,25,Documentaries
742,Movie,Camellia Sisters,"Bao Nhan, Namcito","Le Khanh, Kaity Nguyen, Hong Van, Khuong Le, H...",Vietnam,2021-06-10,2021,TV-MA,117 min,"Dramas, International Movies",2021,6,10,Dramas
1828,TV Show,Unsolved Mysteries,Not Menctioned,Not Menctioned,United States,2020-10-19,2020,TV-MA,2 Seasons,"Crime TV Shows, Docuseries",2020,10,19,Crime TV Shows
7475,Movie,Moh Maya Money,Munish Bhardwaj,"Ranvir Shorey, Neha Dhupia, Devendra Chowhan, ...",India,2017-05-01,2016,TV-MA,109 min,"Dramas, Independent Movies, International Movies",2017,5,1,Dramas


In [38]:
df= df[df["cast"]!="Not Mentioned"]

In [39]:
df["lead_actor"] = df["cast"].str.split(",").str[0]

In [40]:
df["lead_actor"].value_counts().head(5)

Not Menctioned      825
Shah Rukh Khan       26
Akshay Kumar         23
Adam Sandler         20
Amitabh Bachchan     20
Name: lead_actor, dtype: int64

In [41]:
df.sample(5)

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,year,month,day,genre,lead_actor
8015,TV Show,Sid the Science Kid,Not Menctioned,"Drew Massey, Julianne Buescher, Victor Yerrid,...",United States,2020-03-31,2008,TV-Y,1 Season,Kids' TV,2020,3,31,Kids' TV,Drew Massey
182,Movie,Welcome Home Roscoe Jenkins,Malcolm D. Lee,"Martin Lawrence, James Earl Jones, Joy Bryant,...",United States,2021-09-01,2008,PG-13,114 min,Comedies,2021,9,1,Comedies,Martin Lawrence
3078,Movie,Albert Pinto Ko Gussa Kyun Aata Hai?,Soumitra Ranade,"Nandita Das, Manav Kaul, Saurabh Shukla, Kisho...",India,2019-12-31,2019,TV-MA,85 min,"Dramas, Independent Movies, International Movies",2019,12,31,Dramas,Nandita Das
5382,TV Show,Diamond Lover,Not Menctioned,"Rain, Tiffany Tang, Luo Jin, Dilraba Dilmurat,...",China,2017-07-07,2015,TV-PG,1 Season,"International TV Shows, Romantic TV Shows, TV ...",2017,7,7,International TV Shows,Rain
2897,Movie,The Last Thing He Wanted,Dee Rees,"Anne Hathaway, Ben Affleck, Willem Dafoe, Toby...",United States,2020-02-21,2020,R,116 min,"Dramas, Thrillers",2020,2,21,Dramas,Anne Hathaway


In [42]:
# as we used the required information from columns cast & listed_in we remove them 
df.drop(["cast","listed_in","date_added"],axis=1,inplace=True)

In [43]:
df.sample(5)

Unnamed: 0,type,title,director,country,release_year,rating,duration,year,month,day,genre,lead_actor
5343,Movie,Icarus,Bryan Fogel,United States,2017,TV-MA,121 min,2017,8,4,Documentaries,Bryan Fogel
7074,Movie,Indiscretion,John Stewart Muller,United States,2016,TV-MA,99 min,2017,2,1,Dramas,Mira Sorvino
7856,TV Show,Republic of Doyle,Not Menctioned,Canada,2014,TV-14,6 Seasons,2020,5,28,Crime TV Shows,Allan Hawco
5757,Movie,13TH,Ava DuVernay,United States,2016,TV-MA,101 min,2016,10,7,Documentaries,Not Menctioned
2487,Movie,No Longer kids,Samir Al Asfory,Egypt,1979,TV-14,237 min,2020,5,21,Comedies,Said Saleh


In [44]:
# now cleaned_data is present in the system folder for further analytics

In [45]:
#df.to_csv("cleaned_data.csv") # we can not only read the csv files we can also write csv also

In [46]:
df.info

<bound method DataFrame.info of          type                  title         director        country  \
0       Movie   Dick Johnson Is Dead  Kirsten Johnson  United States   
1     TV Show          Blood & Water   Not Menctioned   South Africa   
2     TV Show              Ganglands  Julien Leclercq  United States   
3     TV Show  Jailbirds New Orleans   Not Menctioned  United States   
4     TV Show           Kota Factory   Not Menctioned          India   
...       ...                    ...              ...            ...   
8802    Movie                 Zodiac    David Fincher  United States   
8803  TV Show            Zombie Dumb   Not Menctioned  United States   
8804    Movie             Zombieland  Ruben Fleischer  United States   
8805    Movie                   Zoom     Peter Hewitt  United States   
8806    Movie                 Zubaan      Mozez Singh          India   

      release_year rating   duration  year  month  day  \
0             2020  PG-13     90 min  2021   

In [47]:
df.shape

(8804, 12)