In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [263]:
t = pd.read_csv('processed_netflix.csv', parse_dates=[6])

In [271]:
t['country'].value_counts() == ''

United States                             False
India                                     False
United Kingdom                            False
Japan                                     False
South Korea                               False
                                          ...  
Romania, Bulgaria, Hungary                False
Uruguay, Guatemala                        False
France, Senegal, Belgium                  False
Mexico, United States, Spain, Colombia    False
United Arab Emirates, Jordan              False
Name: country, Length: 748, dtype: bool

In [27]:
df = pd.read_csv('processed_netflix.csv', parse_dates=[6])

In [28]:
df.head()

Unnamed: 0,#,show_id,type,title,country,release_year,date_added,director,cast,rating,listed_in,description,duration,duration_hours
0,1,s1,Movie,Dick Johnson Is Dead,United States,2020,2021-09-25,Kirsten Johnson,,PG-13,Documentaries,"As her father nears the end of his life, filmm...",90 min,1 hr 30 min
1,2,s2,TV Show,Blood & Water,South Africa,2021,2021-09-24,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2 Seasons,2 Seasons
2,3,s3,TV Show,Ganglands,,2021,2021-09-24,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,1 Season,1 Season
3,4,s4,TV Show,Jailbirds New Orleans,,2021,2021-09-24,,,TV-MA,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",1 Season,1 Season
4,5,s5,TV Show,Kota Factory,India,2021,2021-09-24,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",TV-MA,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2 Seasons,2 Seasons


In [29]:
# reset the index to column named '#'
df.set_index('#', inplace= True)

In [30]:
# rename index
df.index.name = 'id'

In [31]:
# confirm changes
df.head(1)

Unnamed: 0_level_0,show_id,type,title,country,release_year,date_added,director,cast,rating,listed_in,description,duration,duration_hours
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,s1,Movie,Dick Johnson Is Dead,United States,2020,2021-09-25,Kirsten Johnson,,PG-13,Documentaries,"As her father nears the end of his life, filmm...",90 min,1 hr 30 min


In [32]:
# save the column names for future use.
cols = [i for i in df.columns]
print(cols)

['show_id', 'type', 'title', 'country', 'release_year', 'date_added', 'director', 'cast', 'rating', 'listed_in', 'description', 'duration', 'duration_hours']


In [36]:
#  get the count of rows and columns
df.shape

(8805, 13)

In [33]:
# get the info 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8805 entries, 1 to 8805
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   show_id         8805 non-null   object        
 1   type            8805 non-null   object        
 2   title           8805 non-null   object        
 3   country         7974 non-null   object        
 4   release_year    8805 non-null   int64         
 5   date_added      8795 non-null   datetime64[ns]
 6   director        6172 non-null   object        
 7   cast            7980 non-null   object        
 8   rating          8801 non-null   object        
 9   listed_in       8805 non-null   object        
 10  description     8805 non-null   object        
 11  duration        8802 non-null   object        
 12  duration_hours  8802 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(11)
memory usage: 963.0+ KB


## There are 8805 rows, 13 columns, all except release_year is of type object.

## release_year is of type datetime. 
## Since released_year is just the year, int is fine and it doesnt have to be datetime.

# BASIC ANALYSIS :

## Question : What are the different types of genres available in the dataset?

In [78]:
# there is a white space that needs to removed
df['listed_in'][2].split(',')

['International TV Shows', ' TV Dramas', ' TV Mysteries']

In [80]:
# lstrip is used to remove the whitespace from the left side of a string
[i.lstrip() for i in (df['listed_in'][2]).split(',')]

['International TV Shows', 'TV Dramas', 'TV Mysteries']

In [133]:
# store the splited values into a variable
listed_in = df['listed_in'].apply(lambda x:x.split(','))
listed_in[:5]

id
1                                      [Documentaries]
2    [International TV Shows,  TV Dramas,  TV Myste...
3    [Crime TV Shows,  International TV Shows,  TV ...
4                            [Docuseries,  Reality TV]
5    [International TV Shows,  Romantic TV Shows,  ...
Name: listed_in, dtype: object

In [132]:
# create a function that removes white-space of all strings in a list
def remove_white(lst):
    if np.isnan(lst):
        return lst
    new_lst = []
    for i in lst:
        i = i.strip()
        new_lst.append(i)
    return new_lst

In [134]:
# apply the function mentioned above
listed_in = listed_in.apply(lambda x:white_space(x))
listed_in[:5]

id
1                                      [Documentaries]
2    [International TV Shows, TV Dramas, TV Mysteries]
3    [Crime TV Shows, International TV Shows, TV Ac...
4                             [Docuseries, Reality TV]
5    [International TV Shows, Romantic TV Shows, TV...
Name: listed_in, dtype: object

In [135]:
unique_listed_in = set()

In [136]:
def push_to_set(lst):
    for i in lst:
        unique_listed_in.add(i)

In [137]:
listed_in.apply(lambda x:push_to_set(x))

id
1       None
2       None
3       None
4       None
5       None
        ... 
8801    None
8802    None
8803    None
8804    None
8805    None
Name: listed_in, Length: 8805, dtype: object

In [138]:
# these are all the unique genres of shows/movies present in the dataset.
print(unique_listed_in)

{'TV Mysteries', 'Movies', 'Anime Series', 'Sci-Fi & Fantasy', 'TV Action & Adventure', 'Comedies', 'Faith & Spirituality', 'Spanish-Language TV Shows', 'Korean TV Shows', "Kids' TV", 'TV Shows', 'British TV Shows', 'Docuseries', 'Crime TV Shows', 'TV Sci-Fi & Fantasy', 'Cult Movies', 'Independent Movies', 'TV Thrillers', 'Teen TV Shows', 'International Movies', 'Romantic Movies', 'Music & Musicals', 'Thrillers', 'International TV Shows', 'Horror Movies', 'Anime Features', 'TV Dramas', 'LGBTQ Movies', 'Children & Family Movies', 'Stand-Up Comedy & Talk Shows', 'TV Comedies', 'Documentaries', 'Science & Nature TV', 'Stand-Up Comedy', 'Classic & Cult TV', 'Action & Adventure', 'TV Horror', 'Sports Movies', 'Classic Movies', 'Reality TV', 'Dramas', 'Romantic TV Shows'}


### NOTE :

In [104]:
# we could get the list of strings stripped of white-space
# just use ', ' with a space after , when you call split
df['listed_in'].apply(lambda x:x.split(', '))[2]

['International TV Shows', 'TV Dramas', 'TV Mysteries']

## Question : What are the different types available and how many of them ?

In [105]:
df['type'].value_counts()

Movie      6129
TV Show    2676
Name: type, dtype: int64

### There are 2 types: 6129 movies and 2676 TV Show's. 

## Question : Which countries have the highest number of shows/movies on Netflix?

In [116]:
# this is not the correct amount
df.groupby('country').count()['show_id'].sort_values(ascending=False).head(5)

country
United States     2818
India              972
United Kingdom     418
Japan              245
South Korea        199
Name: show_id, dtype: int64

In [120]:
# there are some rows with multiple countries
df['country'].value_counts()

United States                             2818
India                                      972
United Kingdom                             418
Japan                                      245
South Korea                                199
                                          ... 
Romania, Bulgaria, Hungary                   1
Uruguay, Guatemala                           1
France, Senegal, Belgium                     1
Mexico, United States, Spain, Colombia       1
United Arab Emirates, Jordan                 1
Name: country, Length: 748, dtype: int64

In [168]:
df['country'].isna().value_counts()

False    7974
True      831
Name: country, dtype: int64

In [170]:
# first lets remove the nans for ease of analysis
df['country'].fillna('not available', inplace=True)

In [202]:
# confirm no more null values
df['country'].isna().sum()

0

In [190]:
df['country'].replace('','not available', inplace=True)

In [200]:
(df['country'] == '').sum()

0

In [191]:
df['country'].isna().value_counts()

False    8805
Name: country, dtype: int64

In [192]:
countries = df['country'].apply(lambda x:x.split(', '))

In [193]:
unique_countries = set()

In [194]:
def push_to_set_2(lst):
    for i in lst:
        unique_countries.add(i)

In [195]:
countries.apply(lambda x:push_to_set_2(x))

id
1       None
2       None
3       None
4       None
5       None
        ... 
8801    None
8802    None
8803    None
8804    None
8805    None
Name: country, Length: 8805, dtype: object

In [196]:
unique_countries

{'',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Bermuda',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Cambodia',
 'Cambodia,',
 'Cameroon',
 'Canada',
 'Cayman Islands',
 'Chile',
 'China',
 'Colombia',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Dominican Republic',
 'East Germany',
 'Ecuador',
 'Egypt',
 'Ethiopia',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kuwait',
 'Latvia',
 'Lebanon',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Malawi',
 'Malaysia',
 'Malta',
 'Mauritius',
 'Mexico',
 'Mongolia',
 'Montenegro',
 'Morocco',
 'Mozambique',
 'Namibia',
 'Nepal',
 'Netherlands',
 'New Zealand',
 'Nicaragua',
 'N

In [206]:
country_dict = {key:0 for key in unique_countries}

country_dict

{'': 0,
 'South Africa': 0,
 'Philippines': 0,
 'not available': 0,
 'Belgium': 0,
 'Puerto Rico': 0,
 'Zimbabwe': 0,
 'Cuba': 0,
 'Australia': 0,
 'Pakistan': 0,
 'Ghana': 0,
 'Luxembourg': 0,
 'Sri Lanka': 0,
 'West Germany': 0,
 'Burkina Faso': 0,
 'Qatar': 0,
 'Malta': 0,
 'Italy': 0,
 'New Zealand': 0,
 'United Kingdom,': 0,
 'Argentina': 0,
 'Bangladesh': 0,
 'Sudan': 0,
 'Iran': 0,
 'Mozambique': 0,
 'United States': 0,
 'Canada': 0,
 'Vietnam': 0,
 'Ireland': 0,
 'Indonesia': 0,
 'Venezuela': 0,
 'Poland': 0,
 'Dominican Republic': 0,
 'Malawi': 0,
 'Saudi Arabia': 0,
 'Russia': 0,
 'Poland,': 0,
 'Nigeria': 0,
 'East Germany': 0,
 'Cambodia,': 0,
 'South Korea': 0,
 'Cambodia': 0,
 'Greece': 0,
 'Japan': 0,
 'Lebanon': 0,
 'Afghanistan': 0,
 'Iceland': 0,
 'Serbia': 0,
 'Algeria': 0,
 'Spain': 0,
 'Mauritius': 0,
 'Brazil': 0,
 'France': 0,
 'Mongolia': 0,
 'Malaysia': 0,
 'Hungary': 0,
 'Montenegro': 0,
 'Thailand': 0,
 'Colombia': 0,
 'Singapore': 0,
 'United Arab Emirates':

In [208]:
# remove unacessary keys
del country_dict['']

In [209]:
del country_dict['not available']

In [248]:
def update_dict(lst):
    try:
        for i in lst:
                country_dict[i] = country_dict[i] + 1  
    except:
        pass

In [249]:
countries.apply(lambda x:update_dict(x))

id
1       None
2       None
3       None
4       None
5       None
        ... 
8801    None
8802    None
8803    None
8804    None
8805    None
Name: country, Length: 8805, dtype: object

In [251]:
country_dict

{'South Africa': 64,
 'Philippines': 83,
 'Belgium': 90,
 'Puerto Rico': 1,
 'Zimbabwe': 3,
 'Cuba': 1,
 'Australia': 160,
 'Pakistan': 24,
 'Ghana': 5,
 'Luxembourg': 12,
 'Sri Lanka': 1,
 'West Germany': 5,
 'Burkina Faso': 1,
 'Qatar': 10,
 'Malta': 3,
 'Italy': 100,
 'New Zealand': 33,
 'United Kingdom,': 2,
 'Argentina': 91,
 'Bangladesh': 4,
 'Sudan': 1,
 'Iran': 4,
 'Mozambique': 1,
 'United States': 3690,
 'Canada': 445,
 'Vietnam': 7,
 'Ireland': 46,
 'Indonesia': 90,
 'Venezuela': 4,
 'Poland': 40,
 'Dominican Republic': 1,
 'Malawi': 1,
 'Saudi Arabia': 13,
 'Russia': 27,
 'Poland,': 1,
 'Nigeria': 102,
 'East Germany': 1,
 'Cambodia,': 1,
 'South Korea': 230,
 'Cambodia': 5,
 'Greece': 11,
 'Japan': 318,
 'Lebanon': 31,
 'Afghanistan': 1,
 'Iceland': 11,
 'Serbia': 7,
 'Algeria': 2,
 'Spain': 232,
 'Mauritius': 2,
 'Brazil': 97,
 'France': 392,
 'Mongolia': 1,
 'Malaysia': 26,
 'Hungary': 11,
 'Montenegro': 1,
 'Thailand': 70,
 'Colombia': 52,
 'Singapore': 41,
 'United Ara

In [254]:
pop_country = pd.DataFrame(list(country_dict.items()), columns=['Country','Count'])
pop_country.head()

Unnamed: 0,Country,Count
0,South Africa,64
1,Philippines,83
2,Belgium,90
3,Puerto Rico,1
4,Zimbabwe,3


In [258]:
pop_country.sort_values(by='Count',ascending=False).head(5).set_index('Country')

Unnamed: 0_level_0,Count
Country,Unnamed: 1_level_1
United States,3690
India,1046
United Kingdom,803
Canada,445
France,392


In [262]:
# confirming solution :
df['country'].str.contains('United States').sum()

3690

### United States has the most shows/movies (3690) 
### India is ranked #2 with 1046 movies. 