In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preparation

## Load the dataset

In [None]:
# Read the titles CSV into `df`, then print its column / dtype / non-null summary.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/netflix-shows/netflix_titles.csv')
df.info()

## Setting the index

In [None]:
# `DataFrame.set_index` returns a new frame instead of modifying `df`, so the
# result must be assigned back or the index never changes.
# The guard keeps the cell safe to re-run once `show_id` is already the index.
if df.index.name != 'show_id':
    df = df.set_index('show_id')
df.info()

In [None]:
# Preview the first five rows.
df.head(n=5)

# Data Cleaning

## Checking for null or missing values

In [None]:
# Count the missing entries in every column.
df.isnull().sum()

## Converting `date_added` to a datetime type

In [None]:
# Parse `date_added` into datetimes, then derive the year and the weekday name
# on which each title was added.
# Surrounding whitespace is stripped first: pandas >= 2.0 applies the format
# inferred from the first value to every row, so a value padded with stray
# spaces can fail to parse.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip())
df['added_year'] = df['date_added'].dt.year
df['added_day'] = df['date_added'].dt.day_name()
df.head()

## Missing-value treatment

In [None]:
# Fill missing `added_year` values with the column mean and store the result as
# integers. The filled Series is assigned back explicitly: calling
# `fillna(..., inplace=True)` on `df['added_year']` is chained assignment, which
# pandas deprecates and which leaves `df` unchanged under Copy-on-Write (the
# following `astype(int)` would then fail on the remaining NaN values).
# Note: `astype(int)` truncates the fractional part of the mean.
mean_added_year = df['added_year'].mean()
df['added_year'] = df['added_year'].fillna(mean_added_year).astype(int)

df.head()

In [None]:
# Replace every remaining missing value with a placeholder string, then
# re-count the missing entries per column.
df = df.fillna('Data Not Available')
df.isnull().sum()


# EDA

### Which type of show is most common?


In [None]:
# Observation: movies outnumber TV shows.
# Labels are taken from the `value_counts()` index so every wedge is paired with
# its own category, whatever order the counts come back in.
type_counts = df['type'].value_counts()
plt.pie(type_counts, labels=type_counts.index)
plt.title('Share of titles by type')
plt.show()

### Top 10 countries by number of titles on Netflix


In [None]:
# Observations: India and the US have high movie counts;
# the US and the UK have a high number of TV shows.
plt.figure(figsize=(15, 10))
with_country = df.loc[df['country'].ne('Data Not Available')]
top_countries = with_country['country'].value_counts().index[:10]
sns.countplot(data=with_country, x='country', hue='type', order=top_countries)
plt.show()

### Trend of titles added to Netflix: are more TV shows or movies being added?

In [None]:
# Observations: more movies than TV shows are being added to Netflix, and the
# 2019-2020 period saw the most additions.

plt.figure(figsize=(15,8))
movie = df[df['type'] == 'Movie']
tv = df[df['type'] == 'TV Show']
# Number of titles of each type per `added_year`.
added_counts = movie['added_year'].value_counts()
added_tv_counts = tv['added_year'].value_counts()
# Each line gets a label so the legend tells the two series apart.
sns.lineplot(x=added_counts.index, y=added_counts.values, color="orange", label='Movie')
sns.lineplot(x=added_tv_counts.index, y=added_tv_counts.values, color="blue", label='TV Show')
plt.legend()
plt.xlabel('Year added')
plt.ylabel('Number of titles')
plt.title('Titles added to Netflix per year')
plt.show()

### Most common ratings of titles released before 2000


In [None]:
# Observations: most titles are rated TV-14, followed by TV-MA and TV-PG;
# no TV shows carry an R, PG-13, PG or G rating.
plt.figure(figsize=(25, 5))
pre_2000 = df.loc[df['release_year'].lt(2000)]
top_ratings = pre_2000['rating'].value_counts().index[:10]
sns.countplot(data=pre_2000, x='rating', hue='type', order=top_ratings)
plt.show()

### Weekdays on which most titles are added to Netflix

In [None]:
# Observation: most titles are added on Fridays.
# `added_day` holds weekday names (from `dt.day_name()`), so this counts titles
# per weekday, not per month. Bars are put in calendar order; a plain groupby
# would sort the names alphabetically.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dated_titles = df[df['added_day'] != 'Data Not Available']
weekday_counts = (
    dated_titles.groupby('added_day')
    .size()
    .reindex(weekday_order, fill_value=0)  # a weekday with no titles shows as 0
)
weekday_counts.plot(kind="bar")
plt.title('Weekday wise additions in netflix')
plt.xlabel('Day of the week')
plt.ylabel('Counts')
plt.show()

### Movies with the shortest duration

In [None]:
# Observation: children's, sci-fi & fantasy movies and documentaries tend to be
# the shortest.
# Keep only movies with a known duration, turn the duration text into an integer
# by removing the "min" suffix, and keep the 15 shortest.
# `assign` builds a new frame, so no column is written into a filtered slice of
# `df` (which is what triggers SettingWithCopyWarning).
is_timed_movie = (df['type'] == 'Movie') & (df['duration'] != 'Data Not Available')
tmp_info = (
    df[is_timed_movie]
    .assign(
        duration=lambda movies: movies['duration']
        .str.replace("min", "", regex=False)  # literal match, not a regex
        .str.strip()
        .astype(int)
    )
    .sort_values(by='duration', ascending=True)
    .head(15)
)

plt.figure(figsize=(18,10))
sns.barplot(y='title', x='duration', data=tmp_info, hue='listed_in')
plt.xlabel('Duration (minutes)')
plt.title('15 shortest movies')
plt.show()

# EDA on Indian Shows

### Top ratings of Indian titles

In [None]:
# Titles whose `country` is exactly 'India'.
indian_shows = df.loc[df['country'].eq('India')]

# Ratings of Indian titles.
# Observation: most of them are rated TV-PG, TV-14 or TV-MA.
indian_rating_counts = indian_shows['rating'].value_counts()
indian_rating_counts.plot.pie(figsize=(12, 10))
plt.show()


### Top genres from India


In [None]:
# Observation: comedy and drama are the most common genres among the titles
# produced in India.
plt.figure(figsize=(15, 7))
indian_genres = indian_shows['listed_in']
top_indian_genres = indian_genres.value_counts().index[:10]
sns.countplot(y=indian_genres, order=top_indian_genres)
plt.show()

### Most of director David Dhawan's movies are available on Netflix


In [None]:
# Print how many Indian titles are attributed to each director value.
director_counts = indian_shows['director'].value_counts()
print(director_counts)

# Show selected columns for the titles directed by David Dhawan.
David_dhawan_shows = indian_shows.loc[indian_shows['director'].eq('David Dhawan')]
David_dhawan_shows[['release_year', 'added_year', 'title', 'duration', 'listed_in']]

### Indian actors appearing in the most Netflix titles

In [None]:
# Count how many titles each cast member appears in and plot the 15 most frequent.
# `cast` is a comma-separated string. Names are stripped after splitting;
# otherwise "A, B" yields " B", and the same person is counted under two
# different keys depending on their position in the list.
# NOTE(review): this uses the full `df`, not `indian_shows`, although the
# section is about Indian titles — confirm which one is intended.
cast_members = (
    df['cast']
    .str.split(',', expand=True)
    .stack()
    .str.strip()
    .to_frame(name='actors')
)

actor_info = cast_members.groupby(['actors']).size().reset_index(name='Total Shows')
# Drop the missing-value placeholder and any empty names left by stray commas.
actor_info = actor_info[~actor_info['actors'].isin(['Data Not Available', ''])]
actor_info = actor_info.sort_values(by='Total Shows', ascending=False)[:15]
sns.barplot(y='actors', x='Total Shows', data=actor_info)
plt.show()


### Indian horror TV shows available on Netflix

In [None]:
# TV shows from India whose genre list mentions 'Horror', shown with their
# title, release year and description.
is_horror = df['listed_in'].str.contains('Horror')
is_indian = df['country'].eq('India')
is_tv_show = df['type'].eq('TV Show')
t = df.loc[is_horror & is_indian & is_tv_show]
result = t[['title', 'release_year', 'description']]
result