In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one; later cells rely on this to show several results.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd

In [None]:
# Load the Netflix titles CSV (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv("dataset/netflix_titles.csv")
df.head()

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()

## Shows and movies by release year

In [None]:
# How many titles there are of each type.
df["type"].value_counts()

In [None]:
# Earliest and latest release year present in the dataset.
release_years = df["release_year"]
release_years.min(), release_years.max()

In [None]:
# Number of titles per release year (counts the non-null "type" entries in
# each release_year group).
releases_by_year = df.groupby("release_year")["type"].count()
releases_by_year.to_numpy()

In [None]:
from matplotlib import pyplot as plt

%matplotlib inline

plt.plot(releases_by_year.index, releases_by_year.values, "g-")
plt.show()

In [None]:
# Min, max and average duration of the movies on Netflix.
# .copy() gives movies_df its own data, so adding columns to it below does not
# raise SettingWithCopyWarning and can never write through to df.
movies_df = df.loc[df["type"] == "Movie"].copy()

# Parse the whole "duration" column at once instead of calling pd.Timedelta
# on every value; missing durations become NaN seconds.
movies_df["duration_in_sec"] = pd.to_timedelta(movies_df["duration"]).dt.total_seconds()
movies_df.head()

# Labelled summary of the duration, in seconds.
movies_df["duration_in_sec"].agg(["mean", "min", "max"])

In [None]:
# Longest-running movie(s): every row whose duration equals the maximum.
longest_duration = movies_df["duration_in_sec"].max()
movies_df.loc[movies_df["duration_in_sec"] == longest_duration]

In [None]:
# The five longest movies, longest first.
movies_by_duration = movies_df.sort_values(by="duration_in_sec", ascending=False)
movies_by_duration.iloc[:5]

In [None]:
# Movies added to Netflix year on year.
from datetime import datetime

# Derive the year from "date_added" (formatted like "September 9, 2019").
# Missing dates are passed through unchanged. The string is stripped before
# parsing because strptime rejects surrounding whitespace, matching how the
# TV-show dates are parsed. assign() builds a new frame instead of writing a
# column onto a slice of df.
movies_df = movies_df.assign(
    year_added=movies_df["date_added"].apply(
        lambda datestr: datestr
        if pd.isna(datestr)
        else datetime.strptime(datestr.strip(), "%B %d, %Y").year
    )
)

# One row per year with the number of movies (non-null show_id) added.
movies_added_by_year = movies_df.groupby("year_added", as_index=False)["show_id"].count()
movies_added_by_year

In [None]:
# Number of Indian movies added to Netflix year on year.
# NOTE(review): only rows whose "country" value is exactly "India" match;
# titles listing several countries are not selected - confirm this is intended.
is_indian = movies_df["country"].isin(["India"])
indian_movies = movies_df.loc[is_indian]
indian_yearly_counts = indian_movies.groupby("year_added", as_index=False).count()
indian_movies_added_per_year = indian_yearly_counts.loc[:, ["year_added", "show_id"]]
indian_movies_added_per_year

In [None]:
# TV shows added to Netflix year on year.
# .copy() gives tv_shows_df its own data, so adding a column below does not
# raise SettingWithCopyWarning.
tv_shows_df = df.loc[df["type"] == "TV Show"].copy()

# Year parsed from "date_added"; missing dates are passed through unchanged and
# surrounding whitespace is stripped before parsing.
tv_shows_df["year_added"] = tv_shows_df["date_added"].apply(
    lambda datestr: datestr
    if pd.isna(datestr)
    else datetime.strptime(datestr.strip(), "%B %d, %Y").year
)
tv_shows_df.groupby("year_added", as_index=False)["show_id"].count()

# Indian TV shows added year on year. The mask is built from tv_shows_df
# itself; the previous version built it from df and relied on pandas
# re-aligning a longer boolean Series to the filtered frame.
indian_tv_shows = tv_shows_df.loc[tv_shows_df["country"].isin(["India"])]
indian_tv_shows.groupby("year_added", as_index=False)["show_id"].count()

In [None]:
# TV shows by season count.
# The season count is the leading integer of the "duration" text (the part
# before the first space).
season_counts = tv_shows_df["duration"].map(lambda text: int(text.split(" ")[0]))
tv_shows_df = tv_shows_df.assign(seasons=season_counts)
tv_shows_df.sort_values(by="seasons", ascending=False).head(5)

In [None]:
# Movies per country: split the comma-separated "country" field so that each
# (movie, country) pair gets its own row, then count rows per country.
country_lists = movies_df["country"].str.split(", ")
movies_with_country_df = movies_df.assign(country=country_lists).explode("country")
movies_with_country_df.head()

country_counts = movies_with_country_df.groupby("country", as_index=False).count()
movies_by_country_df = country_counts.loc[:, ["country", "show_id"]]

# Bar chart of the 20 countries with the most movies.
top_20_countries = movies_by_country_df.sort_values("show_id", ascending=False).head(20)
top_20_countries.plot(kind="bar", x="country", y="show_id")

In [None]:
# Number of movies on Netflix by release year and country, for movies
# released after 1990.
movies_over_years = (
    movies_with_country_df.groupby(["release_year", "country"], as_index=False)["show_id"]
    .count()
    .loc[lambda counts: counts["release_year"] > 1990]
    # rename() returns a relabelled frame; writing into columns.values mutates
    # the Index's backing array in place, which is not a supported way to
    # rename a column.
    .rename(columns={"show_id": "count"})
)
movies_over_years

In [None]:
# Movies by genre added between 2012 and 2019 (both years inclusive).
import seaborn as sns

movies_genre_df = (
    movies_df.loc[movies_df["year_added"].between(2012, 2019)]
    # Split "listed_in" inside a callable so the genre lists come from the
    # filtered rows themselves rather than from the unfiltered movies_df via
    # index alignment.
    .assign(genre=lambda added: added["listed_in"].str.split(", "))
    # One row per (movie, genre) pair.
    .explode("genre")
)
movies_by_genre_df = (
    movies_genre_df.groupby(["year_added", "genre"], as_index=False)["show_id"]
    .count()
    .sort_values("year_added")
)

# One line per genre: number of movies added in each year.
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
sns.lineplot(x="year_added", y="show_id", data=movies_by_genre_df, hue="genre", ax=ax)
ax.set_title("Movies added by genre between 2012 and 2019")
ax.set_xlabel("Year added")
ax.set_ylabel("Number of movies")
ax.legend()
plt.show()

In [None]:
# Number of movies per director. Titles with several comma-separated directors
# are exploded first, so each director is credited separately.
df_directors_exp = df.assign(director=df.director.str.split(", ")).explode("director")
df_directors = df_directors_exp.loc[
    df_directors_exp["type"] == "Movie", ["show_id", "type", "director"]
]

# value_counts() already returns the counts in descending order, so no extra
# sort is needed. Both output columns are named explicitly because the labels
# produced by a bare reset_index() differ between pandas versions.
directors_count = (
    df_directors["director"]
    .value_counts()
    .rename_axis("director")
    .reset_index(name="number_of_movies")
)
directors_count.head()

In [None]:
# Every title (movie or TV show) credited to the director "Jan Suter".
directed_by_jan_suter = df_directors_exp["director"] == "Jan Suter"
df_directors_exp.loc[directed_by_jan_suter]