In [3]:
# Load modules and initialize plotting configuration
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Shared plot styling: the palette name passed to the bar charts below,
# and seaborn's "white" theme applied globally.
PALETTE = "magma"
sns.set_theme(style='white')

In [4]:
# Render all figure text in a monospace font
plt.rcParams.update({"font.family": "monospace"})

In [5]:
# Location of the titles table, relative to the notebook's working directory
TITLES_CSV = "../input/netflix-tv-shows-and-movies/titles.csv"
df = pd.read_csv(TITLES_CSV)

In [6]:
# Preview the first rows of the freshly loaded table
df.head()

## Release Year Comparison 


In [7]:
# Keep only the titles that have both an `imdb_score` and a `tmdb_score`
has_both_scores = df['imdb_score'].notna() & df['tmdb_score'].notna()
df_cleaned = df[has_both_scores]

# Work on a copy so that new columns are added to `df_1` only
df_1 = df_cleaned.copy()

# Group the release year by decade
def get_decade(val):
  """Return the decade that the year `val` belongs to, as an int.

  For example 1995 -> 1990 and 2000 -> 2000. The decade is computed with
  integer arithmetic, so `val` may be an int, a float such as 1995.0, or a
  numeric string such as "1995". The previous string-slicing version raised
  ValueError for float years, because "1995.0" became "1995." + "0".

  Years are expected to be non-negative.

  Raises:
    ValueError: if `val` cannot be converted to an integer (e.g. NaN).
  """
  return int(val) // 10 * 10

# Tag every title with its release decade, then show how many titles fall in each
df_1['decade'] = df_1['release_year'].apply(get_decade)
df_1['decade'].value_counts()

In [9]:
# Mean IMDb and TMDB score per release decade (the decade becomes the index)
decade_score_relations = df_1.groupby('decade').agg({
  "imdb_score": "mean",
  "tmdb_score": "mean",
})

In [10]:
# Bar chart of the mean IMDb score for each decade
sns.barplot(
  x=decade_score_relations.index,
  y=decade_score_relations["imdb_score"],
  palette=PALETTE,
)

In [11]:
# Same per-decade means as a scatter with a fitted regression line
sns.regplot(
  x=decade_score_relations.index,
  y=decade_score_relations['imdb_score'],
).set(
  xlabel='Decade',
  ylabel='Score (IMDB)',
  title='Decade vs Score (IMDB)',
)

In [12]:
# Per-title view: IMDb score against release year, coloured by title type
sns.scatterplot(
  x=df_1['release_year'],
  y=df_1['imdb_score'],
  hue=df_1['type'],
)
plt.show()

# Regression of IMDb score on release year over the same per-title data
sns.regplot(
  x=df_1['release_year'],
  y=df_1['imdb_score'],
)

In [15]:
# Mean TMDB popularity per release decade
popularity_info = df_1.groupby('decade').agg({
  "tmdb_popularity": "mean",
})

In [16]:
# Bar chart of the mean TMDB popularity for each decade
sns.barplot(
  x=popularity_info.index,
  y=popularity_info['tmdb_popularity'],
  palette=PALETTE,
).set(
  xlabel="Decade",
  ylabel="Popularity (TMDB)",
  title="Decade vs Popularity (TMDB)",
)

In [17]:
# Ten titles from the 1980 decade with the highest TMDB popularity
(
  df_1.loc[df_1['decade'] == 1980]
  .sort_values(by='tmdb_popularity', ascending=False)['title']
  .head(10)
)

Moreover, here are the ten titles with the highest TMDB popularity across the whole dataset:

In [18]:
# Ten titles with the highest TMDB popularity, taken from the unfiltered table
(
  df
  .sort_values(by='tmdb_popularity', ascending=False)['title']
  .head(10)
)

## Age Certification vs Score 

In [19]:
# Count the titles whose `age_certification` is missing
df['age_certification'].isna().sum()

In [21]:
# Fill null values with mode (the most frequent certification).
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on a column selection is chained assignment, which warns on pandas 2.x and
# leaves `df` unmodified once Copy-on-Write is enabled.
most_common_cert = df['age_certification'].mode()[0]
df['age_certification'] = df['age_certification'].fillna(most_common_cert)

In [22]:
# Titles that have both an IMDb and a TMDB score, taken from the current `df`
scored_rows = df['imdb_score'].notna() & df['tmdb_score'].notna()
df_2 = df[scored_rows]

In [23]:
# Mean scores and mean popularity per age certification
age_score = df_2.groupby("age_certification").agg({
  "imdb_score": "mean",
  "tmdb_score": "mean",
  "tmdb_popularity": "mean",
})

In [24]:
# Visualize IMDb/TMDB scores: one figure per score source
plt.figure(figsize=(12, 6))
sns.barplot(
  x=age_score.index,
  y=age_score['imdb_score'],
  palette=PALETTE,
).set(
  xlabel="Age Certification",
  ylabel="Score (IMDb)",
  title="Age Cert. vs. Score (IMDb)",
)
plt.show()

plt.figure(figsize=(12, 6))
sns.barplot(
  x=age_score.index,
  y=age_score['tmdb_score'],
  palette=PALETTE,
).set(
  xlabel="Age Certification",
  ylabel="Score (TMDB)",
  title="Age Cert. vs Score (TMDB)",
)

In [25]:
# Visualize popularities of each age certification:
# first in index order, then ordered from most to least popular
cert_sorted_pop = age_score.sort_values(by='tmdb_popularity', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(
  x=age_score.index,
  y=age_score['tmdb_popularity'],
  palette=PALETTE,
).set(
  xlabel='Age Certification',
  ylabel='TMDB Popularity',
  title='Age Cert. vs. Popularity',
)

plt.figure(figsize=(12, 6))
sns.barplot(
  x=cert_sorted_pop.index,
  y=cert_sorted_pop['tmdb_popularity'],
  palette=PALETTE,
).set(
  xlabel='Age Certification',
  ylabel='TMDB Popularity',
  title='Age Cert. vs. Popularity (sorted)',
)

In [26]:
# One IMDb-score histogram (with a KDE curve) per listed age certification
certifications = [
  'G', 'NC-17', 'PG', 'PG-13', 'R', 'TV-14',
  'TV-G', 'TV-MA', 'TV-PG', 'TV-Y', 'TV-Y7',
]
for cert in certifications:
  cert_values = df_2.loc[df_2['age_certification'] == cert]

  hist_ax = sns.histplot(cert_values['imdb_score'], kde=True)
  hist_ax.set(xlabel=cert, title=f"Distribution of {cert} w.r.t score (IMDb)")
  # Flush the figure so each certification gets its own plot
  plt.show()

In [27]:
# How often each distinct value of the raw `genres` column occurs
df['genres'].value_counts()

In [28]:
# Parse genre column
# Maps genre name -> number of rows that list it; filled in by `get_genres`.
genres = {}

def get_genres(row):
  """Tally every genre listed in `row` into the module-level `genres` dict.

  `row` is converted with `str()`, its first and last characters (the
  brackets of a value like "['drama', 'comedy']") are dropped, and the
  rest is split on commas. Each piece is stripped of whitespace and then
  of its first and last characters (the quotes).

  A value with no genres ("[]") is tallied under the empty-string key.

  Returns:
    `row` itself, unchanged, so the function can be used with `Series.map`.
  """
  for piece in str(row)[1:-1].split(","):
    name = piece.strip()[1:-1]
    # Count the first sighting as well: creating the key at 0 and skipping
    # the increment left every genre's total one short.
    genres[name] = genres.get(name, 0) + 1

  return row

# Run the tally over every title. `get_genres` returns each value as-is, so the
# column keeps its contents; the useful effect is filling the `genres` dict.
df['genres'] = df['genres'].apply(get_genres)
genres

In [29]:
def transform_genres(row):
  """Return the first genre listed in `row`, or 'none' if there is none.

  `row` is converted with `str()`, its first and last characters (the
  brackets of a value like "['drama', 'comedy']") are dropped, and the
  text before the first comma is stripped of whitespace and of its first
  and last characters (the quotes).

  This function only reads `row`. The earlier version also carried a
  copy of the genre-tally loop, which incremented the global `genres`
  counts a second time whenever the column was transformed.
  """
  first = str(row)[1:-1].split(",")[0].strip()[1:-1]
  return first if first != '' else 'none'

In [30]:
# Reduce each title's genre list to its first genre, then count titles per genre
df['genres_transformed'] = df['genres'].apply(transform_genres)
df['genres_transformed'].value_counts()

In [31]:
# Abbreviate the 'documentation' label to 'doc' before plotting.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on a column selection is chained assignment, which warns on pandas 2.x and
# leaves `df` unmodified once Copy-on-Write is enabled.
df['genres_transformed'] = df['genres_transformed'].replace(to_replace='documentation', value='doc')

# Number of titles per (first) genre
plt.figure(figsize=(20,10))
sns.histplot(df['genres_transformed'])

In [32]:
# Titles with both scores, now including the `genres_transformed` column
rows_with_scores = df['imdb_score'].notna() & df['tmdb_score'].notna()
df_3 = df[rows_with_scores]
df_3.head()

In [33]:
# Mean scores and mean popularity per (first) genre
genre_vs_score = df_3.groupby("genres_transformed").agg({
  "imdb_score": "mean",
  "tmdb_score": "mean",
  "tmdb_popularity": "mean",
})
genre_vs_score.head()

In [35]:
# Mean IMDb score per genre, in index order
plt.figure(figsize=(18, 9))
sns.barplot(
  x=genre_vs_score.index,
  y=genre_vs_score['imdb_score'],
  palette=PALETTE,
).set(
  xlabel="Genre",
  ylabel="Score (IMDb)",
  title="Genre vs. Score (IMDb)",
)

In [37]:
# Mean TMDB popularity per genre, in index order
plt.figure(figsize=(18, 9))
sns.barplot(
  x=genre_vs_score.index,
  y=genre_vs_score['tmdb_popularity'],
  palette=PALETTE,
).set(
  xlabel="Genre",
  ylabel="Popularity (TMDB)",
  title="Genre vs. Popularity (TMDB)",
)

In [38]:
# Genres ordered from highest to lowest mean IMDb score
plt.figure(figsize=(18, 9))
sorted_imdb = genre_vs_score.sort_values(by="imdb_score", ascending=False)
sns.barplot(
  x=sorted_imdb.index,
  y=sorted_imdb['imdb_score'],
  palette=PALETTE,
).set(
  xlabel='Genre',
  ylabel='IMDb Rating',
  title='Genre vs. Rating (IMDb)',
)

In [39]:
# Genres ordered from highest to lowest mean TMDB score
plt.figure(figsize=(18, 9))
sorted_tmdb = genre_vs_score.sort_values(by="tmdb_score", ascending=False)
sns.barplot(
  x=sorted_tmdb.index,
  y=sorted_tmdb['tmdb_score'],
  palette=PALETTE,
).set(
  xlabel='Genre',
  ylabel='TMDB Rating',
  title='Genre vs. Rating (TMDB)',
)

In [40]:
# Genres ordered from highest to lowest mean TMDB popularity
plt.figure(figsize=(18, 9))
sorted_popularity = genre_vs_score.sort_values(by='tmdb_popularity', ascending=False)
sns.barplot(
  x=sorted_popularity.index,
  y=sorted_popularity['tmdb_popularity'],
  palette=PALETTE,
).set(
  xlabel='Genre',
  ylabel='Popularity (TMDB)',
  title='Genre vs. Popularity (TMDB)',
)

### Genres vs. Score 

In [47]:
# TMDB score per genre for `genres_ratings_shows`.
# NOTE(review): `genres_ratings_shows` is not defined in the cells visible here
# (execution counts jump from 40 to 47) — confirm the cell that builds it is
# still present, otherwise this fails on a fresh kernel.
plt.figure(figsize=(18, 9))
sns.barplot(
  x=genres_ratings_shows.index,
  y=genres_ratings_shows["tmdb_score"],
  palette=PALETTE,
).set(
  xlabel="Genre",
  ylabel="Rating (TMDB)",
  title="Genre vs. Rating (shows only)",
)