# Netflix Data Analysis
von Daniel Henke & Stefanie Wenzel

### Daten importieren und aufräumen

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# Load the raw CSV and immediately discard the columns listed below,
# so that `df` only carries what the later cells work with.
df = pd.read_csv('../data/netflix-rotten-tomatoes-metacritic-imdb.csv').drop(
    columns=[
        "Image",
        "Poster",
        "TMDb Trailer",
        "Trailer Site",
        "Netflix Link",
        "IMDb Link",
        'Summary',
        "Writer",
        "Genre",
        "Tags",
        "Hidden Gem Score",
        "Runtime",
        "Actors",
        "View Rating",
        "Netflix Release Date",
        "Production House",
        "IMDb Votes",
    ]
)
df

In [None]:
# Parse the release date strings (day, abbreviated month name, year) into datetimes
df['Release Date'] = pd.to_datetime(df['Release Date'], format = '%d %b %Y')

# Strip dollar signs and thousands separators in a single regex pass,
# then convert the remaining digits to numbers. Series.replace (rather than
# the .str accessor) keeps this cell re-runnable once the column is numeric.
df['Boxoffice'] = pd.to_numeric(df['Boxoffice'].replace(r'[$,]', '', regex = True))


def funct(x):
    """Count the comma-separated entries in ``x``.

    Parameters
    ----------
    x : str or missing value
        A comma-separated list such as ``"English, Spanish"``. The placeholder
        string ``"leer"`` and any non-string value (``None``, ``NaN``) are
        treated as "no data".

    Returns
    -------
    int or None
        Number of comma-separated parts, or ``None`` when there is no data.
    """
    # Guard against raw missing values as well as the "leer" placeholder, so
    # the function does not fail with AttributeError on a float NaN.
    if not isinstance(x, str) or x == "leer":
        return None
    return len(x.split(","))
       
       
# Turn the comma-separated text in these two columns into entry counts.
# Missing cells are first filled with the "leer" placeholder that funct checks for.
for column in ("Languages", "Country Availability"):
    df[column] = df[column].fillna("leer").apply(funct)

df


In [None]:
# Report missing values: first the absolute count per column,
# then the share of rows (in percent, rounded to two decimals).
missing_counts = df.isnull().sum()
print(missing_counts)

for column, n_missing in missing_counts.items():
    missing_percentage = 100 * n_missing / len(df)
    print('{} missing value percentage: {}%'.format(column, round(missing_percentage, 2)))

### Filme vs Serien

In [None]:
# Average of every numeric column, split by the "Series or Movie" label.
# numeric_only=True is needed because df still contains text columns
# (e.g. Title, Director); pandas >= 2.0 raises a TypeError on them instead
# of silently skipping them.
MvsS = df.groupby("Series or Movie").mean(numeric_only=True)
MvsS

### Release Year

In [None]:
# Work on a copy so that df keeps its datetime "Release Date" column and this
# cell gives the same result when it is run again.
TimeVsLanguage = df.copy()

# Reduce each release date to its year with the vectorised .dt accessor
# (no helper function needed; missing dates stay missing).
TimeVsLanguage["Release Date"] = TimeVsLanguage["Release Date"].dt.year

# Number of titles per release year; groupby leaves out rows without a year.
Time = TimeVsLanguage.groupby("Release Date").size()

# Per-year average of the numeric columns. numeric_only=True skips the text
# columns, which pandas >= 2.0 would otherwise reject with a TypeError.
TimeVsLanguage = TimeVsLanguage.groupby("Release Date").mean(numeric_only=True)

# Line plot of the per-year title counts held in `Time`
release_axes = plt.gca()
release_axes.set_title("Amount of Netflix Movies based on Release Year")
release_axes.set_xlabel("Release Year")
release_axes.plot(Time)

In [None]:
# Keep only the yearly averages for release years 1980 through 2019
release_years = TimeVsLanguage.index
TimeVsCountry = TimeVsLanguage[(release_years > 1979) & (release_years < 2020)]

# Average country availability per release year within that window
country_axes = plt.gca()
country_axes.set_title("Country Availability based on Release Year")
country_axes.set_xlabel("Release Year between 1980 and 2019")
country_axes.set_ylabel("Country Availability")
country_axes.plot(TimeVsCountry.index, TimeVsCountry["Country Availability"])

### Countries & Languages

In [None]:
# One point per row of df: language count on x, country count on y
scatter_axes = plt.gca()
scatter_axes.set_title("Languages vs Countries")
scatter_axes.set_xlabel("Amount of Languages")
scatter_axes.set_ylabel("Country Availability")
plt.scatter(x=df["Languages"], y=df["Country Availability"])

In [None]:
# Average of the numeric columns for each "Country Availability" count.
# numeric_only=True skips df's text columns, which pandas >= 2.0 would
# otherwise reject with a TypeError.
ContCount = df.groupby("Country Availability").mean(numeric_only=True)
plt.title("Countries vs Languages")
plt.ylabel("Amount of Languages")
plt.xlabel("Country Availability")
# Mean language count as a function of the country count
plt.plot(ContCount.index, ContCount["Languages"])

In [None]:
# Average of the numeric columns for each "Languages" count.
# numeric_only=True skips df's text columns, which pandas >= 2.0 would
# otherwise reject with a TypeError.
LangCount = df.groupby("Languages").mean(numeric_only=True)
plt.title("Languages vs Countries")
plt.xlabel("Amount of Languages")
plt.ylabel("Country Availability")
# Mean country count as a function of the language count
plt.plot(LangCount.index, LangCount["Country Availability"])


### IMDb Score

In [None]:
# Number of rows that share each distinct IMDb score
counter = df.groupby("IMDb Score").size()

score_axes = plt.gca()
score_axes.set_title("Number of Movies & Series per IMDb Score")
score_axes.set_xlabel("IMDb Score")
score_axes.plot(counter)


In [None]:
# Median of the numeric columns for each distinct IMDb score.
# numeric_only=True skips df's text columns, which pandas >= 2.0 would
# otherwise reject with a TypeError.
ScorevsCountry = df.groupby("IMDb Score").median(numeric_only=True)
plt.title("IMDb Score vs Country Availability")
plt.xlabel("IMDb Score")
# Median country count as a function of the IMDb score
plt.plot(ScorevsCountry.index, ScorevsCountry["Country Availability"])

### Best Director

In [None]:
# Which director makes the best movies?
# First ingredient: how many rows (titles) each director has in df.
DirectorVsTitle = df[["Director", "Title"]].groupby("Director").size()

DirectorVsTitle


In [None]:

# Mean IMDb score and mean box office per director
director_columns = ["Director", "IMDb Score", "Boxoffice"]
DirectorVsIMDb = df[director_columns].groupby("Director").mean()

# Shown best-rated first; the sorted frame is only displayed, not stored
DirectorVsIMDb.sort_values(by="IMDb Score", ascending=False)


In [None]:

# Put the per-director title count next to the per-director averages,
# keeping only directors present in both inputs.
title_counts = DirectorVsTitle.rename("Amount of Movies")
Merge = pd.concat([title_counts, DirectorVsIMDb], axis=1, join="inner")

# Only directors with at least 3 titles, best average IMDb score first
has_enough_titles = Merge["Amount of Movies"] > 2
Merge = Merge.loc[has_enough_titles].sort_values(by="IMDb Score", ascending=False)

Merge[["Amount of Movies", "IMDb Score"]].head(20)

In [None]:
# First 20 rows of Merge after ordering by mean box office, highest first
Merge.sort_values(by="Boxoffice", ascending=False).iloc[:20]

### Geld vs Gut

In [None]:
# Rows that actually have a box office figure. dropna returns a new frame,
# so df itself is left untouched.
MoneyVsGood = df.dropna(subset=["Boxoffice"])

# Average of the numeric columns for each distinct box office value.
# numeric_only=True skips the text columns, which pandas >= 2.0 would
# otherwise reject with a TypeError.
Money = MoneyVsGood.groupby("Boxoffice").mean(numeric_only=True)
plt.title("How good is a movie or series is based on how much money it made")
plt.xlabel("Boxoffice Earnings")
plt.ylabel("IMDb Score")
plt.scatter(Money.index, Money["IMDb Score"])

In [None]:
# Average of the numeric columns for each distinct IMDb score.
# numeric_only=True skips the text columns, which pandas >= 2.0 would
# otherwise reject with a TypeError.
Good = MoneyVsGood.groupby("IMDb Score").mean(numeric_only=True)
# Mean box office as a function of the IMDb score
plt.scatter(Good.index, Good["Boxoffice"])
plt.ylabel("Boxoffice Earnings")
plt.xlabel("IMDb Score")
plt.title("Boxoffice Earnings based on IMDb Score")