# Project 1: Explanatory Data Analysis & Data Presentation (Movies Dataset)

## Data Import and first Inspection

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.options.display.max_columns = 30
pd.options.display.float_format = '{:.2f}'.format

In [None]:
df = pd.read_csv("movies_complete.csv", parse_dates= ["release_date"])

In [None]:
df

In [None]:
df.info()

__Features__:

* **id:** The ID of the movie (clear/unique identifier).
* **title:** The Official Title of the movie.
* **tagline:** The tagline of the movie.
* **release_date:** Theatrical Release Date of the movie.
* **genres:** Genres associated with the movie.
* **belongs_to_collection:** Gives information on the movie series/franchise the particular film belongs to.
* **original_language:** The language in which the movie was originally shot.
* **budget_musd:** The budget of the movie in million dollars.
* **revenue_musd:** The total revenue of the movie in million dollars.
* **production_companies:** Production companies involved with the making of the movie.
* **production_countries:** Countries where the movie was shot/produced.
* **vote_count:** The number of votes by users, as counted by TMDB.
* **vote_average:** The average rating of the movie.
* **popularity:** The Popularity Score assigned by TMDB.
* **runtime:** The runtime of the movie in minutes.
* **overview:** A brief blurb of the movie.
* **spoken_languages:** Spoken languages in the film.
* **poster_path:** The URL of the poster image.
* **cast:** (Main) Actors appearing in the movie.
* **cast_size:** Number of actors appearing in the movie.
* **director:** Director of the movie.
* **crew_size:** Size of the film crew (incl. director, excl. actors).

In [None]:
df.genres[1]

In [None]:
df.cast[1]

In [None]:
df.describe()

In [None]:
df.hist(figsize = (20, 12), bins = 100)
plt.show()

In [None]:
df.budget_musd.value_counts(dropna = False).head(20)

In [None]:
df.revenue_musd.value_counts(dropna = False).head(20)

In [None]:
df.vote_average.value_counts(dropna = False)

In [None]:
df.vote_count.value_counts()

In [None]:
df.describe(include = "object")

In [None]:
df[df.title == "Cinderella"]

## The best and the worst movies... (Part 1)

In [None]:
from IPython.display import HTML
pd.options.display.max_colwidth = 200

In [None]:
# Independent working copy restricted to the columns used by the rankings.
df_best = df.loc[
    :,
    [
        "poster_path",
        "title",
        "budget_musd",
        "revenue_musd",
        "vote_count",
        "vote_average",
        "popularity",
    ],
].copy()
df_best

In [None]:
# Absolute profit (revenue - budget) and revenue-to-budget ratio, both in
# terms of the million-USD columns of the source frame.
df_best["profit_musd"] = df["revenue_musd"] - df["budget_musd"]
df_best["return"] = df["revenue_musd"] / df["budget_musd"]

In [None]:
df_best

In [None]:
# Presentation-friendly labels, assigned positionally (the list must match the
# current column order). The first column (poster_path) gets an empty label.
df_best.columns = ["", "Title", "Budget", "Revenue", "Votes", 
                   "Average Rating", "Popularity", "Profit", "ROI"]

In [None]:
df_best.set_index("Title", inplace = True)

In [None]:
df_best

In [None]:
df_best.iloc[0,0]

In [None]:
subset = df_best.iloc[:5, :2]
subset

In [None]:
HTML(subset.to_html(escape=False))

In [None]:
df_best.sort_values(by = "Average Rating", ascending = False)

In [None]:
df_best.sort_values(by = "ROI", ascending = False)

In [None]:
df_best.loc[df_best.Budget >= 5].sort_values(by = "ROI", ascending = False)

In [None]:
# Treat a missing budget or vote count as 0 so that ">= minimum" comparisons
# on these columns are defined for every row.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on an attribute-accessed column is chained
# in-place mutation, which raises a FutureWarning in pandas 2.x and does not
# modify df_best at all once Copy-on-Write is enabled.
df_best["Budget"] = df_best["Budget"].fillna(0)
df_best["Votes"] = df_best["Votes"].fillna(0)

In [None]:
df_best.info()

In [None]:
def best_worst(n, by, ascending = False, min_bud = 0, min_votes = 0):
    """Render the first ``n`` rows of ``df_best`` ranked by ``by`` as HTML.

    Parameters
    ----------
    n : number of rows to show.
    by : label of the ``df_best`` column to rank on; it is shown next to the
        unlabeled ("") column.
    ascending : sort direction; the default (False) puts the largest values first.
    min_bud : rows with ``Budget`` below this value are excluded.
    min_votes : rows with ``Votes`` below this value are excluded.

    Returns
    -------
    IPython ``HTML`` object wrapping the table; cell contents are not escaped.
    """
    meets_budget = df_best.Budget >= min_bud
    meets_votes = df_best.Votes >= min_votes

    candidates = df_best.loc[meets_budget & meets_votes, ["", by]]
    ranked = candidates.sort_values(by = by, ascending = ascending)
    top_rows = ranked.head(n).copy()

    return HTML(top_rows.to_html(escape=False))

## The best and the worst movies... (Part 2)

__Movies Top 5 - Highest Revenue__

In [None]:
best_worst(n = 5, by = "Revenue")

__Movies Top 5 - Highest Budget__

In [None]:
best_worst(5, "Budget")

__Movies Top 5 - Highest Profit__

In [None]:
best_worst(5, "Profit")

__Movies Top 5 - Lowest Profit__

In [None]:
best_worst(5, "Profit", ascending = True)

__Movies Top 5 - Highest ROI__

In [None]:
best_worst(5, "ROI", min_bud = 50)

__Movies Top 5 - Lowest ROI__

In [None]:
best_worst(5, "ROI", ascending = True, min_bud = 100)

__Movies Top 5 - Most Votes__

In [None]:
best_worst(5, "Votes")

__Movies Top 5 - Highest Rating__

In [None]:
best_worst(5, "Average Rating", min_votes = 50)

__Movies Top 5 - Lowest Rating__

In [None]:
best_worst(5, "Average Rating", ascending = True, min_votes = 100)

In [None]:
best_worst(5, "Average Rating", ascending = True, min_votes = 20, min_bud = 20)

__Movies Top 5 - Most Popular__

In [None]:
best_worst(5, "Popularity")

## Find your next Movie

__Search 1: Science Fiction Action Movie with Bruce Willis (high Rating)__

In [None]:
df.genres[0]

In [None]:
# A movie qualifies only if both genre labels occur in its genres string.
# na=False maps movies without genre information to False, so the mask is
# strictly boolean and can be used for row selection on its own.
mask_genres = (df.genres.str.contains("Action", na=False)
               & df.genres.str.contains("Science Fiction", na=False))
mask_genres

In [None]:
df.cast[0]

In [None]:
# True where the cast string mentions the actor; movies with no cast
# information count as "not in the movie" (na=False) instead of NaN.
mask_actor = df.cast.str.contains("Bruce Willis", na=False)
mask_actor

In [None]:
df.loc[mask_actor & mask_genres, ["title", "vote_average"]].sort_values(by = "vote_average", 
                                                                        ascending = False)

In [None]:
# Movies matching both masks, best rated first, indexed by title.
bruce = (
    df.loc[mask_actor & mask_genres, ["title", "poster_path", "vote_average"]]
    .sort_values(by = "vote_average", ascending = False)
    .set_index("title")
)

In [None]:
HTML(bruce.to_html(escape=False))

__Search 2: Movies with Uma Thurman and directed by Quentin Tarantino (low runtime)__

In [None]:
df.director

In [None]:
mask_director = df.director == "Quentin Tarantino"

In [None]:
# True where the cast string mentions the actor; movies with no cast
# information count as "not in the movie" (na=False) instead of NaN.
mask_actor = df.cast.str.contains("Uma Thurman", na=False)

In [None]:
# Movies matching both masks, shortest runtime first, indexed by title.
quentin = (
    df.loc[mask_director & mask_actor, ["title", "poster_path", "runtime"]]
    .sort_values(by = "runtime")
    .set_index("title")
)

In [None]:
HTML(quentin.to_html(escape=False))

__Search 3: Most Successful Pixar Studio Movies between 2010 and 2015 (high Revenue)__

In [None]:
df.production_companies[1]

In [None]:
# True where the production companies string mentions Pixar. Passing na=False
# yields a boolean mask directly; the previous .fillna(False) on the
# object-dtype result triggers the pandas 2.2 downcasting FutureWarning.
mask_studio = df.production_companies.str.contains("Pixar", na=False)

In [None]:
df.release_date

In [None]:
mask_time = df.release_date.between("2010-01-01", "2015-12-31")

In [None]:
# Movies matching both masks, highest revenue first, indexed by title.
pixar = (
    df.loc[mask_studio & mask_time,
           ["title", "poster_path", "revenue_musd", "release_date"]]
    .sort_values(by = "revenue_musd", ascending = False)
    .set_index("title")
)

In [None]:
HTML(pixar.to_html(escape=False))

__Search 4: Action or Thriller Movie with original language English and minimum Rating of 7.5 (most recent)__

In [None]:
# Either genre label is sufficient. na=False maps movies without genre
# information to False so the mask is strictly boolean.
mask_genre = (df.genres.str.contains("Action", na=False)
              | df.genres.str.contains("Thriller", na=False))

In [None]:
mask_lan = df.original_language == "en"

In [None]:
mask_vote_av = df.vote_average >= 7.5 

In [None]:
mask_vote_co = df.vote_count >= 10

In [None]:
# All four filters combined; newest releases first, limited to 20 rows.
next_mov = (
    df.loc[mask_genre & mask_lan & mask_vote_av & mask_vote_co,
           ["title", "poster_path", "genres", "vote_average", "vote_count", "release_date"]]
    .sort_values(by = "release_date", ascending = False)
    .set_index("title")
    .head(20)
)

In [None]:
HTML(next_mov.to_html(escape=False))

## What are the most common Words in Movie Titles and Taglines?

In [None]:
from wordcloud import WordCloud

In [None]:
df

In [None]:
df.tagline[1]

In [None]:
df.overview[1]

In [None]:
title = df.title.dropna()
overview = df.overview.dropna()
tagline = df.tagline.dropna()

In [None]:
title

In [None]:
' '.join(title)

In [None]:
title_corpus = ' '.join(title)
overview_corpus = ' '.join(overview)
tagline_corpus = ' '.join(tagline)

In [None]:
tagline_corpus

In [None]:
title_wordcloud = WordCloud(background_color='white', height=2000, width=4000, max_words= 200).generate(title_corpus)
title_wordcloud

In [None]:
plt.figure(figsize=(16,8))
plt.imshow(title_wordcloud, interpolation= "bilinear")
plt.axis('off')
plt.show()

In [None]:
tagline_wordcloud = WordCloud(background_color='white', height=2000, width=4000).generate(tagline_corpus)
plt.figure(figsize=(16,8))
plt.imshow(tagline_wordcloud, interpolation= "bilinear")
plt.axis('off')
plt.show()

In [None]:
overview_wordcloud = WordCloud(background_color='white', height=2000, width=4000).generate(overview_corpus)
plt.figure(figsize=(16,8))
plt.imshow(overview_wordcloud, interpolation= "bilinear")
plt.axis('off')
plt.show()

## Are Franchises more successful?

In [None]:
df.belongs_to_collection

In [None]:
df["Franchise"] = df.belongs_to_collection.notna()

In [None]:
df.Franchise

In [None]:
df.Franchise.value_counts()

__Franchise vs. Stand-alone: Average Revenue__

In [None]:
df.groupby("Franchise").revenue_musd.mean()

__Franchise vs. Stand-alone: Return on Investment / Profitability__

In [None]:
df["ROI"] = df.revenue_musd.div(df.budget_musd)

In [None]:
df.groupby("Franchise").ROI.median()

__Franchise vs. Stand-alone: Average Budget__

In [None]:
df.groupby("Franchise").budget_musd.mean()

__Franchise vs. Stand-alone: Average Popularity__

In [None]:
df.groupby("Franchise").popularity.mean()

__Franchise vs. Stand-alone: Average Rating__

In [None]:
df.groupby("Franchise").vote_average.mean()

In [None]:
df.groupby("Franchise").agg({"budget_musd": "mean", "revenue_musd": "mean", "vote_average": "mean",
                            "popularity": "mean", "ROI":"median", "vote_count":"mean"})

## Most Successful Franchises

In [None]:
df.belongs_to_collection

In [None]:
df.belongs_to_collection.value_counts()

In [None]:
# One row per collection (rows with a missing belongs_to_collection are not
# part of any group). The resulting columns are two-level
# (source column, aggregation), e.g. ("revenue_musd", "mean"), which is how
# they are addressed in the nlargest calls that follow.
franchises = df.groupby("belongs_to_collection").agg({"title":"count", "budget_musd": ["sum", "mean"], 
                                                      "revenue_musd": ["sum", "mean"],
                                                      "vote_average": "mean", "popularity": "mean",
                                                      "ROI":"median", 
                                                      "vote_count":"mean"})

In [None]:
franchises

In [None]:
franchises.nlargest(20, ("title", "count"))

In [None]:
franchises.nlargest(20, ("revenue_musd", "mean"))

In [None]:
franchises.nlargest(20, ("budget_musd", "mean"))

In [None]:
franchises[franchises[("vote_count", "mean")] >=1000].nlargest(20, ("vote_average", "mean"))

## Most Successful Directors

In [None]:
df.director

In [None]:
df.director.value_counts().head(20)

In [None]:
plt.figure(figsize = (12, 8))
df.director.value_counts().head(20).plot(kind='bar', fontsize = 15)
plt.title("Most Active Directors",fontsize = 20)
plt.ylabel("Number of Movies", fontsize = 15)
plt.show()

In [None]:
df.groupby("director").revenue_musd.sum().nlargest(20)

In [None]:
plt.figure(figsize = (12, 8))
df.groupby("director").revenue_musd.sum().nlargest(20).plot(kind='bar', fontsize = 15)
plt.title("Total Revenue",fontsize = 20)
plt.ylabel("Revenue (in MUSD)", fontsize = 15)
plt.show()

In [None]:
directors = df.groupby("director").agg({"title": "count", "vote_average" :"mean", "vote_count": "sum"})

In [None]:
directors

In [None]:
directors[(directors.vote_count >= 10000) & (directors.title >= 10)].nlargest(20, "vote_average")

In [None]:
# Replace missing genres with an empty string so substring filters on the
# column produce a clean boolean mask. astype(str) would instead store the
# literal text "nan", which is indistinguishable from real data.
df["genres"] = df.genres.fillna("")

In [None]:
df.loc[df.genres.str.contains("Horror")].groupby("director").revenue_musd.sum().nlargest(20)

## Most Successful Actors (Part 1)

In [None]:
df.cast

In [None]:
df

In [None]:
df.set_index("id", inplace = True)

In [None]:
df.info()

In [None]:
df.cast

In [None]:
df.cast.str.split("|", expand = True)

In [None]:
act = df.cast.str.split("|", expand = True)
act

In [None]:
act.stack().reset_index(level=1, drop=True).to_frame()

In [None]:
# Reshape to long format: one row per (movie, actor), indexed by the movie's
# index value only (the positional level from the split is dropped).
# The explicit dropna() removes the empty padding cells created by
# split(expand=True); the legacy stack() dropped them implicitly, but the new
# stack implementation keeps them.
act = act.stack().dropna().reset_index(level=1, drop=True).to_frame()

In [None]:
act

In [None]:
act.columns = ["Actor"]

In [None]:
act = act.merge(df[["title", "revenue_musd", "vote_average", "popularity"]],
                how = "left", left_index = True, right_index = True)

In [None]:
act

## Most Successful Actors (Part 2)

In [None]:
act

In [None]:
act.Actor.nunique()

In [None]:
act.Actor.unique()

In [None]:
act.Actor.value_counts().head(20)

In [None]:
plt.figure(figsize = (12, 8))
act.Actor.value_counts().head(20).plot(kind='bar', fontsize = 15)
plt.title("Most Active Actors",fontsize = 20)
plt.ylabel("Number of Movies", fontsize = 15)
plt.show()

In [None]:
# Per-actor summary; each keyword names an output column and states which
# input column is aggregated and how.
agg = act.groupby("Actor").agg(
    Total_Revenue = pd.NamedAgg(column = "revenue_musd", aggfunc = "sum"),
    Mean_Revenue = pd.NamedAgg(column = "revenue_musd", aggfunc = "mean"),
    Mean_Rating = pd.NamedAgg(column = "vote_average", aggfunc = "mean"),
    Mean_Pop = pd.NamedAgg(column = "popularity", aggfunc = "mean"),
    Total_Movies = pd.NamedAgg(column = "Actor", aggfunc = "count"),
)

In [None]:
agg.nlargest(10, "Total_Movies")

In [None]:
agg.nlargest(10, "Total_Revenue")

In [None]:
plt.figure(figsize = (12, 8))
agg.Total_Revenue.nlargest(10).plot(kind='bar', fontsize = 15)
plt.title("Total Revenue",fontsize = 20)
plt.ylabel("Revenue (in MUSD)", fontsize = 15)
plt.show()

In [None]:
agg.Mean_Revenue.nlargest(10)

In [None]:
act[act.Actor == "Ashley Jeffery"]

In [None]:
agg[agg.Total_Movies >= 10].nlargest(10, "Mean_Revenue")

In [None]:
agg[agg.Total_Movies >= 10].nlargest(10, "Mean_Rating")

In [None]:
agg[agg.Total_Movies >= 10].nlargest(10, "Mean_Pop")