# Imports

In [1]:
import pandas as pd

# Data import and first inspection

In [21]:
data = pd.read_csv(r"C:\Users\CPL17\OneDrive\Documents\Data\movies_complete.csv")\
         .dropna(subset=['revenue_musd','budget_musd'])

* **id:** The ID of the movie (clear/unique identifier).
* **title:** The Official Title of the movie.
* **tagline:** The tagline of the movie.
* **release_date:** Theatrical Release Date of the movie.
* **genres:** Genres associated with the movie.
* **belongs_to_collection:** Gives information on the movie series/franchise the particular film belongs to.
* **original_language:** The language in which the movie was originally shot in.
* **budget_musd:** The budget of the movie in million dollars.
* **revenue_musd:** The total revenue of the movie in million dollars.
* **production_companies:** Production companies involved with the making of the movie.
* **production_countries:** Countries where the movie was shot/produced in.
* **vote_count:** The number of votes by users, as counted by TMDB.
* **vote_average:** The average rating of the movie.
* **popularity:** The Popularity Score assigned by TMDB.
* **runtime:** The runtime of the movie in minutes.
* **overview:** A brief blurb of the movie.
* **spoken_languages:** Spoken languages in the film.
* **poster_path:** The URL of the poster image.
* **cast:** (Main) Actors appearing in the movie.
* **cast_size:** number of Actors appearing in the movie.
* **director:** Director of the movie.
* **crew_size:** Size of the film crew (incl. director, excl. actors).

data.columns

In [22]:
data.head(3)

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,crew_size,director
0,862,Toy Story,,1995-10-30,Animation|Comedy|Family,Toy Story Collection,en,30.0,373.554033,Pixar Animation Studios,...,7.7,21.946943,81.0,"Led by Woody, Andy's toys live happily in his ...",English,<img src='http://image.tmdb.org/t/p/w185//uXDf...,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,13,106,John Lasseter
1,8844,Jumanji,Roll the dice and unleash the excitement!,1995-12-15,Adventure|Fantasy|Family,,en,65.0,262.797249,TriStar Pictures|Teitler Film|Interscope Commu...,...,6.9,17.015539,104.0,When siblings Judy and Peter discover an encha...,English|Français,<img src='http://image.tmdb.org/t/p/w185//vgpX...,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,26,16,Joe Johnston
3,31357,Waiting to Exhale,Friends are the people who let you be yourself...,1995-12-22,Comedy|Drama|Romance,,en,16.0,81.452156,Twentieth Century Fox Film Corporation,...,6.1,3.859495,127.0,"Cheated on, mistreated and stepped on, the wom...",English,<img src='http://image.tmdb.org/t/p/w185//4wjG...,Whitney Houston|Angela Bassett|Loretta Devine|...,10,10,Forest Whitaker


## The best and the worst movies

In [14]:
# udf to get top/lowest entries in a column

def high_low(col, n, dataframe = data, bottom=False, condition=None, column_name=None):

  df = dataframe.copy()

  if not isinstance(col,str):

    if column_name is None:
      
      return 'Need column name'

    col.name = column_name
    df[col.name] = col
    col = col.name

  if condition:
    df = df.query(condition)
  
  return df.sort_values(ascending=bottom,by=col).head(n).loc[:,['title',col]]

__Movies Top 5 - Highest Revenue__

In [15]:
high_low('revenue_musd',5)

Unnamed: 0,title,revenue_musd
14448,Avatar,2787.965087
26265,Star Wars: The Force Awakens,2068.223624
1620,Titanic,1845.034188
17669,The Avengers,1519.55791
24812,Jurassic World,1513.52881


__Movies Top 5 - Highest Budget__

In [10]:
high_low('budget_musd',5)

Unnamed: 0,title,budget_musd
16986,Pirates of the Caribbean: On Stranger Tides,380.0
11743,Pirates of the Caribbean: At World's End,300.0
26268,Avengers: Age of Ultron,280.0
10985,Superman Returns,270.0
16006,Tangled,260.0


__Movies Top 5 - Highest ROI__

In [6]:
roi = data['revenue_musd']/ data['budget_musd']
roi
high_low(roi,5,condition='budget_musd > 10',column_name= 'roi')

Unnamed: 0,title,roi
1055,E.T. the Extra-Terrestrial,75.520507
255,Star Wars,70.490728
588,Pretty Woman,33.071429
18300,The Intouchables,32.806221
1144,The Empire Strikes Back,29.911111


__Movies Top 5 - Lowest Rating__

In [16]:
high_low('vote_average',5,bottom=True,condition='vote_count > 10')

Unnamed: 0,title,vote_average
25451,Foodfight!,2.3
6665,House of the Dead,2.8
8484,Eegah,2.8
3439,Battlefield Earth,3.0
6281,From Justin to Kelly,3.0


__Movies Top 5 - Most Popular__

In [17]:
high_low('popularity',5)

Unnamed: 0,title,popularity
30330,Minions,547.488298
32927,Wonder Woman,294.337037
41556,Beauty and the Beast,287.253654
42940,Baby Driver,228.032744
24187,Big Hero 6,213.849907


## Find your next Movie

#### Wrangle to useful format

In [18]:
data2 = data.copy()
columns = ['genres','cast','production_companies']

for col in columns:

  data2[col] = data[col].str.split('|')
  data2.dropna(subset=[col],inplace=True)


In [26]:
data2[:1]

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,crew_size,director
0,862,Toy Story,,1995-10-30,"[Animation, Comedy, Family]",Toy Story Collection,en,30.0,373.554033,[Pixar Animation Studios],...,7.7,21.946943,81.0,"Led by Woody, Andy's toys live happily in his ...",English,<img src='http://image.tmdb.org/t/p/w185//uXDf...,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",13,106,John Lasseter


__Search 1: Science Fiction Action Movie with Bruce Willis (sorted from high to low Rating)__

In [None]:
filter1 = lambda x: True if (('Science Fiction' in x) & ('Action' in x))  else False
cond1 = data2.genres.apply(filter)

filter2 = lambda x: True if 'Bruce Willis' in x else False
cond2 = data2.cast.apply(filter2)

data2[cond1 & cond2].sort_values('vote_average',ascending=False).head(2)

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,production_countries,vote_count,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,crew_size,director
1448,18,The Fifth Element,There is no future without it.,1997-05-07,"[Adventure, Fantasy, Action, Thriller, Science...",,en,90.0,263.92018,"[Columbia Pictures, Gaumont]",France,3962.0,7.3,24.30526,126.0,"In 2257, a taxi driver is unintentionally give...",English|svenska|Deutsch,<img src='http://image.tmdb.org/t/p/w185//fPtl...,"[Bruce Willis, Gary Oldman, Ian Holm, Milla Jo...",114,134,Luc Besson
19218,59967,Looper,"Hunted By Your Future, Haunted By Your Past",2012-09-26,"[Action, Thriller, Science Fiction]",,en,30.0,47.042,"[Endgame Entertainment, FilmDistrict, DMG Ente...",China|United States of America,4777.0,6.6,12.727269,118.0,"In the futuristic action thriller Looper, time...",English,<img src='http://image.tmdb.org/t/p/w185//sNjL...,"[Joseph Gordon-Levitt, Bruce Willis, Emily Blu...",34,42,Rian Johnson


__Search 2: Movies with Uma Thurman and directed by Quentin Tarantino (sorted from short to long runtime)__

In [29]:
cond1 = data['director'] == 'Quentin Tarantino'

cond2 = data['cast'].str.contains('Uma Thurman')

data2[cond1 & cond2].sort_values('vote_average',ascending=False).head(2)

  data2[cond1 & cond2].sort_values('vote_average',ascending=False).head(2)


Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,crew_size,director
291,680,Pulp Fiction,Just because you are a character doesn't mean ...,1994-09-10,"[Thriller, Crime]",,en,8.0,213.928762,"[Miramax Films, A Band Apart, Jersey Films]",...,8.3,140.950236,154.0,"A burger-loving hit man, his philosophical par...",English|Español|Français,<img src='http://image.tmdb.org/t/p/w185//d5iI...,"[John Travolta, Samuel L. Jackson, Uma Thurman...",54,87,Quentin Tarantino
6667,24,Kill Bill: Vol. 1,Go for the kill.,2003-10-10,"[Action, Crime]",Kill Bill Collection,en,30.0,180.949,"[Miramax Films, A Band Apart, Super Cool ManChu]",...,7.7,25.261865,111.0,An assassin is shot at the altar by her ruthle...,English|日本語|Français,<img src='http://image.tmdb.org/t/p/w185//v7Ta...,"[Uma Thurman, Lucy Liu, Vivica A. Fox, Daryl H...",36,161,Quentin Tarantino


__Search 3: Most Successful Pixar Studio Movies between 2010 and 2015 (sorted from high to low Revenue)__

In [None]:
cond1 = data.production_companies.str.contains('Pixar').fillna(False)
cond2 = data.release_date.between('2010-01-01','2015-01-01')

data2.loc[cond1 & cond2]

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,production_countries,vote_count,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,crew_size,director
15236,10193,Toy Story 3,No toy gets left behind.,2010-06-16,"[Animation, Family, Comedy]",Toy Story Collection,en,200.0,1066.969703,"[Walt Disney Pictures, Pixar Animation Studios]",United States of America,4710.0,7.6,16.96647,103.0,"Woody, Buzz, and the rest of Andy's toys haven...",English|Español,<img src='http://image.tmdb.org/t/p/w185//amY0...,"[Tom Hanks, Tim Allen, Ned Beatty, Joan Cusack...",45,38,Lee Unkrich
17220,49013,Cars 2,Ka-ciao!,2011-06-11,"[Animation, Family, Adventure, Comedy]",Cars Collection,en,200.0,559.852396,"[Walt Disney Pictures, Pixar Animation Studios]",United States of America,2088.0,5.8,13.693002,106.0,Star race car Lightning McQueen and his pal Ma...,English|日本語|Italiano|Français,<img src='http://image.tmdb.org/t/p/w185//okIz...,"[Owen Wilson, Larry the Cable Guy, Michael Cai...",47,40,John Lasseter
18900,62177,Brave,Change your fate.,2012-06-21,"[Animation, Adventure, Comedy, Family, Action,...",,en,185.0,538.983207,"[Walt Disney Pictures, Pixar Animation Studios]",United States of America,4760.0,6.7,15.876341,93.0,Brave is set in the mystical Scottish Highland...,English,<img src='http://image.tmdb.org/t/p/w185//8l0p...,"[Kelly Macdonald, Billy Connolly, Emma Thompso...",15,44,Brenda Chapman
20888,62211,Monsters University,School never looked this scary.,2013-06-20,"[Animation, Family]","Monsters, Inc. Collection",en,200.0,743.559607,"[Walt Disney Pictures, Pixar Animation Studios]",United States of America,3622.0,7.0,16.267502,104.0,A look at the relationship between Mike and Su...,English,<img src='http://image.tmdb.org/t/p/w185//tyHH...,"[Billy Crystal, John Goodman, Steve Buscemi, H...",24,13,Dan Scanlon


__Search 4: Action or Thriller Movie with original language English and minimum Rating of 7.5 (most recent movies first)__

In [None]:
cond1 = data.genres.str.contains('Action') | data.genres.str.contains('Thriller')
cond2 = data.original_language == 'en'

data2.loc[cond1 & cond2].loc[data2.vote_average > 7.5].head(2)

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,production_countries,vote_count,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,crew_size,director
5,949,Heat,A Los Angeles Crime Saga,1995-12-15,Action|Crime|Drama|Thriller,,en,60.0,187.436818,Regency Enterprises|Forward Pass|Warner Bros.,United States of America,1886.0,7.7,17.924927,170.0,"Obsessive master thief, Neil McCauley leads a ...",English|Español,<img src='http://image.tmdb.org/t/p/w185//lbf2...,Al Pacino|Robert De Niro|Val Kilmer|Jon Voight...,65,71,Michael Mann
46,807,Se7en,Seven deadly sins. Seven ways to die.,1995-09-22,Crime|Mystery|Thriller,,en,33.0,327.311859,New Line Cinema|Juno Pix|Cecchi Gori Pictures,United States of America,5915.0,8.1,18.45743,127.0,Two homicide detectives are on a desperate hun...,English,<img src='http://image.tmdb.org/t/p/w185//GQP6...,Brad Pitt|Morgan Freeman|Gwyneth Paltrow|John ...,49,91,David Fincher


## Are Franchises more successful?

In [30]:
new_data = data.copy()

# Add Franchise and ROI columns 
new_data['franchise'] = new_data['belongs_to_collection'].isna().apply(lambda x: 'Franchise' if x == False else 'Not')
new_data['ROI'] = new_data.budget_musd.div(new_data.revenue_musd)

__Franchise vs. Stand-alone: Average Revenue__

In [None]:
new_data.groupby(['franchise'])['revenue_musd'].mean()

franchise
Franchise    195.700276
Not           59.457777
Name: revenue_musd, dtype: float64

__Franchise vs. Stand-alone: Multiple Metrics__

In [None]:
agg_dict = {'ROI':pd.Series.median, 
       'budget_musd':pd.Series.mean,
       'popularity': pd.Series.mean,
       'vote_average':pd.Series.mean
       }

new_data.groupby('franchise').agg(agg_dict)

Unnamed: 0_level_0,ROI,budget_musd,popularity,vote_average
franchise,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Franchise,0.2696,46.821294,13.759869,6.201555
Not,0.617399,26.507128,8.614566,6.305118


## Most Successful Franchises

In [33]:
franchises = new_data.groupby("belongs_to_collection").agg({"title":"count", "budget_musd": ["sum", "mean"], 
                                                      "revenue_musd": ["sum", "mean"],
                                                      "vote_average": "mean", "popularity": "mean",
                                                      "ROI":"median", 
                                                      "vote_count":"mean"})

franchises.index.name = "Franchise"

In [34]:
# Largest average budget for a franchise 

franchises.nlargest(3, ('budget_musd','mean'))

Unnamed: 0_level_0,title,budget_musd,budget_musd,revenue_musd,revenue_musd,vote_average,popularity,ROI,vote_count
Unnamed: 0_level_1,count,sum,mean,sum,mean,mean,mean,median,mean
Franchise,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Tangled Collection,1,260.0,260.0,591.794936,591.794936,7.4,14.684761,0.439341,3419.0
Pirates of the Caribbean Collection,5,1250.0,250.0,4521.576826,904.315365,6.88,53.972237,0.289603,5016.0
The Avengers Collection,2,500.0,250.0,2924.961604,1462.480802,7.35,63.633534,0.172005,9454.0


In [None]:
franchises[franchises[('budget_musd','mean')] > 250]

Unnamed: 0_level_0,title,budget_musd,budget_musd,revenue_musd,revenue_musd,vote_average,popularity,ROI,vote_count
Unnamed: 0_level_1,count,sum,mean,sum,mean,mean,mean,median,mean
belongs_to_collection,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Tangled Collection,1,260.0,260.0,591.794936,591.794936,7.4,14.684761,0.439341,3419.0


## Most Successful Directors

In [36]:
director_facts = new_data.groupby('director').agg({'title':'count','revenue_musd':'sum','vote_average':'mean'})

In [37]:
cols = director_facts.columns
summary_df = pd.DataFrame()
for stat in cols:
  temp = director_facts.nlargest(3,stat).reset_index()
  summary_df[stat + '_rankings'] = temp[['director', stat]].apply(tuple, axis=1)

In [38]:
summary_df

Unnamed: 0,title_rankings,revenue_musd_rankings,vote_average_rankings
0,"(Steven Spielberg, 30)","(Steven Spielberg, 9256.621422)","(Aditya Chopra, 9.1)"
1,"(Clint Eastwood, 27)","(Peter Jackson, 6528.244659)","(Alain Fresnot, 9.0)"
2,"(Woody Allen, 24)","(Michael Bay, 6437.466781)","(Subodh Bhave, 9.0)"
