## Give Me The Next AAA Title
# Most Popular Actors (2011 - present)
****

<br>
by Dustin Reyes
<br>
<br>
Prepared for:
<br>
Mynt (Globe Fintech Innovations, Inc.)
<br>
<br>

In [1]:
import imdb
import pickle
import numpy as np
import pandas as pd
import warnings
import datetime as dt
from tqdm import tqdm
from ast import literal_eval
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact
from random import randint
from time import sleep
from pytrends.request import TrendReq
warnings.filterwarnings('ignore')
tqdm.pandas()
%matplotlib inline

In [2]:
df = pd.read_csv('data/df_processed_total1.csv')

In [3]:
# df_names = pd.read_csv('data/name.basics.tsv.gz', compression='gzip', header=0, sep='\t', quotechar='"')

In [4]:
df.head()

Unnamed: 0,tconst,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,title,release,director,...,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,leadActor
0,tt0369610,nm0415425,Rick Jaffa,1956,\N,writer,"tt1318514,tt0369610,tt4566758,tt2103281",Jurassic World,2015-06-12,Colin Trevorrow,...,movie,Jurassic World,Jurassic World,0,2015,124,Action,7.0,584990,nm0415425
1,tt0369610,nm0000341,Michael Crichton,1942,2008,writer,"tt0117998,tt0107290,tt0070909,tt0108757",Jurassic World,2015-06-12,Colin Trevorrow,...,movie,Jurassic World,Jurassic World,0,2015,124,Action,7.0,584990,nm0415425
2,tt0369610,nm2081046,Derek Connolly,\N,\N,writer,"tt1862079,tt5884052,tt0369610,tt3731562",Jurassic World,2015-06-12,Colin Trevorrow,...,movie,Jurassic World,Jurassic World,0,2015,124,Action,7.0,584990,nm0415425
3,tt0369610,nm0798646,Amanda Silver,1963,\N,writer,"tt0369610,tt1318514,tt4566758,tt2103281",Jurassic World,2015-06-12,Colin Trevorrow,...,movie,Jurassic World,Jurassic World,0,2015,124,Action,7.0,584990,nm0415425
4,tt0369610,nm0339460,Judy Greer,1975,\N,actress,"tt0369610,tt0988595,tt0337563,tt0478970",Jurassic World,2015-06-12,Colin Trevorrow,...,movie,Jurassic World,Jurassic World,0,2015,124,Action,7.0,584990,nm0415425


In [5]:
df.columns

Index(['tconst', 'nconst', 'primaryName', 'birthYear', 'deathYear',
       'primaryProfession', 'knownForTitles', 'title', 'release', 'director',
       'budget', 'opening', 'gross', 'worldwide_gross', 'metacritic_score',
       'mpaa_rating', 'budget_mil', 'opening_mil', 'titleType', 'primaryTitle',
       'originalTitle', 'isAdult', 'startYear', 'runtimeMinutes', 'genres',
       'averageRating', 'numVotes', 'leadActor'],
      dtype='object')

In [6]:
# One row per distinct (title, release, startYear), ordered by the `release`
# column and renumbered from 0.
df_movies = (
    df[['title', 'release', 'startYear']]
    .drop_duplicates()
    .sort_values('release')
    .reset_index(drop=True)
)

In [7]:
df_movies.head()

Unnamed: 0,title,release,startYear
0,Campus Radio,2011-01-01,2011
1,Season of the Witch,2011-01-07,2011
2,The Green Hornet,2011-01-14,2011
3,The Mechanic,2011-01-28,2011
4,The Rite,2011-01-28,2011


In [8]:
def get_top_three_actors(title, top_n=3):
    """Look up a movie on IMDb by title and return its leading cast names.

    Parameters
    ----------
    title : str
        Title passed to ``ia.search_movie``. Only the first search hit is
        used, so a title shared by several films may resolve to a different
        movie than the one intended (no year disambiguation is done here).
    top_n : int, default 3
        How many entries to take from the start of the movie's ``cast`` list.

    Returns
    -------
    list of str
        Up to ``top_n`` actor names. Empty when the search has no hits, the
        movie cannot be fetched, or the movie has no ``cast`` entry. If the
        lookup raises, an empty string is appended to whatever was collected
        so far (``['']`` in the usual case) as a failure marker.
    """
    ia = imdb.IMDb()
    actors = []
    try:
        search_results = ia.search_movie(title)
        if search_results:
            movie = ia.get_movie(search_results[0].movieID)
            if movie:
                # `cast` can be missing; treat that as "no actors" rather than
                # relying on a TypeError from slicing None.
                cast = movie.get('cast') or []
                actors.extend(actor['name'] for actor in cast[:top_n])
    except Exception:
        # Best-effort lookup: keep the '' failure marker, but let
        # KeyboardInterrupt/SystemExit through so a long batch can be stopped.
        actors.append('')
    return actors

## Top Three Actors of a Movie
Get the top three actors (the first three cast members listed on IMDb) for each movie in the dataset.

In [9]:
# Takes a long time!
# Uncomment to run
# df_movies['top3_actors'] = df_movies['title'].progress_apply(get_top_three_actors)

In [10]:
# We save it into a dataframe for faster access
# df_movies.to_csv('data/lead_actors.csv', index = False)

In [11]:
df_movies2 = pd.read_csv('data/lead_actors.csv')

In [12]:
# Reassign instead of dropping in place and tolerate already-removed columns,
# so re-running this cell does not raise a KeyError.
df_movies2 = df_movies2.drop(columns=['release', 'startYear'], errors='ignore')

In [13]:
df_movies3 = pd.read_csv('data/titles_complete_info.csv')
# df_movies3.head()

In [14]:
# Join the columns of df_movies2 onto df_movies3 by `title` (merge's default
# inner join), then order the rows by `release` and renumber the index.
# NOTE(review): this cell reassigns df_movies3 from itself, so running it twice
# merges twice; re-run the read_csv cell for df_movies3 first.
df_movies3 = df_movies3.merge(df_movies2, on='title').sort_values(
    by='release').reset_index(drop=True)

In [15]:
df_movies3.dropna(inplace = True)

In [16]:
df_movies3.head(3)

Unnamed: 0,title,release,director,budget,opening,gross,worldwide_gross,metacritic_score,mpaa_rating,budget_mil,...,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,leadActor,top3_actors
1,Season of the Witch,2011-01-07,Dominic Sena,40000000,10612375,24827228,91627228.0,28.0,PG-13,40.0,...,Season of the Witch,Season of the Witch,0,2011,95,Action,5.4,90902,nm0651414,"['Robert De Niro', 'Harvey Keitel', 'David Pro..."
2,The Green Hornet,2011-01-14,Michel Gondry,120000000,33526876,98780042,227817248.0,39.0,PG-13,120.0,...,The Green Hornet,The Green Hornet,0,2011,119,Action,5.8,155886,nm0006133,"['Seth Rogen', 'Jay Chou', 'Cameron Diaz']"
3,The Mechanic,2011-01-28,Simon West,40000000,11422006,29121498,76130093.0,49.0,R,40.0,...,The Mechanic,The Mechanic,0,2011,93,Action,6.6,152076,nm0153587,"['Jason Statham', 'Ben Foster', 'Tony Goldwyn']"


In [17]:
df_movies3.columns

Index(['title', 'release', 'director', 'budget', 'opening', 'gross',
       'worldwide_gross', 'metacritic_score', 'mpaa_rating', 'budget_mil',
       'opening_mil', 'tconst', 'titleType', 'primaryTitle', 'originalTitle',
       'isAdult', 'startYear', 'runtimeMinutes', 'genres', 'averageRating',
       'numVotes', 'leadActor', 'top3_actors'],
      dtype='object')

In [18]:
# The CSV stores each cast list as its string repr; parse it back into a list.
# Values that are no longer strings (this cell was already run) pass through
# unchanged instead of making literal_eval raise.
df_movies3['top3_actors'] = df_movies3['top3_actors'].apply(
    lambda x: literal_eval(x) if isinstance(x, str) else x)

In [19]:
col_names = ['title', 'top3_actors', 'gross', 'release', 'startYear']
# One output row per (movie, actor) pair: the movie's title, gross, release and
# startYear are repeated for every name in its top3_actors list.
expanded_data = [
    [title, actor_name.strip(), gross, release, start_year]
    for title, cast_names, gross, release, start_year
    in df_movies3[col_names].itertuples(index=False, name=None)
    for actor_name in cast_names
]
df_movie_expanded = pd.DataFrame(
    expanded_data,
    columns=['title', 'actors', 'gross', 'release', 'startYear'],
)
df_movie_expanded.head()

Unnamed: 0,title,actors,gross,release,startYear
0,Season of the Witch,Robert De Niro,24827228,2011-01-07,2011
1,Season of the Witch,Harvey Keitel,24827228,2011-01-07,2011
2,Season of the Witch,David Proval,24827228,2011-01-07,2011
3,The Green Hornet,Seth Rogen,98780042,2011-01-14,2011
4,The Green Hornet,Jay Chou,98780042,2011-01-14,2011


In [20]:
# df_movie_expanded[df_movie_expanded['title'].str.lower().str.contains('avengers')]

In [21]:
# Total gross per (startYear, actor).
df_movie_actorgross = (
    df_movie_expanded
    .groupby(['startYear', 'actors'])['gross']
    .sum()
    .reset_index()
)
# Drop the blank name (the marker get_top_three_actors appends when a lookup
# fails) and 'Cole Konis'.
# NOTE(review): the reason for excluding 'Cole Konis' is not recorded here.
df_movie_actorgross = df_movie_actorgross[
    ~df_movie_actorgross['actors'].isin(['', 'Cole Konis'])]

In [22]:
# Keep, for each startYear, the 10 rows with the largest `gross`, then flatten
# the result back to a plain 0..n-1 index.
df_movie_actorgross = df_movie_actorgross.groupby(['startYear']).apply(lambda x: x.nlargest(10,['gross'])).reset_index(drop=True)

In [23]:
df_movie_actorgross.head()

Unnamed: 0,startYear,actors,gross
0,2011,Alan Rickman,381409310
1,2011,Michael Gambon,381409310
2,2011,Ralph Fiennes,381409310
3,2011,Johnny Depp,364549409
4,2011,Josh Duhamel,352390543


In [24]:
def filter_data(year=2011):
    """Plot the rows of ``df_movie_actorgross`` for one year as a bar chart.

    Rows whose ``startYear`` equals ``year`` are drawn with ``gross`` on the
    x-axis and ``actors`` on the y-axis, and the figure is shown immediately.
    """
    is_selected_year = df_movie_actorgross['startYear'] == year
    yearly_rows = df_movie_actorgross[is_selected_year]
    sns.barplot(data=yearly_rows, x="gross", y="actors")
    plt.show()

In [25]:
# Distinct startYear values, in order of first appearance; used as the options
# of the year dropdowns below.
years = list(df_movie_actorgross['startYear'].unique())

In [26]:
interactive = interact(filter_data, year = years)

interactive(children=(Dropdown(description='year', options=(2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 20…

## Getting Google Trends Data

In [27]:
df_movie_actorgross.head()

Unnamed: 0,startYear,actors,gross
0,2011,Alan Rickman,381409310
1,2011,Michael Gambon,381409310
2,2011,Ralph Fiennes,381409310
3,2011,Johnny Depp,364549409
4,2011,Josh Duhamel,352390543


In [28]:
# Exclude 2021. Take an explicit copy: the following cells add columns to this
# frame, and assigning into a filtered slice triggers SettingWithCopyWarning
# (silenced globally in this notebook) and leaves ownership ambiguous.
df_movie_actorgross2 = df_movie_actorgross[df_movie_actorgross['startYear'] != 2021].copy()

In [29]:
df_movie_actorgross2['startYear'] = df_movie_actorgross2['startYear'].astype(str)

In [30]:
df_movie_actorgross2['Year1'] = df_movie_actorgross2['startYear'] + '-' + '01' + '-' + '01'

In [31]:
# Turn startYear back into an int, then derive the following year (as a string)
# and its "<year>-01-01" date string.
df_movie_actorgross2['startYear'] = df_movie_actorgross2['startYear'].astype(int)
df_movie_actorgross2['nextYear'] = (df_movie_actorgross2['startYear'] + 1).astype(str)
df_movie_actorgross2['Year2'] = df_movie_actorgross2['nextYear'] + '-01-01'

In [32]:
pytrend = TrendReq(hl='en-US', tz=360)

In [33]:
# (start, end) year pairs consumed by get_trends: each pair selects the rows
# whose startYear satisfies start <= startYear < end.
list_tups = [(2011, 2012), (2012, 2013), (2013, 2014), (2014, 2015),
             (2015, 2016), (2016, 2017),
             (2017, 2018), (2018, 2019), (2019, 2020)]


def get_trends(year_lst, df):
    """Attach a Google Trends search-interest total to every actor row.

    For each ``(start, end)`` pair in ``year_lst`` the rows of ``df`` with
    ``start <= startYear < end`` are selected, and each actor in that slice is
    queried on its own through the module-level ``pytrend`` client
    (``geo='US'``, ``cat=0``). The timeframe is ``"<Year1> <Year2>"`` taken
    from the first distinct ``Year1``/``Year2`` value in the slice. The values
    returned for an actor are summed into ``search_interests``.

    Parameters
    ----------
    year_lst : iterable of (start, end)
        Year windows to process, in order.
    df : pandas.DataFrame
        Must contain ``startYear``, ``actors``, ``Year1``, ``nextYear`` and
        ``Year2``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The selected rows of all windows concatenated with a fresh index,
        without ``Year1``/``nextYear``/``Year2`` and with a
        ``search_interests`` column. An actor for whom Trends returns no data
        gets 0, so totals always stay aligned with their rows. Windows that
        match no rows are skipped; if nothing matches at all, an empty frame
        with the same columns is returned.
    """
    helper_cols = ['Year1', 'nextYear', 'Year2']
    dataframes = []
    for window in year_lst:
        start_year, end_year = window[0], window[1]
        print('Processing from: ', str(start_year) + ' to ' + str(end_year))
        in_window = (df['startYear'] >= start_year) & (df['startYear'] < end_year)
        # Copy so the column added below never touches the caller's frame.
        test_df = df[in_window].copy()
        if test_df.empty:
            continue

        # Same for every actor in the window, so build it once.
        timeframe = test_df['Year1'].unique()[0] + " " + test_df['Year2'].unique()[0]

        totals = []
        for actor in test_df['actors'].values.tolist():
            pytrend.build_payload(
                kw_list=[actor],
                cat=0,
                timeframe=timeframe,
                geo='US')
            data = pytrend.interest_over_time()
            if data.empty:
                # Keep one entry per row even when Trends has nothing to report.
                totals.append(0)
            else:
                interest = data.drop(labels=['isPartial'], axis='columns')
                totals.append(interest.iloc[:, 0].sum())

        test_df['search_interests'] = totals
        dataframes.append(test_df.drop(helper_cols, axis=1))

    if not dataframes:
        return df.iloc[0:0].drop(helper_cols, axis=1).assign(
            search_interests=pd.Series(dtype='int64'))

    final = pd.concat(dataframes, ignore_index=True)
    return final

In [34]:
# df_lists = get_trends(list_tups, df_movie_actorgross2)

In [35]:
# df_lists.to_csv('data/actor_searches.csv', index = False)

In [36]:
df_lists = pd.read_csv('data/actor_searches.csv')

In [37]:
df_lists.head()

Unnamed: 0,startYear,actors,gross,search_interests
0,2011,Alan Rickman,381409310,1272
1,2011,Michael Gambon,381409310,1143
2,2011,Ralph Fiennes,381409310,1119
3,2011,Johnny Depp,364549409,2552
4,2011,Josh Duhamel,352390543,1812


In [38]:
df_lists[df_lists['startYear'] == 2018].sort_values(
    by='search_interests', ascending=False)

Unnamed: 0,startYear,actors,gross,search_interests
70,2018,Chris Hemsworth,724635195,1736
74,2018,Mark Ruffalo,678815482,1523
75,2018,Robert Downey Jr.,678815482,1509
77,2018,Holly Hunter,608581744,1244
76,2018,Craig T. Nelson,608581744,1191
79,2018,Bryce Dallas Howard,417719760,1167
78,2018,Sarah Vowell,608581744,884
73,2018,Michael B. Jordan,700426566,669
72,2018,Lupita Nyong'o,700426566,621
71,2018,Chadwick Boseman,700426566,619


Therefore, among 2018's top-grossing actors, the most popular by search interest is Chris Hemsworth, with a total search interest of 1,736.

### Most Popular Actor of the Film with the Best Opening Performance

In [39]:
def get_top_15_actors(title, top_n=15):
    """Look up a movie on IMDb by title and return its leading cast names.

    Parameters
    ----------
    title : str
        Title passed to ``ia.search_movie``. Only the first search hit is
        used, so a title shared by several films may resolve to a different
        movie than the one intended (no year disambiguation is done here).
    top_n : int, default 15
        How many entries to take from the start of the movie's ``cast`` list.

    Returns
    -------
    list of str
        Up to ``top_n`` actor names. Empty when the search has no hits, the
        movie cannot be fetched, or the movie has no ``cast`` entry. If the
        lookup raises, an empty string is appended to whatever was collected
        so far (``['']`` in the usual case) as a failure marker.
    """
    ia = imdb.IMDb()
    actors = []
    try:
        search_results = ia.search_movie(title)
        if search_results:
            movie = ia.get_movie(search_results[0].movieID)
            if movie:
                # `cast` can be missing; treat that as "no actors" rather than
                # relying on a TypeError from slicing None.
                cast = movie.get('cast') or []
                actors.extend(actor['name'] for actor in cast[:top_n])
    except Exception:
        # Best-effort lookup: keep the '' failure marker, but let
        # KeyboardInterrupt/SystemExit through so the caller can be stopped.
        actors.append('')
    return actors

In [40]:
def get_trends_specific(list_names, year):
    """Total Google Trends search interest per name for one calendar year.

    Each name is queried on its own through the module-level ``pytrend``
    client (``geo='US'``, ``cat=0``) over the timeframe
    ``"<year>-01-01 <year+1>-01-01"``, and the returned values are summed.

    Parameters
    ----------
    list_names : list of str
        Names to query. Empty strings (the failure marker produced by
        ``get_top_15_actors``) are skipped instead of being sent as keywords.
    year : int or str
        Start year of the one-year window; anything ``int()`` accepts.

    Returns
    -------
    pandas.DataFrame
        Indexed by name, with ``search_interest`` (summed values) and ``year``
        (the value passed in). Names for which Trends returns no data are left
        out. If no name yields data the frame is empty but still has both
        columns, rather than raising from ``pd.concat`` on an empty list.
    """
    start_year = int(year)
    timeframe = f"{start_year}-01-01 {start_year + 1}-01-01"

    names = []
    totals = []
    for name in list_names:
        if not name:
            continue
        pytrend.build_payload(
            kw_list=[name],
            cat=0,
            timeframe=timeframe,
            geo='US')
        data = pytrend.interest_over_time()
        if data.empty:
            continue
        interest = data.drop(labels=['isPartial'], axis='columns')
        names.append(name)
        totals.append(interest.iloc[:, 0].sum())

    df2 = pd.DataFrame({'search_interest': totals}, index=names)
    df2['year'] = year
    return df2

In [41]:
def movie_analyze(df, year):
    """Rank the cast of a year's best-opening movie by search interest.

    Picks the title with the largest ``opening`` among rows of ``df`` whose
    ``startYear`` equals ``year``, prints it, fetches its cast with
    ``get_top_15_actors`` and their search totals with ``get_trends_specific``.

    Returns a ``(year, title, interest_frame)`` tuple, where the frame is
    sorted by ``search_interest`` in descending order.
    """
    same_year = df.loc[df['startYear'] == year]
    by_opening = same_year.sort_values(by='opening', ascending=False)
    best_opening = by_opening['title'].head().tolist()[0]
    print(best_opening)

    df_interest = get_trends_specific(get_top_15_actors(best_opening), year)
    df_interest.sort_values(by='search_interest', ascending=False, inplace=True)
    return year, best_opening, df_interest

In [42]:
# year, movie_title, df_analysis = movie_analyze(df_movies3, year = 2019)

In [43]:
def top_search_visualizer(year_choice):
    """Plot search interest for the cast of a year's best-opening film.

    Runs ``movie_analyze`` on ``df_movies3`` for ``year_choice`` and draws the
    returned frame on a 15x10 figure, with ``search_interest`` on the x-axis
    and the frame's index (the actor names) on the y-axis.
    """
    year, movie_title, df_analysis = movie_analyze(df_movies3, year=year_choice)

    fig, ax = plt.subplots(figsize=(15, 10))
    sns.barplot(data=df_analysis, x="search_interest", y=df_analysis.index, ax=ax)

    chart_title = f"{year}'s Most Searched Actors for the Film with the Best Opening Performance: {movie_title}"
    ax.set(title=chart_title,
           xlabel='Search Interests',
           ylabel='Most Searched Actors')
    plt.show()

In [44]:
interactive2 = interact(top_search_visualizer, year_choice = years)

interactive(children=(Dropdown(description='year_choice', options=(2011, 2012, 2013, 2014, 2015, 2016, 2017, 2…

For 2019, the movie with the best opening performance is the highly anticipated **Avengers: Endgame**. Accordingly, I pulled information on the cast of Avengers: Endgame, since the actors in this movie were also among the most popular actors in terms of search interest for that year.

The previous analysis only considers the top 3 actors of each movie in a given year, but other actors outside that list were also popular during that year. This is the main purpose of this analysis, in which I pulled the top 15 actors of the movie with the highest opening. For 2019's Avengers: Endgame, we can observe that **Scarlett Johansson** had the highest search interest. Possible reasons include what happened to her character in the movie, or wider Hollywood show-business coverage of her.

In [47]:
# df_movies3.to_csv('data2/data_imdb_complete.csv', index = False)