## Give Me The Next AAA Title
# Predictive Modeling
### Movie Profitability Prediction Based on Popularity
****

<br>
by Dustin Reyes
<br>
<br>
Prepared for:
<br>
Mynt (Globe Fintech Innovations, Inc.)
<br>
<br>

In [1]:
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
from ast import literal_eval
import matplotlib
import matplotlib.pyplot as plt
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, make_scorer, accuracy_score
from collections import Counter
from xgboost import XGBClassifier, plot_importance
from pytrends.request import TrendReq

import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier

import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')
tqdm.pandas()
%matplotlib inline

In [2]:
# Load the movie dataset from data2/data_imdb_complete.csv and preview the
# first rows.
df = pd.read_csv('data2/data_imdb_complete.csv')
df.head()

Unnamed: 0,title,release,director,budget,opening,gross,worldwide_gross,metacritic_score,mpaa_rating,budget_mil,...,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,leadActor,top3_actors
0,Season of the Witch,2011-01-07,Dominic Sena,40000000,10612375,24827228,91627228.0,28.0,PG-13,40.0,...,Season of the Witch,Season of the Witch,0,2011,95,Action,5.4,90902,nm0651414,"['Robert De Niro', 'Harvey Keitel', 'David Pro..."
1,The Green Hornet,2011-01-14,Michel Gondry,120000000,33526876,98780042,227817248.0,39.0,PG-13,120.0,...,The Green Hornet,The Green Hornet,0,2011,119,Action,5.8,155886,nm0006133,"['Seth Rogen', 'Jay Chou', 'Cameron Diaz']"
2,The Mechanic,2011-01-28,Simon West,40000000,11422006,29121498,76130093.0,49.0,R,40.0,...,The Mechanic,The Mechanic,0,2011,93,Action,6.6,152076,nm0153587,"['Jason Statham', 'Ben Foster', 'Tony Goldwyn']"
3,The Rite,2011-01-28,Mikael Håfström,37000000,14789393,33047633,96560591.0,38.0,PG-13,37.0,...,The Rite,The Rite,0,2011,114,Drama,6.0,91850,nm1246087,"['Anthony Hopkins', ""Colin O'Donoghue"", 'Alice..."
4,Sanctum,2011-02-04,Alister Grierson,30000000,9447930,23209310,108609310.0,42.0,R,30.0,...,Sanctum,Sanctum,0,2011,108,Action,5.9,53868,nm1718906,"['Richard Roxburgh', 'Ioan Gruffudd', 'Rhys Wa..."


In [3]:
# Remove alternate-title, identifier and other listed columns from `df`.
# inplace=True mutates `df` directly (the call returns None), so the result
# is intentionally not reassigned.
df.drop(['originalTitle', 'primaryTitle', 'isAdult',
         'leadActor', 'tconst', 'director', 'budget_mil', 'opening_mil', 'titleType'], axis=1, inplace=True)

In [4]:
# Preview the first rows of `df` in its current state.
df.head()

Unnamed: 0,title,release,budget,opening,gross,worldwide_gross,metacritic_score,mpaa_rating,startYear,runtimeMinutes,genres,averageRating,numVotes,top3_actors
0,Season of the Witch,2011-01-07,40000000,10612375,24827228,91627228.0,28.0,PG-13,2011,95,Action,5.4,90902,"['Robert De Niro', 'Harvey Keitel', 'David Pro..."
1,The Green Hornet,2011-01-14,120000000,33526876,98780042,227817248.0,39.0,PG-13,2011,119,Action,5.8,155886,"['Seth Rogen', 'Jay Chou', 'Cameron Diaz']"
2,The Mechanic,2011-01-28,40000000,11422006,29121498,76130093.0,49.0,R,2011,93,Action,6.6,152076,"['Jason Statham', 'Ben Foster', 'Tony Goldwyn']"
3,The Rite,2011-01-28,37000000,14789393,33047633,96560591.0,38.0,PG-13,2011,114,Drama,6.0,91850,"['Anthony Hopkins', ""Colin O'Donoghue"", 'Alice..."
4,Sanctum,2011-02-04,30000000,9447930,23209310,108609310.0,42.0,R,2011,108,Action,5.9,53868,"['Richard Roxburgh', 'Ioan Gruffudd', 'Rhys Wa..."


In [5]:
# Show the column names currently present in `df`.
df.columns

Index(['title', 'release', 'budget', 'opening', 'gross', 'worldwide_gross',
       'metacritic_score', 'mpaa_rating', 'startYear', 'runtimeMinutes',
       'genres', 'averageRating', 'numVotes', 'top3_actors'],
      dtype='object')

In [6]:
# Year immediately before each movie's startYear.
# NOTE(review): presumably intended for aligning with pre-release search
# interest; it is not referenced by the cells shown here -- confirm its use.
df['previousYear'] = df['startYear'] - 1

In [7]:
# Load the per-title search-interest table that is merged into `df` on
# 'title' in the next cell.
df2 = pd.read_csv('data2/movie_popularity2.csv')

In [8]:
# Join the search-interest table onto the movie data by title (DataFrame.merge
# defaults to an inner join), then discard the 'year' column contributed by
# df2.
# NOTE(review): joining on 'title' alone duplicates rows whenever a title
# occurs more than once in either frame (e.g. remakes) -- confirm titles are
# unique or add a year key to the join.
df3 = df.merge(df2, on = 'title')
df3.drop('year', axis = 1, inplace = True)

In [9]:
# Preview the first rows of the merged frame.
df3.head()

Unnamed: 0,title,release,budget,opening,gross,worldwide_gross,metacritic_score,mpaa_rating,startYear,runtimeMinutes,genres,averageRating,numVotes,top3_actors,previousYear,search_interest
0,Season of the Witch,2011-01-07,40000000,10612375,24827228,91627228.0,28.0,PG-13,2011,95,Action,5.4,90902,"['Robert De Niro', 'Harvey Keitel', 'David Pro...",2010,943
1,The Green Hornet,2011-01-14,120000000,33526876,98780042,227817248.0,39.0,PG-13,2011,119,Action,5.8,155886,"['Seth Rogen', 'Jay Chou', 'Cameron Diaz']",2010,694
2,The Mechanic,2011-01-28,40000000,11422006,29121498,76130093.0,49.0,R,2011,93,Action,6.6,152076,"['Jason Statham', 'Ben Foster', 'Tony Goldwyn']",2010,1260
3,The Rite,2011-01-28,37000000,14789393,33047633,96560591.0,38.0,PG-13,2011,114,Drama,6.0,91850,"['Anthony Hopkins', ""Colin O'Donoghue"", 'Alice...",2010,1036
4,Sanctum,2011-02-04,30000000,9447930,23209310,108609310.0,42.0,R,2011,108,Action,5.9,53868,"['Richard Roxburgh', 'Ioan Gruffudd', 'Rhys Wa...",2010,1038


### Getting the Average Search Interest of Each Movie's Top 3 Actors

In [10]:
# Shared Google Trends client used by get_trends_specific. Per the pytrends
# README, `hl` is the host language and `tz` the timezone offset in minutes
# (360 corresponds to US CST).
pytrend = TrendReq(hl='en-US', tz=360)

In [11]:
# Sample list of actor names (the names displayed for the first row's
# top3_actors); not referenced by the cells shown here.
test = ['Robert De Niro', 'Harvey Keitel', 'David Proval']

In [12]:
def get_trends_specific(title, list_names, year):
    """Return the mean yearly Google Trends interest of a movie's actors.

    Each name in ``list_names`` is queried on its own through the
    module-level ``pytrend`` client, restricted to US searches between
    January 1 of ``year`` and January 1 of the following year.  The interest
    values returned for a name are summed over that window, and the sums are
    averaged across the names that returned data.

    Parameters
    ----------
    title : str
        Movie title; passed through unchanged to label the result.
    list_names : list of str
        Search terms (actor names) to query, one request per name.
    year : int or str
        Calendar year to query; anything ``int()`` accepts.

    Returns
    -------
    list
        ``[[title, mean_interest]]`` -- a single row suitable for
        ``pd.DataFrame(rows, columns=[...])``.

    Raises
    ------
    ValueError
        If no name returned any data (including an empty ``list_names``).
        The same exception type was previously raised by ``pd.concat`` with
        the opaque message "No objects to concatenate".
    """
    start_year = int(year)
    timeframe = '{}-01-01 {}-01-01'.format(start_year, start_year + 1)

    # One total per queried name; summing each response separately keeps
    # repeated names from colliding as duplicate column labels.
    totals = []
    for name in list_names:
        pytrend.build_payload(
            kw_list=[name],
            cat=0,
            timeframe=timeframe,
            geo='US')
        data = pytrend.interest_over_time()
        if data.empty:
            # Names with no data are left out of the average rather than
            # being counted as zero interest.
            continue
        # 'isPartial' is a flag column, not an interest value; tolerate its
        # absence instead of failing with KeyError.
        interest = data.drop(columns=['isPartial'], errors='ignore')
        totals.extend(interest.sum().tolist())

    if not totals:
        raise ValueError(
            'No Google Trends data returned for any actor of {!r} ({})'.format(
                title, start_year))

    mean = pd.Series(totals).mean()
    return [[title, mean]]

In [14]:
# data = []
# for i in tqdm(range(len(df))):
#     try:
#         title = df.iloc[i]['title']
#         listnames = literal_eval(df.iloc[i]['top3_actors'])
#         year = df.iloc[i]['startYear']
#         test_df = get_trends_specific(title, listnames, year)
#         df_actor_interests = pd.DataFrame(test_df, columns = ['title', 'ave_actorsearch'])
#         data.append(df_actor_interests)
#     except:
#         print('Error on title:', title)
#         continue

In [None]:
# Stack the per-movie result frames row-wise into one table and display it.
# NOTE(review): `data` is only built by the loop in the previous cell, which
# is currently commented out -- run as-is this cell raises NameError unless
# `data` is defined elsewhere.
df_actor_interests_final = pd.concat(data, axis=0)
df_actor_interests_final