## Give Me The Next AAA Title
# Predictive Modeling
### Movie Profitability Prediction Based on Popularity
****

<br>
by Dustin Reyes
<br>
<br>
Prepared for:
<br>
Mynt (Globe Fintech Innovations, Inc.)
<br>
<br>

In [1]:
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, make_scorer, accuracy_score
from collections import Counter
from xgboost import XGBClassifier, plot_importance
from pytrends.request import TrendReq

import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier

import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')
tqdm.pandas()
%matplotlib inline

In [2]:
# Read the merged IMDb movie table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data2/data_imdb_complete.csv')
df.head(n=5)

Unnamed: 0,title,release,director,budget,opening,gross,worldwide_gross,metacritic_score,mpaa_rating,budget_mil,...,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,leadActor,top3_actors
0,Season of the Witch,2011-01-07,Dominic Sena,40000000,10612375,24827228,91627228.0,28.0,PG-13,40.0,...,Season of the Witch,Season of the Witch,0,2011,95,Action,5.4,90902,nm0651414,"['Robert De Niro', 'Harvey Keitel', 'David Pro..."
1,The Green Hornet,2011-01-14,Michel Gondry,120000000,33526876,98780042,227817248.0,39.0,PG-13,120.0,...,The Green Hornet,The Green Hornet,0,2011,119,Action,5.8,155886,nm0006133,"['Seth Rogen', 'Jay Chou', 'Cameron Diaz']"
2,The Mechanic,2011-01-28,Simon West,40000000,11422006,29121498,76130093.0,49.0,R,40.0,...,The Mechanic,The Mechanic,0,2011,93,Action,6.6,152076,nm0153587,"['Jason Statham', 'Ben Foster', 'Tony Goldwyn']"
3,The Rite,2011-01-28,Mikael Håfström,37000000,14789393,33047633,96560591.0,38.0,PG-13,37.0,...,The Rite,The Rite,0,2011,114,Drama,6.0,91850,nm1246087,"['Anthony Hopkins', ""Colin O'Donoghue"", 'Alice..."
4,Sanctum,2011-02-04,Alister Grierson,30000000,9447930,23209310,108609310.0,42.0,R,30.0,...,Sanctum,Sanctum,0,2011,108,Action,5.9,53868,nm1718906,"['Richard Roxburgh', 'Ioan Gruffudd', 'Rhys Wa..."


In [3]:
# Remove identifier / duplicate columns in place; they are not shown again
# in the previews that follow.
df.drop(
    columns=[
        'originalTitle',
        'primaryTitle',
        'isAdult',
        'leadActor',
        'tconst',
        'director',
        'budget_mil',
        'opening_mil',
        'titleType',
    ],
    inplace=True,
)

In [4]:
# Preview the first five rows after the column drop.
df.head(n=5)

Unnamed: 0,title,release,budget,opening,gross,worldwide_gross,metacritic_score,mpaa_rating,startYear,runtimeMinutes,genres,averageRating,numVotes,top3_actors
0,Season of the Witch,2011-01-07,40000000,10612375,24827228,91627228.0,28.0,PG-13,2011,95,Action,5.4,90902,"['Robert De Niro', 'Harvey Keitel', 'David Pro..."
1,The Green Hornet,2011-01-14,120000000,33526876,98780042,227817248.0,39.0,PG-13,2011,119,Action,5.8,155886,"['Seth Rogen', 'Jay Chou', 'Cameron Diaz']"
2,The Mechanic,2011-01-28,40000000,11422006,29121498,76130093.0,49.0,R,2011,93,Action,6.6,152076,"['Jason Statham', 'Ben Foster', 'Tony Goldwyn']"
3,The Rite,2011-01-28,37000000,14789393,33047633,96560591.0,38.0,PG-13,2011,114,Drama,6.0,91850,"['Anthony Hopkins', ""Colin O'Donoghue"", 'Alice..."
4,Sanctum,2011-02-04,30000000,9447930,23209310,108609310.0,42.0,R,2011,108,Action,5.9,53868,"['Richard Roxburgh', 'Ioan Gruffudd', 'Rhys Wa..."


In [5]:
# List the columns that remain in `df` at this point.
df.columns

Index(['title', 'release', 'budget', 'opening', 'gross', 'worldwide_gross',
       'metacritic_score', 'mpaa_rating', 'startYear', 'runtimeMinutes',
       'genres', 'averageRating', 'numVotes', 'top3_actors'],
      dtype='object')

In [6]:
# Calendar year immediately before each film's `startYear`.
df['previousYear'] = df['startYear'].sub(1)

### Getting Movie Search Interest (1 Year Prior to Release)

In [7]:
# Shared Google Trends client used by the helper below.
# NOTE(review): pytrends documents `tz` as a timezone offset in minutes
# (360 is given there as US CST) - confirm this is the intended zone.
pytrend = TrendReq(tz=360, hl='en-US')

In [8]:
# Small sample of titles for trying the trends helper by hand.
testlst = [
    'Season of the Witch',
    'The Green Hornet',
    'The Mechanic',
]

In [9]:
def get_trends_specific(list_movies, year, client=None):
    """Total US Google Trends search interest per title over one calendar year.

    Each title is queried on its own (one keyword per payload) for the window
    running from January 1 of ``year`` to January 1 of ``year + 1``, and the
    interest values returned for that window are summed.

    Parameters
    ----------
    list_movies : list of str
        Titles to look up. Titles for which Google Trends returns an empty
        frame are skipped.
    year : int (or anything ``int()`` accepts)
        First year of the query window; also stored in the ``year`` column of
        the result.
    client : object, optional
        Object exposing ``build_payload(kw_list, cat, timeframe, geo)`` and
        ``interest_over_time()``. Defaults to the notebook-level ``pytrend``
        client, so existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per title that returned data, with columns
        ``title``, ``search_interest`` and ``year``.

    Raises
    ------
    ValueError
        If none of the titles returned any data.
    """
    # Resolve the client lazily so the global is only needed when the caller
    # does not inject one.
    client = pytrend if client is None else client

    # pytrends expects the timeframe as "YYYY-MM-DD YYYY-MM-DD".
    first_year = int(year)
    timeframe = '{}-01-01 {}-01-01'.format(first_year, first_year + 1)

    frames = []
    for movie in list_movies:
        client.build_payload(
            kw_list=[movie],
            cat=0,
            timeframe=timeframe,
            geo='US')
        data = client.interest_over_time()
        if data.empty:
            continue
        # Keep only the keyword column; 'isPartial' is a flag, not interest.
        frames.append(data.drop(columns=['isPartial'], errors='ignore'))

    if not frames:
        # Same exception type pd.concat([]) used to raise, with a clearer message.
        raise ValueError(
            'No Google Trends data returned for {!r} in {}'.format(
                list(list_movies), timeframe))

    # Columns are titles; summing down the date axis gives one total per title.
    totals = pd.concat(frames, axis=1).sum()
    summary = totals.rename_axis('title').reset_index(name='search_interest')
    summary['year'] = year
    return summary

In [10]:
# Distinct titles, in order of first appearance in `df`.
movie_list = df['title'].drop_duplicates().tolist()
# For a quick manual run, substitute a short list instead, e.g.:
# movie_list = ['The Mechanic', 'Big Mommas: Like Father, Like Son']

In [11]:
# Query Google Trends once per title and stack the per-title summaries.
# NOTE(review): the section heading says "1 year prior to release" and a
# `previousYear` column is computed earlier, but the window requested here
# starts at `startYear` itself - confirm which window is intended.
res = []
for i in tqdm(movie_list):
    try:
        # First matching row wins when a title appears more than once.
        year_item = df.loc[df['title'] == i, 'startYear'].values[0]
        title = [i]
        df_test = get_trends_specific(title, year_item)
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt can still stop this long crawl, and report the
        # actual cause so network or rate-limit failures are not mistaken
        # for a title that genuinely has no data.
        print('No Data for:', i, '({}: {})'.format(type(err).__name__, err))
        continue
    res.append(df_test)
df_movieseaches = pd.concat(res, axis=0)

  1%|▋                                                                                | 9/1109 [00:08<16:17,  1.13it/s]

No Data for: Big Mommas: Like Father, Like Son


  3%|██▏                                                                             | 30/1109 [00:27<16:16,  1.10it/s]

No Data for: Hoodwinked Too! Hood vs. Evil


  8%|██████▍                                                                         | 89/1109 [01:20<13:07,  1.29it/s]

No Data for: The Twilight Saga: Breaking Dawn - Part 1


 18%|█████████████▉                                                                 | 195/1109 [02:54<12:47,  1.19it/s]

No Data for: Atlas Shrugged II: The Strike


 19%|██████████████▋                                                                | 207/1109 [03:05<17:05,  1.14s/it]

No Data for: The Twilight Saga: Breaking Dawn - Part 2


 34%|██████████████████████████▌                                                    | 373/1109 [05:26<09:04,  1.35it/s]

No Data for: Legends of Oz: Dorothy's Return


 53%|█████████████████████████████████████████▉                                     | 589/1109 [08:24<07:01,  1.23it/s]

No Data for: Star Wars: Episode VII - The Force Awakens


 55%|███████████████████████████████████████████▍                                   | 609/1109 [08:42<07:06,  1.17it/s]

No Data for: Hail, Caesar!


 64%|██████████████████████████████████████████████████▌                            | 709/1109 [10:07<05:53,  1.13it/s]

No Data for: Boo! A Madea Halloween


 74%|██████████████████████████████████████████████████████████▎                    | 818/1109 [11:55<04:38,  1.04it/s]

No Data for: Mother!


 77%|████████████████████████████████████████████████████████████▍                  | 849/1109 [12:26<04:47,  1.11s/it]

No Data for: Star Wars: Episode VIII - The Last Jedi


 78%|█████████████████████████████████████████████████████████████▊                 | 868/1109 [12:45<03:59,  1.00it/s]

No Data for: Bilal: A New Breed of Hero


 83%|█████████████████████████████████████████████████████████████████▍             | 918/1109 [13:34<03:25,  1.08s/it]

No Data for: Mamma Mia! Here We Go Again


 91%|██████████████████████████████████████████████████████████████████████▋       | 1005/1109 [14:58<01:36,  1.07it/s]

No Data for: Shazam!


100%|██████████████████████████████████████████████████████████████████████████████| 1109/1109 [16:32<00:00,  1.12it/s]


In [15]:
# Replace the repeated per-title indices left by the concat with 0..n-1.
df_movieseaches = df_movieseaches.reset_index(drop=True)

In [16]:
# Inspect the combined per-title search-interest table.
df_movieseaches

Unnamed: 0,title,search_interest,year
0,Season of the Witch,943,2011
1,The Green Hornet,694,2011
2,The Mechanic,1260,2011
3,The Rite,1036,2011
4,Sanctum,1038,2011
...,...,...,...
1090,Freaky,2132,2020
1091,Monster Hunter,1868,2020
1092,Chaos Walking,420,2021
1093,Nobody,716,2021


In [17]:
# Uncomment to save the scraped search-interest table to CSV:
# df_movieseaches.to_csv('data2/movie_popularity.csv', index = False)

In [24]:
# Hand-entered rows [title, search_interest, year] for titles the automated
# crawl reported as "No Data for: ...".
# The Atlas Shrugged entry uses the dataset's own title
# ('Atlas Shrugged II: The Strike', as printed by the crawl) so that a
# title-based join can match it.
# NOTE(review): that entry's year is 2011 - confirm it matches the film's
# `startYear` in `df`.
# NOTE(review): 'Big Mommas: Like Father, Like Son' and
# "Legends of Oz: Dorothy's Return" were also reported as missing by the
# crawl but have no manual entry here.
list1 = [['Hoodwinked Too! Hood vs. Evil', 797, 2011],
         ['The Twilight Saga: Breaking Dawn - Part 1', 893, 2011],
         ['Atlas Shrugged II: The Strike', 572, 2011],
         ['The Twilight Saga: Breaking Dawn - Part 2', 844, 2012],
         ['Star Wars: Episode VII - The Force Awakens', 905, 2015],
         ['Hail, Caesar!', 561, 2016],
         ['Boo! A Madea Halloween', 477, 2016],
         ['Mother!', 503, 2017],
         ['Star Wars: Episode VIII - The Last Jedi', 1392, 2017],
         ['Bilal: A New Breed of Hero', 854, 2015],
         ['Mamma Mia! Here We Go Again', 702, 2018],
         ['Shazam!', 734, 2019]]

In [27]:
# Wrap the hand-entered rows in a frame that shares the crawl table's columns.
a_series = pd.DataFrame(data=list1, columns=df_movieseaches.columns)

In [29]:
# Add the hand-entered rows to the crawled table and renumber the index.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent. Re-running this cell appends the manual rows again.
df_movieseaches = pd.concat([df_movieseaches, a_series], ignore_index=True)

In [31]:
# Uncomment to save the table, including the hand-entered rows, to CSV:
# df_movieseaches.to_csv('data2/movie_popularity2.csv', index = False)