## Give Me The Next AAA Title
# Predictive Modeling
### Featuring Engineering of Dates
****

<br>
by Dustin Reyes
<br>
<br>
Prepared for:
<br>
Mynt (Globe Fintech Innovations, Inc.)
<br>
<br>

In [1]:
import os
import ast
import pandas as pd
import numpy as np
from datetime import date, datetime
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
df = pd.read_csv('data2/combined_metadata.csv')
df = df.drop(['Unnamed: 0', 'imdbId', 'tmdbId', 'movieId'], axis=1)
df['id'] = df['id'].astype('str')
df.head()

Unnamed: 0,adult,budget,genres,id,original_language,production_countries,revenue,runtime,spoken_languages,views,likes,dislikes,n_trailers
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",8050136,6330,1965,3
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",83048,70,14,1
2,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",104948,0,0,1
3,False,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",949,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",32419,119,1,1
4,False,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",9091,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",6162,12,3,2


In [3]:
df_date = pd.read_csv('data2/movies_metadata.csv')[['id', 'release_date']]
df_date.head()

Unnamed: 0,id,release_date
0,862,1995-10-30
1,8844,1995-12-15
2,15602,1995-12-22
3,31357,1995-12-22
4,11862,1995-02-10


In [4]:
df_combined = pd.merge(df, df_date, on='id')
df_combined.head()

Unnamed: 0,adult,budget,genres,id,original_language,production_countries,revenue,runtime,spoken_languages,views,likes,dislikes,n_trailers,release_date
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",8050136,6330,1965,3,1995-10-30
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",83048,70,14,1,1995-12-15
2,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",104948,0,0,1,1995-12-22
3,False,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",949,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",32419,119,1,1,1995-12-15
4,False,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",9091,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",6162,12,3,2,1995-12-22


In [5]:
# Retrieve all of the release dates in the data and convert format to %Y-%m-%d' datetime
dates = [datetime.strptime(strdate, '%Y-%m-%d') for strdate in df_combined['release_date'] ]
dates_min = min(dates)

In [6]:
daysSinceMin = []
for date in dates:
    daysSinceMin.append((date - dates_min).days)

In [7]:
from datetime import date
dayInYear = []
for mdate in dates:
    dayInYear.append((mdate.date() - date(mdate.year, 1, 1)).days)

In [8]:
df_combined['daysStart'] = daysSinceMin
df_combined['dayInYear'] = dayInYear

In [9]:
df_combined.drop('release_date', axis=1, inplace = True)

In [10]:
# df_combined.to_csv('data2/parsed_dates.csv', index = False)

In [11]:
genre_set = set()
for genres in df_combined['genres']:
    genres = ast.literal_eval(genres)
    for genre in genres:
        genre_set.add(genre['name'])

genre_list = list(genre_set)
for genre in genre_list:
    df_combined[genre] = np.zeros(len(df_combined)).astype('int')

In [12]:
for i in range(len(df_combined)):
    genres = ast.literal_eval(df_combined.genres[i])
    for genre in genres:
        df_combined.loc[i, genre['name']] = 1

In [13]:
df_combined.drop('genres', axis=1, inplace = True)

In [14]:
df_combined.columns

Index(['adult', 'budget', 'id', 'original_language', 'production_countries',
       'revenue', 'runtime', 'spoken_languages', 'views', 'likes', 'dislikes',
       'n_trailers', 'daysStart', 'dayInYear', 'Foreign', 'TV Movie',
       'History', 'Thriller', 'Science Fiction', 'Music', 'Romance', 'Comedy',
       'Mystery', 'Fantasy', 'Horror', 'Documentary', 'Western', 'Adventure',
       'Action', 'Animation', 'Family', 'Crime', 'Drama', 'War'],
      dtype='object')

In [15]:
df_combined_final = df_combined[df_combined['TV Movie'] == 0]

In [16]:
# df_combined_final.to_csv('data2/dategenre_parse.csv', index = False)