In [1]:
# Imports for the notebook, followed by pandas display options.
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import re
import matplotlib.pyplot as plt
# Render floats with exactly two decimal places when frames are displayed.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# NOTE(review): the next import duplicates the matplotlib import above; it is harmless.
import matplotlib.pyplot as plt
from IPython.display import display
# Remove the cap on how many columns are shown when a wide frame is displayed.
pd.options.display.max_columns = None

In [2]:
# Choose the year range to process; its first and last entries select which
# merged CSV is read. Swap the two assignments below to switch ranges.
#years = [2008,2017]   #Comment in/out year ranges
years = [1988,1997]
movie_df = pd.read_csv(
    '../04_Data/{}_{}_merged.csv'.format(years[0], years[-1]),
    index_col=0,
)

In [3]:
# Preview the first rows of the freshly loaded frame.
movie_df.head()

Unnamed: 0,3d,genre,imax,intl_box,mpaa,open,prod_budget,rundays,runtime,series,theaters,title,usa_box,usa_open
0,0.0,Drama,0.0,354825435.0,R,1988-12-16,25000000.0,,133.0,0.0,1590.0,Rain Man,172825435.0,7005719.0
1,0.0,Fantasy Comedy,0.0,329803958.0,PG,1988-06-24,70000000.0,,103.0,0.0,1598.0,Who Framed Roger Rabbit,156452370.0,11226239.0
3,0.0,Fantasy Comedy,0.0,151668774.0,PG,1988-06-03,18000000.0,,104.0,0.0,1419.0,Big,114968774.0,8216190.0
4,0.0,Comedy,0.0,216614388.0,PG,1988-12-09,15000000.0,,105.0,0.0,1659.0,Twins,111938388.0,11174980.0
5,0.0,Adventure Comedy,0.0,239606210.0,PG,1988-05-25,14000000.0,,110.0,1.0,2837.0,Crocodile Dundee II,109306210.0,24462976.0


In [4]:
def find_genre(row, genre):
    """Return 1 if the regex ``genre`` matches anywhere in ``row``, else 0.

    Parameters
    ----------
    row : str
        The text to search (a single genre label).
    genre : str
        Regular-expression pattern handed to ``re.search``.

    Returns
    -------
    int
        1 when the pattern is found, 0 otherwise. Inputs that cannot be
        searched (for example a float NaN or None in place of a string) and
        invalid patterns also yield 0, so the result is always numeric and
        can be summed safely.
    """
    try:
        return 1 if re.search(genre, row) else 0
    except (TypeError, re.error):
        # TypeError: ``row`` is not a string (e.g. NaN for a missing genre).
        # re.error: ``genre`` is not a valid pattern.
        # Either way this is "no match"; return 0 explicitly rather than
        # falling through and returning None.
        return 0

In [5]:
# Show how often each raw genre label occurs, limited to the 25 most frequent.
movie_df["genre"].value_counts().head(25)

Comedy              226
Unknown             216
Drama               130
Romantic Comedy      97
Foreign              80
Action               76
Thriller             76
Family Comedy        65
Horror               65
Comedy / Drama       62
Romance              61
Animation            60
Family Adventure     56
Action Comedy        52
Crime Drama          51
Period Drama         48
Drama / Thriller     36
Documentary          35
Action Thriller      32
Crime Comedy         28
Crime Thriller       25
Music Drama          25
Sci-Fi Action        24
Horror Comedy        23
Fantasy Comedy       23
Name: genre, dtype: int64

In [6]:
#Create one 0/1 column per genre keyword seen among the top 25 labels.
#Each entry is (new column name, regex passed to find_genre). The trailing
#notes keep the original bookkeeping: #n marks a category that is kept,
#X marks one that is later combined into another category.
#"foreign" (pattern "Foreign") and "documentary" (pattern "Document") were
#withdrawn as too niche and are intentionally not created.
_genre_patterns = [
    ("action", "Action"),        #1
    ("adventure", "Adventure"),  #X (combine with Action)
    ("comedy", "Comedy"),        #2
    ("drama", "Drama"),          #3
    ("horror", "Horror"),        #4
    ("musical", "Musical"),      #X (combine with Comedy)
    ("scifi", "Sci"),            #5
    ("romance", "Roman"),        #6
    ("animation", "Animat"),     #7
    ("thriller", "Thrill"),      #X (combine with Action)
    ("fantasy", "Fantasy"),      #X (combine with SciFi)
]

#Columns are added in list order, so the resulting column layout is stable.
for _flag_column, _regex in _genre_patterns:
    movie_df[_flag_column] = movie_df["genre"].apply(find_genre, genre=_regex)

In [7]:
#Compress categories: fold closely related genre flags into one column each.
#  comedy        <- comedy + musical (based on Golden Globes Category Comedy/Musical)
#  action        <- action + adventure + thriller (these categories are really similar)
#  scifi/fantasy <- scifi + fantasy (new column)
_merge_plan = [
    ("comedy", ["comedy", "musical"]),
    ("action", ["action", "adventure", "thriller"]),
    ("scifi/fantasy", ["scifi", "fantasy"]),
]

for _target, _parts in _merge_plan:
    #Add the component flags together...
    _combined = sum(movie_df[_part] for _part in _parts)
    #...then cap at 1 so the merged column stays a 0/1 indicator.
    movie_df[_target] = _combined.apply(lambda flag_sum: min(1, flag_sum))

#The component columns have been absorbed above and are no longer needed.
movie_df.drop(
    labels=["musical", "adventure", "fantasy", "thriller", "scifi"],
    axis="columns",
    inplace=True,
)

In [8]:
#Count how many of the seven consolidated genre flags are set for each row;
#a total of zero identifies rows that matched none of them.
movie_df["genre_counts"] = sum(
    movie_df[_flag]
    for _flag in (
        "action", "comedy", "drama", "horror",
        "scifi/fantasy", "romance", "animation",
    )
)

#Catch-all 'other' genre: 1 exactly when no genre flag was set, else 0.
movie_df["other"] = movie_df["genre_counts"].apply(lambda total: int(total == 0))

In [9]:
#extract the year prefix from a date string such as "1988-12-16"
def time2monthyearstring(row):
    """Return the first four characters of ``row``.

    For a date string such as "1988-12-16" this is the four-digit year
    ("1988"). Despite the function's name, no month is included.

    Parameters
    ----------
    row : str
        Value to slice; any object supporting ``row[0:4]`` is accepted.

    Returns
    -------
    The slice ``row[0:4]``, or None when ``row`` cannot be sliced (for
    example a float NaN or None standing in for a missing date).
    """
    try:
        return row[0:4]
    except (TypeError, KeyError, IndexError):
        # Only swallow the errors that slicing an unsuitable value can raise;
        # a bare ``except`` would also hide KeyboardInterrupt and real bugs.
        return None

#keep only the leading four characters of each "open" value as the "year" column
movie_df["year"] = movie_df["open"].map(time2monthyearstring)

In [10]:
#Reset the index before dummifying the MPAA rating. The previous index is
#kept as a regular "index" column and the rows get a fresh default index.
movie_df = movie_df.reset_index()

#Create one dummy (indicator) column per MPAA rating value and attach the
#columns to the frame by matching row index on both sides.
movie_df = movie_df.merge(
    pd.get_dummies(movie_df["mpaa"]),
    left_index=True,
    right_index=True,
)

In [11]:
# Preview the first three rows to check the new genre, year and rating columns.
movie_df.head(3)

Unnamed: 0,index,3d,genre,imax,intl_box,mpaa,open,prod_budget,rundays,runtime,series,theaters,title,usa_box,usa_open,action,comedy,drama,horror,romance,animation,scifi/fantasy,genre_counts,other,year,G,NC-17,PG,PG-13,R,Unrated
0,0,0.0,Drama,0.0,354825435.0,R,1988-12-16,25000000.0,,133.0,0.0,1590.0,Rain Man,172825435.0,7005719.0,0,0,1,0,0,0,0,1,0,1988,0,0,0,0,1,0
1,1,0.0,Fantasy Comedy,0.0,329803958.0,PG,1988-06-24,70000000.0,,103.0,0.0,1598.0,Who Framed Roger Rabbit,156452370.0,11226239.0,0,1,0,0,0,0,1,2,0,1988,0,0,1,0,0,0
2,3,0.0,Fantasy Comedy,0.0,151668774.0,PG,1988-06-03,18000000.0,,104.0,0.0,1419.0,Big,114968774.0,8216190.0,0,1,0,0,0,0,1,2,0,1988,0,0,1,0,0,0


In [12]:
# Write the featurized frame next to the input file, named by the same year range.
movie_df.to_csv(
    '../04_Data/{}_{}_merged_featurized.csv'.format(years[0], years[-1])
)