In [None]:
%matplotlib inline
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as datetime
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Log the interpreter version so a run can be tied to a specific environment.
print("Python Version:", sys.version)

In [None]:
# Preview the first rows of the IMDB frame.
# NOTE(review): in this notebook `data_imdb` is first assigned in the next cell,
# so on a fresh kernel (Restart & Run All) this cell raises NameError.
data_imdb.head()

In [None]:
"""
This was done to analyze the different datasets.
The imdb_mojo dataset was prepared and chosen for further analysis
"""
# Load the four raw sources.
# NOTE(review): unpickling can execute arbitrary code, so the .pkl files must
# come from a trusted origin.
data_bxoff = pd.read_pickle('box_office.pkl')
data_topdoc = pd.read_pickle('top_doc.pkl')
data_imdb = pd.read_csv('IMDB_byVote.csv')
data_mojo = pd.read_pickle('mojo.pkl')

# Rename the columns of each source; labels missing from a frame are ignored.
data_bxoff = data_bxoff.rename(columns={
    'Rank': 'Rank_BX',
    'Year': 'Year',
    'Title': 'Title',
    'Worldwide Box Office': 'Worldwide_Box_Office_BX',
    'Domestic Box Office': 'Domestic_Box_Office_BX',
    'International Box Office': 'International_Box_Office_BX',
})

# NOTE(review): this mapping goes from an underscore name to one containing a
# space, the opposite of the other three -- confirm the direction is intended.
data_topdoc = data_topdoc.rename(columns={
    'Title': 'Title',
    'User_Rating_Top_Doc': 'User Rating_TD',
    'Years': 'Year_TD',
    'Category': 'Category',
})

# The IMDB CSV also carries an 'Unnamed: 0' column that is not needed.
data_imdb = data_imdb.rename(columns={
    'Title': 'Title',
    'User Rating_IMDB': 'User_Rating_IMDB',
    'Metacritic': 'Metacritic',
    'Number of Votes_IMDB': 'Number_of_votes_IMDB',
    'Running Time': 'Running_Time',
    'Release Date_IMDB': 'Release_Date_IMDB',
    'Director': 'Director',
}).drop('Unnamed: 0', axis=1)

data_mojo = data_mojo.rename(columns={
    'Mojo Rank': 'Rank_MOJO_by_Box_office',
    'Title': 'Title',
    'Lifetime Gross': 'Lifetime_Gross_MOJO',
    'Max Theaters': 'Max_Theaters',
    'Opening': 'Opening_Box',
    'Opening Th': 'Opening_Th',
    'Release Date': 'Release_Date',
    'Distributor': 'Distributor',
})
# Trim newline/space characters from both ends of every object-dtype column
# (the distributor column is the one the original author called out).
data_mojo = data_mojo.apply(
    lambda col: col.str.strip('\n \n') if col.dtype == "object" else col
)

# Size of each source before merging.
print('The shape of the IMDB data is: {}.'.format(data_imdb.shape))
print('The shape of Box Office Mojo data is: {}'.format(data_mojo.shape))
print('The shape of Top Documentary data is: {}.'.format(data_topdoc.shape))
print('The shape of the-numbers box office data is: {}'.format(data_bxoff.shape))

# Join each source to IMDB on the title. These are default (inner) merges, so
# only titles present in both frames survive and every result is smaller than
# its inputs. Per the author's note, IMDB + Box Office Mojo kept the most rows
# (715), which is why it is used for the rest of the analysis.
imdb_topdoc = data_topdoc.merge(data_imdb, on=['Title'])
imdb_bxoff = data_bxoff.merge(data_imdb, on=['Title'])
imdb_mojo = data_mojo.merge(data_imdb, on=['Title'])

print('The shape of IMDB - Box Office Mojo data is: {}'.format(imdb_mojo.shape))
print('The shape of IMDB - Top Documentary data is: {}.'.format(imdb_topdoc.shape))
print('The shape of IMDB - the-numbers box office data is: {}'.format(imdb_bxoff.shape))

In [None]:
# Preview the merged Box Office Mojo + IMDB frame.
imdb_mojo.head()

In [None]:
# Work from the CSV snapshot of the merged frame so the analysis always starts
# from the same data. If no snapshot is on disk yet (e.g. a fresh checkout),
# write the merge result from the previous cell first. Reading it back also
# yields the 'Unnamed: 0' index column that a later cell drops.
try:
    imdb_mojo = pd.read_csv('imdb_mojo.csv')
except FileNotFoundError:
    imdb_mojo.to_csv('imdb_mojo.csv')
    imdb_mojo = pd.read_csv('imdb_mojo.csv')

In [None]:
# Report the dimensions of the IMDB + Box Office Mojo frame now in memory.
print('The shape of IMDB merged with Box Office Mojo data is: {}'.format(imdb_mojo.shape))

In [None]:
def _clean_numeric(series, strip_pattern=None):
    """Return ``series`` converted to a numeric dtype.

    Parameters
    ----------
    series : pandas.Series
        Raw column, e.g. strings such as ``'$1,234'``.
    strip_pattern : str or None
        Regular expression matching the formatting characters to delete
        before conversion. ``None`` converts the values unchanged.

    Returns
    -------
    pandas.Series
        The result of ``pd.to_numeric`` on the stripped values.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when a value still cannot be parsed.
    """
    if strip_pattern is not None:
        # Series.replace is used instead of .str.replace so that a column
        # read_csv already parsed as numbers passes through without raising.
        series = series.replace({strip_pattern: ''}, regex=True)
    return pd.to_numeric(series)


# Formatting characters to remove per column ('$', ',' and the '-' placeholder)
# before the string values are converted to numbers. Raw strings keep the regex
# free of invalid escape sequences; inside [...] the '$' is a literal.
_NUMERIC_COLUMN_PATTERNS = {
    'Lifetime_Gross_MOJO': r'[$,-]',
    'Opening_Box': r'[$,-]',
    'Number_of_votes_IMDB': r',',
    'Rank_MOJO_by_Box_office': r',',
    'Max_Theaters': r'[,-]',
    'Opening_Th': r'[,-]',
    'User_Rating_IMDB': None,
    'Metacritic': None,
}

for _column, _pattern in _NUMERIC_COLUMN_PATTERNS.items():
    imdb_mojo[_column] = _clean_numeric(imdb_mojo[_column], _pattern)



In [None]:
# A release date is present from both IMDB and Box Office Mojo; keep the Mojo one.
# 'Unnamed: 0' is presumably the index column stored in imdb_mojo.csv.
imdb_mojo = imdb_mojo.drop(['Release_Date_IMDB', 'Unnamed: 0'], axis=1)

In [None]:
# Parse the release date, express it as the number of days elapsed since the
# earliest release in the frame, then discard the raw date column.
imdb_mojo = (
    imdb_mojo
    .assign(Release_Date=lambda df: pd.to_datetime(df['Release_Date']))
    .assign(
        Release_Difference=lambda df: (
            (df['Release_Date'] - df['Release_Date'].min()) / np.timedelta64(1, 'D')
        )
    )
    .drop(['Release_Date'], axis=1)
)

In [None]:
# Preview the frame after the release date was replaced by Release_Difference.
imdb_mojo.head()

In [None]:
# Keep only the titles that have a Metacritic score.
has_metacritic_df = imdb_mojo.loc[imdb_mojo['Metacritic'].notnull()]

In [None]:
# Summary statistics for the titles that have a Metacritic score.
has_metacritic_df.describe()

In [None]:
# Preview the Metacritic subset.
has_metacritic_df.head()

In [None]:
# Complement of has_metacritic_df: titles whose Metacritic score is missing.
no_metacritic_df = imdb_mojo.loc[imdb_mojo['Metacritic'].isnull()]

In [None]:
# Number of titles (rows) without a Metacritic score, and the column count.
no_metacritic_df.shape

In [None]:
# Author's notes: there are 715 films and 149 distributors, with 353 titles made
# by 15 companies. Plan: make 15 dummy variables and categorize distributors
# with fewer than 10 movies as 'other'.
distributor_counts = imdb_mojo.Distributor.value_counts()
# Show the 50 most frequent distributors.
distributor_counts.iloc[:50]

In [None]:
# Per the author's note, the raw Mojo rank correlates negatively with the
# box-office figures (presumably because rank 1 is the top-grossing title).
# Flipping the scale makes those correlations positive. The flip is kept
# disabled; uncomment the last line to apply it.

def new_rank(df, total_ranks=2055):
    """Return the Mojo box-office rank flipped so that larger means better.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'Rank_MOJO_by_Box_office' column.
    total_ranks : int, default 2055
        Rank that maps to 1 after the flip. 2055 is the constant from the
        original draft, presumably the length of the Mojo list -- confirm.

    Returns
    -------
    pandas.Series
        ``total_ranks - rank + 1`` for every row; ``df`` itself is not modified.
    """
    return total_ranks - df['Rank_MOJO_by_Box_office'] + 1


#imdb_mojo['Rank_MOJO_by_Box_office'] = new_rank(imdb_mojo)


In [None]:
#Will drop to 524 rows.
#imdb_mojo = imdb_mojo.dropna()

In [None]:
# Check the column dtypes after the numeric and date conversions.
imdb_mojo.dtypes

In [None]:
# Preview the cleaned frame.
imdb_mojo.head()

In [None]:
# Pairwise correlations between the numeric columns of the Metacritic subset.
# Text columns (e.g. Title, Distributor) are excluded explicitly: pandas >= 2.0
# no longer drops them silently, and DataFrame.corr() raises on non-numeric data.
has_metacritic_df.select_dtypes(include='number').corr()

In [None]:
# Optional export of the four source frames to CSV (disabled).
# Uncomment the lines below to write the files.
#data_bxoff.to_csv('data_bxoff.csv')
#data_topdoc.to_csv('data_topdoc.csv')
#data_imdb.to_csv('data_imdb.csv')
#data_mojo.to_csv('data_mojo.csv')

In [None]:
# Discard every row of the Metacritic subset that still has a missing value
# in any column.
has_metacritic_df = has_metacritic_df.dropna(axis=0, how='any')

In [None]:
# Summary statistics again, now that rows with missing values were removed.
has_metacritic_df.describe()

In [None]:
# Pairwise scatter plots of the Metacritic subset.
sns.pairplot(has_metacritic_df, kind = 'scatter')

In [None]:
# Same pairwise grid drawn with kernel-density estimates instead of points.
sns.pairplot(has_metacritic_df, kind = 'kde')

In [None]:
# Simple regression: Metacritic score explained by the IMDB user rating alone.
X = has_metacritic_df[['User_Rating_IMDB']].values
y = has_metacritic_df['Metacritic']
lr_full = LinearRegression().fit(X, y)
# R^2 on the same rows the model was fitted on (in-sample score).
lr_full.score(X, y)

In [None]:
# Metacritic score from three predictors: IMDB user rating, Mojo box-office rank
# and the maximum number of theaters.
X = has_metacritic_df[['User_Rating_IMDB', 'Rank_MOJO_by_Box_office', 'Max_Theaters']]
y = has_metacritic_df.loc[:, 'Metacritic']
lr_full = LinearRegression().fit(X, y)
# R^2 on the same rows the model was fitted on (in-sample score).
lr_full.score(X, y)

In [None]:
# Lifetime gross explained by the maximum number of theaters alone.
X = has_metacritic_df[['Max_Theaters']].values
y = has_metacritic_df.loc[:, 'Lifetime_Gross_MOJO']
lr_full = LinearRegression().fit(X, y)
# R^2 on the same rows the model was fitted on (in-sample score).
lr_full.score(X, y)

In [None]:
# Scale the Metacritic scores as a single feature column.
# preprocessing.normalize works per row by default (axis=1); on an (n, 1) array
# that turns every non-zero value into 1.0. axis=0 normalises down the column,
# giving the column unit L2 norm while keeping the relative sizes of the scores.
x = has_metacritic_df.Metacritic.values.reshape(-1,1)
normalized_meta = preprocessing.normalize(x, axis=0)
normalized_meta

In [None]:
# Scale the IMDB user ratings as a single feature column.
# preprocessing.normalize works per row by default (axis=1); on an (n, 1) array
# that turns every non-zero value into 1.0. axis=0 normalises down the column,
# giving the column unit L2 norm while keeping the relative sizes of the ratings.
x2 = has_metacritic_df.User_Rating_IMDB.values.reshape(-1,1)
normalized_user_rate = preprocessing.normalize(x2, axis=0)
normalized_user_rate

In [None]:
lr_full = LinearRegression()
X = 
                                
                                    