## Setup

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.pyplot import figure

# Output will show all rows (removes pandas' row-count display limit)
pd.set_option('display.max_rows', None)

# Render figures inline in the notebook, then set the default figure size
# (width, height in inches). The size is assigned after the magic on purpose;
# keep this order.
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

# Load data: the path is relative to the notebook's working directory
df = pd.read_csv('../input/movies/movies.csv')

In [None]:
# Preview the first rows of the freshly loaded data
df.head()

## Hypothesis

Which movie factors are highly correlated with gross revenue?

Prediction: **Movie budget and main actor (`star`)** will have a strong positive correlation with gross revenue.

## Data Cleaning

In [None]:
# Percent of null values in each column, printed next to the raw null count.
# The mean of the boolean null mask is a fraction in [0, 1]; it is scaled by 100
# so that the number actually matches the `%` sign in the printed line.

for col in df.columns:
    is_missing = df[col].isnull()
    n_missing = is_missing.sum()
    pct_missing = round(is_missing.mean() * 100, 2)
    print(f'{col} : {pct_missing}% ({n_missing})')

In [None]:
# Column dtypes before any type conversion is applied
df.dtypes

In [None]:
# Change data types: store the count/money columns as nullable integers

for int_col in ('votes', 'budget', 'gross'):
    df[int_col] = df[int_col].astype('Int64')

df.dtypes

In [None]:
# Problem: `year` column is inaccurate
# Solution: extract the correct year from `released` column
#
# `released` is parsed with a regular expression rather than a positional split
# on spaces. A positional split leaves the comma attached to the day ('13,') and
# shifts the fields whenever the day or the month is absent. In the pattern
# below the month and the day are optional, so the 4-digit year is picked up in
# every variant.
# NOTE(review): assumes values look like 'June 13, 1980 (United States)',
# 'November 1980 (...)' or '1980 (...)' - confirm against the data.
released_df = df['released'].str.extract(
    r'^(?:(?P<month>[A-Za-z]+)\s+)?(?:(?P<day>\d{1,2}),\s+)?(?P<year>\d{4})'
)

# Add new columns to df. Year and day become nullable integers so that they are
# handled as numbers (not text) by the numeric correlation steps; rows that do
# not match the pattern get a missing value.
df['released_year'] = pd.to_numeric(released_df['year'], errors='coerce').astype('Int64')
df['released_month'] = released_df['month']
df['released_day'] = pd.to_numeric(released_df['day'], errors='coerce').astype('Int64')

# 'released_year' will replace 'year'.
# errors='ignore' keeps this cell re-runnable once 'year' is already gone.
df.drop(columns=['year'], inplace=True, errors='ignore')

df.head()

In [None]:
# (rows, columns) before duplicate rows are removed
df.shape

In [None]:
# Remove redundant rows (exact duplicates across all columns)

df = df.drop_duplicates()
# Rows with missing values are intentionally kept; dropping them is disabled:
# df = df.dropna()
# (rows, columns) after de-duplication
df.shape

## Analysis

In [None]:
# The 25 highest-grossing films, largest gross revenue first
df.sort_values('gross', ascending=False).iloc[:25]

In [None]:
# Scatterplot: budget vs gross revenue

# Convert both money columns to plain floats before plotting
for money_col in ('gross', 'budget'):
    df[money_col] = df[money_col].astype('float')

plt.scatter(df['budget'], df['gross'])
plt.xlabel('Budget')
plt.ylabel('Gross Revenue')
plt.title('Budget vs Gross Revenue')
plt.show()

In [None]:
# Regression plot: how much is budget correlated to gross revenue?

# Red points for the individual films, blue line for the fitted trend
point_style = {'color': 'red'}
trend_style = {'color': 'blue'}

ax = sns.regplot(x='budget', y='gross', data=df,
                 scatter_kws=point_style, line_kws=trend_style)
ax.set_title('Budget vs Gross Revenue')
ax.set_xlabel('Budget')
ax.set_ylabel('Gross Revenue')
plt.show()

In [None]:
# Confirm high correlation b/w budget and gross (0.74)
# Only the numeric columns are passed to `corr`: the frame still contains text
# columns at this point, and pandas >= 2.0 raises on them instead of silently
# skipping them.
df.select_dtypes(include='number').corr(method='pearson')

In [None]:
# Heatmap of the Pearson correlations between the numeric columns.
# Text columns are filtered out first: pandas >= 2.0 raises when `corr` is
# called on a frame that still contains non-numeric columns.

correlation_matrix = df.select_dtypes(include='number').corr(method='pearson')
sns.heatmap(correlation_matrix, annot = True)
plt.title('Correlation Matrix for Numeric Movie Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [None]:
# Change all column values to numbers in order to run a (bigger) correlation matrix.
# Work on a copy: a plain `df_numerized = df` only creates a second name for the
# same frame, so the loop below would overwrite the text columns of `df` too.
df_numerized = df.copy()

for col_name in df_numerized.columns:
    if df_numerized[col_name].dtype == 'object':
        # Each distinct text value becomes an integer category code.
        # NOTE(review): missing values are encoded as -1 by `cat.codes`, which
        # then enters the correlations as a regular value - confirm this is intended.
        df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes

df_numerized.head(25)

In [None]:
# Pairwise Pearson correlations across every (numerized) column
df_numerized.corr(method='pearson')

In [None]:
# Heatmap with all variables
correlation_matrix_num = df_numerized.corr(method='pearson')
ax = sns.heatmap(correlation_matrix_num, annot=True)

# Enlarge the figure so the annotations of the bigger matrix stay readable
ax.figure.set_size_inches(20, 13)
ax.set_title('Correlation Matrix for Numeric Movie Features')
ax.set_xlabel('Movie Features')
ax.set_ylabel('Movie Features')
plt.show()

In [None]:
# Flatten the correlation matrix into one value per (feature, feature) pair
correlation_pairs = correlation_matrix_num.unstack()

# Order the pairs from the most negative to the most positive correlation
sorted_pairs = correlation_pairs.sort_values(ascending=True)
sorted_pairs

In [None]:
# Show only strong positive correlations (> 0.5).
# Pairs of a feature with itself are dropped first: they always correlate
# perfectly and would otherwise bury the informative pairs.
# Each remaining pair still appears twice, once as (a, b) and once as (b, a).
is_distinct_pair = [left != right for left, right in sorted_pairs.index]
distinct_pairs = sorted_pairs[is_distinct_pair]
distinct_pairs[distinct_pairs > 0.5]

## Conclusion 

**Number of ratings (`votes`) and budget** have the highest correlation with gross earnings.