In [70]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
# Use the 'ggplot' stylesheet for every matplotlib figure in this notebook.
plt.style.use('ggplot')
from matplotlib.pyplot import figure

# Render figures inline in the cell output (IPython magic, notebook-only).
%matplotlib inline
# Default figure size (width, height) for all plots below; set after the
# magic above, so the statement order is kept as-is.
matplotlib.rcParams['figure.figsize'] = (12,8)

# Switch off pandas' chained-assignment warning for the column assignments
# made on df in later cells.
pd.options.mode.chained_assignment = None

# Read in the data
# NOTE(review): the path is relative to the working directory and looks like a
# Kaggle input layout — confirm it exists when running elsewhere.
df = pd.read_csv('../input/movies/movies.csv')

In [71]:
# Looking at the data: preview the first rows of the freshly loaded frame
df.head()

In [72]:
# Checking if there is any missing data: print the percentage of null
# entries for every column.
for col in df.columns:
    # isnull().mean() is a 0-1 fraction; scale it so the printed '%' is truthful
    pct_missing = df[col].isnull().mean() * 100
    print('{} - {:.2f}%'.format(col, pct_missing))

In [73]:
# Data types for columns (shows which columns were parsed as text vs numbers)
df.dtypes

In [74]:
# Keep only the rows that have a value in every column
df = df.dropna(axis=0, how='any')

In [95]:
# Box plot of gross earnings to eyeball outliers
df.boxplot(column='gross')

In [75]:
# Store the budget and gross amounts as 64-bit integers
for money_col in ('budget', 'gross'):
    df[money_col] = df[money_col].astype('int64')

In [76]:
# Cap DataFrame displays at 20 rows, then show the frame
pd.options.display.max_rows = 20
df

In [77]:
# Pull the first four-digit number out of the 'released' text and store it
# as an integer column
release_year = df['released'].str.extract(pat='([0-9]{4})')
df['year_correct'] = release_year.astype(int)
df

In [78]:
# Order the films from highest to lowest gross
df = df.sort_values('gross', ascending=False)

In [79]:
# List each distinct company once, sorted in descending order.
# This is display-only: the result is not assigned, so df itself is unchanged
# and no rows are dropped from it.
df['company'].drop_duplicates().sort_values(ascending=False)

In [80]:
# Show the current state of df
df

In [101]:
# Budget against gross earnings, one translucent marker per film
fig, ax = plt.subplots()
ax.scatter(df['budget'], df['gross'], alpha=0.5)
ax.set_title('Budget vs Gross Earnings')
ax.set_xlabel('Budget for Film')
ax.set_ylabel('Gross Earnings')
plt.show()

In [82]:
# Preview the first rows of df
df.head()

In [112]:
# Budget vs gross with a fitted regression line (Seaborn)
point_style = {'alpha': 0.5}
fit_line_style = {'color': 'blue'}
sns.regplot(data=df, x='budget', y='gross', scatter_kws=point_style, line_kws=fit_line_style)

In [84]:
# Looking at correlations (Pearson) between the numeric columns.
# df still contains text columns at this point; recent pandas versions raise
# on them instead of silently skipping, so restrict to numeric columns first.
df.select_dtypes(include='number').corr(method='pearson')

In [96]:
# Kendall rank correlation between the numeric columns.
# Text columns are excluded explicitly because recent pandas versions raise
# on them instead of silently skipping.
df.select_dtypes(include='number').corr(method='kendall')

In [97]:
# Spearman rank correlation between the numeric columns.
# Text columns are excluded explicitly because recent pandas versions raise
# on them instead of silently skipping.
df.select_dtypes(include='number').corr(method='spearman')

In [85]:
# Heatmap of the Pearson correlations between the numeric columns of df.
# Text columns are excluded explicitly: recent pandas versions raise on them
# rather than dropping them silently.
correlation_matrix = df.select_dtypes(include='number').corr(method='pearson')

sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [86]:
# Looking at company: preview the first rows of df, with attention to the
# company column
df.head()

In [65]:
# Build an all-numeric version of the data so text columns can take part in
# the correlation analysis.
# Work on a copy: plain assignment would only alias df, and the loop below
# would then overwrite df's text columns, which later cells still use
# (e.g. plotting by 'rating' and grouping by 'company').
df_numerized = df.copy()

# Replace every object-dtype column with its integer category codes.
for col_name in df_numerized.select_dtypes(include='object').columns:
    df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes

df_numerized

In [87]:
# Show df again for comparison with the encoded frame displayed above
df

In [88]:
# Pearson correlation heatmap over every column of the encoded frame
correlation_matrix = df_numerized.corr(method='pearson')

heatmap_ax = sns.heatmap(correlation_matrix, annot=True)
heatmap_ax.set_title('Correlation Matrix for Numeric Features')
heatmap_ax.set_xlabel('Movie Features')
heatmap_ax.set_ylabel('Movie Features')
plt.show()

In [89]:
# Correlation table for the encoded frame (Pearson is the pandas default)
df_numerized.corr(method='pearson')

In [91]:
# Flatten the correlation matrix into a Series indexed by pairs of column names
correlation_mat = df_numerized.corr()
corr_pairs = correlation_mat.unstack(level=-1)

corr_pairs

In [93]:
# Order the pairs from the lowest to the highest correlation value
sorted_pairs = corr_pairs.sort_values(ascending=True)

sorted_pairs

In [94]:
# Checking which pairs of *different* features have a correlation higher than 0.5.
# Each feature correlates perfectly (1.0) with itself, so those trivial
# self-pairs are filtered out to leave only informative relationships.
first_feature = sorted_pairs.index.get_level_values(0)
second_feature = sorted_pairs.index.get_level_values(1)
high_corr = sorted_pairs[(sorted_pairs > 0.5) & (first_feature != second_feature)]

high_corr

In [None]:
# Findings: votes and budget have the highest correlation with gross earnings.
# Company showed no notable correlation with gross earnings.

In [116]:
# Distribution of gross earnings within each rating
sns.stripplot(data=df, x="rating", y="gross")

In [114]:
# Top 15 companies ranked by their total gross revenue

# Total gross per company
CompanyGrossSum = df.groupby('company')[["gross"]].sum()

# Largest totals first, keep the first 15, and return them as an int64 Series
CompanyGrossSumSorted = (
    CompanyGrossSum
    .sort_values(by='gross', ascending=False)
    .head(15)['gross']
    .astype('int64')
)

CompanyGrossSumSorted