In [11]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
from scipy import stats
import seaborn as sns
from __future__ import division

In [2]:
data_df = pd.read_csv('data.csv')
data_df

Unnamed: 0,Y,C,P,F
0,85.1,8.5,5.1,4.7
1,106.3,12.9,5.8,8.8
2,50.2,5.2,2.1,15.1
3,130.6,10.7,8.4,12.2
4,54.8,3.1,2.9,10.6
5,30.3,3.5,1.2,3.5
6,79.4,9.2,3.7,9.7
7,91.0,9.0,7.6,5.9
8,135.4,15.1,7.7,20.8
9,89.3,10.2,4.5,7.9


# a) Determine the Total Sales as a function of Production Cost, Promotion Cost, and First Year Box Office Sales  


In [3]:
import statsmodels.formula.api as smf
lm = smf.ols(formula='Y ~ C + P + F', data=data_df).fit()


In [4]:
lm.params

Intercept    7.676028
C            3.661604
P            7.621050
F            0.828468
dtype: float64

# Total Sales = 7.67 + 3.66(Production Cost) + 7.62(Promotion Cost) + 0.82(First Year Box Office) 

# b) Determine R2 


In [6]:
lm.rsquared

0.96678881966964347

# c) a)	Test the hypothesis at α = 5% level that Total Sales is not related any of Production Cost, Promotion Cost, and First Box Office Sales.


# Hypothesis:
# H0 -> b1 = b2 = b3 = 0 (Total Sales is not related any of Production Cost, Promotion Cost, and First Box Office Sales)
# Ha -> at least one of the regression coefficients should be non-zero.

# alpha = 5%

In [24]:
N = lm.nobs
P = lm.df_model
F = lm.mse_model / lm.mse_resid
dfn, dfd = P, N - P - 1
print "f-statistic:",F
p = 1.0 - scipy.stats.f.cdf(F,dfn,dfd)
print "p-value: %0.4f" %p

f-statistic: 58.2206841222
p-value: 0.0001


# Here in our model f -statistic = 58.22 and p - value is 0.0001
# p - value < 0.05
# We can reject the null


#   




# d) Find the correlation of  Total Sales and Production Cost given Promotion Cost (partial correlation)


In [46]:
np.corrcoef([data_df.Y,data_df.C,data_df.P,data_df.F])

array([[ 1.        ,  0.91744481,  0.92996775,  0.47469115],
       [ 0.91744481,  1.        ,  0.7899575 ,  0.4291329 ],
       [ 0.92996775,  0.7899575 ,  1.        ,  0.29876126],
       [ 0.47469115,  0.4291329 ,  0.29876126,  1.        ]])

In [51]:
r_YC = 0.9174
r_YP = 0.93
r_CP = 0.79
r_YF = 0.4746
r_FP = 0.2987
r_CF = 0.4291

In [41]:
r_YC_P = (r_YC - r_YP*r_CP) / np.sqrt((1 - (r_YP)**2)*(1 - (r_CP)**2))

In [42]:
r_YC_P

0.81072713669446683

# e)	Find the correlation of  Total Sales and Production Cost given Promotion Cost and First Year Box Office Sales (partial correlation)


r_YC_PF = (r_YC_P - r_YF_P*r_CF_P) / np.sqrt((1 - (r_YF_P)**2)*(1 - (r_CF_P)**2))

In [48]:
r_YF_P = (r_YF - r_YP*r_FP) / np.sqrt((1 - (r_FP)**2)*(1 - (r_FP)**2))

In [52]:
r_CF_P = (r_CF - r_CP*r_FP) / np.sqrt((1 - (r_FP)**2)*(1 - (r_FP)**2))

In [55]:
r_YC_PF = (r_YC_P - r_YF_P*r_CF_P) / np.sqrt((1 - (r_YF_P)**2)*(1 - (r_CF_P)**2))

In [70]:
print "ANS:",r_YC_PF

ANS: 0.801645229451


# f) Find the correlation of  Total Sales and First Year sales given Production Cost and Promotion Cost (partial correlation)


r_YF_PC = (r_YF_P - r_YC_F*r_FC_P) / np.sqrt((1 - (r_YC_P)**2)*(1 - (r_FC_P)**2))

In [57]:
r_YC_F = (r_YC - r_YF*r_CF) / np.sqrt((1 - (r_YF)**2)*(1 - (r_CF)**2))

In [62]:
r_YF_PC = (r_YF_P - r_YC_F*r_CF_P) / np.sqrt((1 - (r_YC_P)**2)*(1 - (r_CF_P)**2))

In [71]:
print "ANS:",r_YF_PC

ANS: 0.0449665096783


# g)What is the forecast for the total sales and a 95% confidence interval for it of a movie that costs 12,75 mm US to produce and 6 mm is spent on its promotion its first year box office sales is 8 mm?. 


 # y* ±Var(y*)1/2 t(1-α/2)
 
 # we have df = 10 - 3 - 1 = 6
 
 # so t(1 - 0.05/2) = 2.447

In [105]:
y_movie = 7.67 + 3.66*12.75 + 7.62*6 + 0.82*8

In [95]:
y_predicted = lm.predict(data_df[['C','P','F']])
y_std = np.std(y_predicted)
t = 2.447

In [106]:
SSE = 0
for y,yp in zip(data_df.Y,y_predicted):
    SSE = SSE + (y-yp)**2
SSE = np.sqrt(SSE / 10)

# forecast of total sales:

In [108]:
print y_predicted

[  81.56082106  106.40333497   55.23044547  120.97932724   49.90980948
   32.53654208   77.59681496  103.4384102   138.88047787   85.86401668]


In [102]:
print "ANS: 95% confidence interval:", (y_movie - SSE*2.447, y_movie + SSE*2.447)

95% confidence interval: (92.321492833615409, 120.90850716638461)
