# A. Data Analysis

In [26]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm

In [27]:
# Load the assignment data. The path is relative, so the CSV must be in the
# notebook's working directory.
dataset = pd.read_csv('DFMarketAnalysisAssignmentData.csv')

# inputs = spend, outputs = enrollments
# Columns are selected by name (the same names the statsmodels cells use)
# rather than by position, so a reordered CSV cannot silently swap variables.
# Double brackets keep each predictor 2-D, shape (n_samples, 1), which is the
# layout LinearRegression.fit expects for its feature matrix.
xSpendMailers = dataset[['SpendMailers']].to_numpy()
xSpendRadio = dataset[['SpendRadio']].to_numpy()
yEnrollments = dataset['Enrollments'].to_numpy()

## Regression - Mailers

In [28]:
# Simple linear regression of enrollments on mailer spend.
# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(xSpendMailers, yEnrollments)
# R^2 is computed on the same data the model was fitted to (in-sample fit).
r_sq = model.score(xSpendMailers, yEnrollments)
# Report goodness of fit, then the parameters of the fitted line.
for label, value in (
    ('coefficient of determination:', r_sq),
    ('intercept:', model.intercept_),
    ('slope:', model.coef_),
):
    print(label, value)

coefficient of determination: 0.1908698431535677
intercept: 4.953125
slope: [0.01007813]


## Regression - Radio

In [29]:
# Simple linear regression of enrollments on radio spend.
# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(xSpendRadio, yEnrollments)
# R^2 is computed on the same data the model was fitted to (in-sample fit).
r_sq = model.score(xSpendRadio, yEnrollments)
# Report goodness of fit, then the parameters of the fitted line.
for label, value in (
    ('coefficient of determination:', r_sq),
    ('intercept:', model.intercept_),
    ('slope:', model.coef_),
):
    print(label, value)

coefficient of determination: 0.004877215275969182
intercept: 5.288135593220339
slope: [0.00035593]


## Multiple Regression & p-value
##### Ran a multiple regression on both spend channels (mailers and radio) to see which had the lower p-value

In [30]:
# Multiple regression of enrollments on both spend channels at once.
# The predictors are passed as a DataFrame rather than an unnamed ndarray so
# that statsmodels labels each coefficient in the summary with its column name
# (SpendMailers / SpendRadio) instead of the generic x1 / x2. That makes it
# unambiguous which p-value belongs to which channel.
X = dataset[['SpendMailers', 'SpendRadio']]
y = dataset['Enrollments']
# OLS does not add an intercept on its own; add_constant prepends a "const" column.
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:            Enrollments   R-squared:                       0.205
Model:                            OLS   Adj. R-squared:                  0.181
Method:                 Least Squares   F-statistic:                     8.535
Date:                Thu, 05 Nov 2020   Prob (F-statistic):           0.000505
Time:                        21:57:54   Log-Likelihood:                -178.13
No. Observations:                  69   AIC:                             362.3
Df Residuals:                      66   BIC:                             369.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.7593      0.445     10.694      0.0

## Correlation

In [31]:
# Pairwise correlation between each channel's spend and enrollments.
# Each entry is (heading to print, spend column to correlate).
channel_columns = (
    ('Mailers correlation', "SpendMailers"),
    ('Radio correlation', "SpendRadio"),
)
for index, (heading, spend_column) in enumerate(channel_columns):
    if index:
        # Divider between consecutive channel reports (not before the first).
        print('----------')
    cc = dataset[[spend_column, "Enrollments"]].corr()
    print(heading)
    print(cc)

Mailers correlation:                SpendMailers  Enrollments
SpendMailers      1.000000     0.436887
Enrollments       0.436887     1.000000
             SpendRadio  Enrollments
SpendRadio     1.000000     0.069837
Enrollments    0.069837     1.000000


## Basic Analysis
##### Performed basic analysis verifying mailers were more efficient than radio

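| Days      | Spend per day | Channel | Avg enrollments per day | Min | Max | Cost per acquisition |
|-----------|---------------|---------|-------------------------|-----|-----|----------------------|
| 54 Days   | 0             | n/a     | 4.76                    | 2   | 15  | 0                    |
| Day 29-38 | 2000          | Radio   | 6                       | 2   | 18  | 1613                 |
| Day 63-67 | 600           | Mailer  | 11                      | 8   | 16  | 96                   |

Cost per acquisition is incremental: spend per day divided by the lift in average daily enrollments over the no-spend baseline of 4.76 (radio: 2000 / (6 - 4.76) ≈ 1613; mailers: 600 / (11 - 4.76) ≈ 96).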

## Verify mailers regression output
##### Re-ran the mailers regression with a second Python library (statsmodels) to verify that the scikit-learn regression output above was correct

In [33]:
# Cross-check of the mailers-only regression using statsmodels.
# Response first, then the single predictor.
y = dataset['Enrollments']
X = dataset['SpendMailers']
# Add the intercept column explicitly before fitting.
X2 = sm.add_constant(X)
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
# Full summary table: coefficients, standard errors, t-statistics, p-values.
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:            Enrollments   R-squared:                       0.191
Model:                            OLS   Adj. R-squared:                  0.179
Method:                 Least Squares   F-statistic:                     15.80
Date:                Fri, 06 Nov 2020   Prob (F-statistic):           0.000175
Time:                        07:20:34   Log-Likelihood:                -178.76
No. Observations:                  69   AIC:                             361.5
Df Residuals:                      67   BIC:                             366.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            4.9531      0.409     12.097   

# B. Programming Task

In your day-to-day, you may need to handle making videos from a large number of still images. Please write a script that takes as input a directory with an arbitrary number of still images, and creates an mp4 video. You may leverage FFMPEG.