In [2]:
# importing libraries
import numpy as np # numerical functions (log transform)
import pandas as pd # data science essentials
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # enhanced data visualization
import statsmodels.formula.api as smf # regression modeling
from sklearn.model_selection import train_test_split # train/test split
from sklearn.linear_model import LinearRegression

# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# specifying the path and file name
file = './Apprentice_Chef_Dataset.xlsx'

# reading the file into Python
chef = pd.read_excel(file)

# preparing explanatory variable data: everything except the response
# (REVENUE) and the name/email columns
chef_data = chef.drop(['REVENUE',
                       'NAME',
                       'EMAIL',
                       'FIRST_NAME',
                       'FAMILY_NAME'],
                      axis = 1)

# preparing response variable data
chef_target = chef.loc[ : , 'REVENUE']

# log10-transformed response; added to `chef` AFTER chef_data was built,
# so it is not part of the explanatory variables
chef['log_REVENUE'] = np.log10(chef['REVENUE'])
log_chef_target = chef.loc[ : , 'log_REVENUE'] # ready for use later


# preparing training and testing sets (all letters are lowercase)
x_train, x_test, y_train, y_test = train_test_split(
            chef_data,
            chef_target,
            test_size = 0.25,
            random_state = 219)

# declaring set of x-variables (single source of truth for the OLS formula
# below and for any later cell that reads `x_variables`)
x_variables = ['TOTAL_MEALS_ORDERED', 'UNIQUE_MEALS_PURCH', 'CONTACTS_W_CUSTOMER_SERVICE',
               'AVG_TIME_PER_SITE_VISIT', 'AVG_PREP_VID_TIME','LARGEST_ORDER_SIZE',
               'MASTER_CLASSES_ATTENDED', 'MEDIAN_MEAL_RATING','TOTAL_PHOTOS_VIEWED']

# merging x_train and y_train so that they can be used in statsmodels
chef_train = pd.concat([x_train, y_train], axis = 1)


# Step 1: build the model; the formula is generated from x_variables so the
# statsmodels model and the variable list cannot drift apart
ols_formula = 'REVENUE ~ ' + ' + '.join(x_variables)

lm_best = smf.ols(formula = ols_formula,
                  data    = chef_train)


# Step 2: fit the model based on the data
results = lm_best.fit()
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:                REVENUE   R-squared:                       0.628
Model:                            OLS   Adj. R-squared:                  0.625
Method:                 Least Squares   F-statistic:                     271.5
Date:                Mon, 15 Feb 2021   Prob (F-statistic):          1.91e-303
Time:                        21:49:03   Log-Likelihood:                -11596.
No. Observations:                1459   AIC:                         2.321e+04
Df Residuals:                    1449   BIC:                         2.326e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [2]:

# applying model in scikit-learn

# preparing a DataFrame based on the analysis above: only the x-variables
# used in the OLS formula
ols_data = chef.loc[ : , x_variables]


# FULL X-dataset (normal Y) -- same test_size / random_state as the OLS
# split below
x_train_FULL, x_test_FULL, y_train_FULL, y_test_FULL = train_test_split(
            chef_data,     # x-variables
            chef_target,   # y-variable
            test_size = 0.25,
            random_state = 219)


# OLS p-value x-dataset (normal Y)
x_train_OLS, x_test_OLS, y_train_OLS, y_test_OLS = train_test_split(
            ols_data,      # x-variables
            chef_target,   # y-variable
            test_size = 0.25,
            random_state = 219)

# INSTANTIATING a model object
lr = LinearRegression()


# FITTING to the training data
lr_fit = lr.fit(x_train_OLS, y_train_OLS)


# PREDICTING on new data
lr_pred = lr_fit.predict(x_test_OLS)


# saving scoring data for future use (R-square)
# built-in round() is used instead of the .round() method so this works
# whether score() returns a plain Python float or a NumPy scalar
lr_train_score = round(lr.score(x_train_OLS, y_train_OLS), 4)
lr_test_score  = round(lr.score(x_test_OLS, y_test_OLS), 4)


# saving the gap between training and testing
lr_test_gap = round(abs(lr_train_score - lr_test_score), 4)

# pairing each feature name with its coefficient; names are taken from the
# frame the model was actually fit on so the pairing cannot get out of order
lr_model_values = list(zip(x_train_OLS.columns,
                           lr_fit.coef_.round(decimals = 2)))


# model features: intercept first, followed by every feature-coefficient pair
lr_model_lst = [('intercept', lr_fit.intercept_.round(decimals = 2))]
lr_model_lst.extend(lr_model_values)
    


In [3]:
import sklearn.linear_model # linear models
# INSTANTIATING a model object
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this call needs an older scikit-learn. It is kept as-is because
# dropping it would change the fitted model -- confirm the target environment.
lasso_model = sklearn.linear_model.Lasso(alpha     = 1.0,  # default shrinkage
                                         normalize = True) # non-default: normalize regressors before fitting

# FITTING to the training data
lasso_fit = lasso_model.fit(x_train_FULL, y_train_FULL)

# PREDICTING on new data
lasso_pred = lasso_fit.predict(x_test_FULL)

# saving scoring data for future use (R-square)
# built-in round() works for both Python floats and NumPy scalars
lasso_train_score = round(lasso_model.score(x_train_FULL, y_train_FULL), 4)
lasso_test_score  = round(lasso_model.score(x_test_FULL, y_test_FULL), 4)

# saving the gap between training and testing
lasso_test_gap = round(abs(lasso_train_score - lasso_test_score), 4)

# pairing each feature name with its coefficient; names are taken from the
# frame the model was fit on
lasso_model_values = list(zip(x_train_FULL.columns,
                              lasso_fit.coef_.round(decimals = 4)))


# model features: intercept first, followed by every feature-coefficient pair
lasso_model_lst = [('intercept', lasso_fit.intercept_.round(decimals = 4))]
lasso_model_lst.extend(lasso_model_values)


# dropping coefficients that are equal to zero
# A new list is built instead of calling .remove() inside a loop over the
# same list: removing while iterating skips the element that follows each
# removal, which leaves some zero coefficients in place.
lasso_model_lst = [(feature, coefficient)
                   for feature, coefficient in lasso_model_lst
                   if coefficient != 0]



In [6]:
# INSTANTIATING a model object
# `normalize = False` was the default and the argument no longer exists in
# scikit-learn >= 1.2, so it is omitted; the fitted model is unchanged.
ard_model = sklearn.linear_model.ARDRegression()


# FITTING the training data
ard_fit = ard_model.fit(x_train_FULL, y_train_FULL)


# PREDICTING on new data
ard_pred = ard_fit.predict(x_test_FULL)


# saving scoring data for future use (R-square)
# built-in round() works for both Python floats and NumPy scalars
ard_train_score = round(ard_model.score(x_train_FULL, y_train_FULL), 4)
ard_test_score  = round(ard_model.score(x_test_FULL, y_test_FULL), 4)


# saving the gap between training and testing
ard_test_gap = round(abs(ard_train_score - ard_test_score), 4)

# pairing each feature name with its coefficient; names are taken from the
# frame the model was fit on
ard_model_values = list(zip(x_train_FULL.columns,
                            ard_fit.coef_.round(decimals = 5)))

# model features: intercept first, followed by every feature-coefficient pair
ard_model_lst = [('intercept', ard_fit.intercept_.round(decimals = 2))]
ard_model_lst.extend(ard_model_values)


# dropping coefficients that are equal to zero
# A new list is built instead of calling .remove() inside a loop over the
# same list: removing while iterating skips the element that follows each
# removal, which leaves some zero coefficients in place.
ard_model_lst = [(feature, coefficient)
                 for feature, coefficient in ard_model_lst
                 if coefficient != 0]
# comparing results

# one row per fitted model:
# (label, training R-square, testing R-square, train-test gap, size, coefficients)
comparison_rows = [
    ('(Final)OLS', lr_train_score,    lr_test_score,    lr_test_gap,
     len(lr_model_lst),    lr_model_lst),
    ('Lasso',      lasso_train_score, lasso_test_score, lasso_test_gap,
     len(lasso_model_lst), lasso_model_lst),
    ('ARD',        ard_train_score,   ard_test_score,   ard_test_gap,
     len(ard_model_lst),   ard_model_lst),
]

# column headers, in the same order as the fields of each row above
comparison_columns = ['Model Type', 'Training', 'Testing',
                      'Train-Test Gap', 'Model Size', 'Model']

# pivoting the rows into a column -> values mapping and converting it into
# a DataFrame
model_performance = pd.DataFrame(
    {column: [row[position] for row in comparison_rows]
     for position, column in enumerate(comparison_columns)})

print(model_performance)

   Model Type  Training  Testing  Train-Test Gap  Model Size                                              Model
0  (Final)OLS    0.6278   0.6599          0.0321          10  [(intercept, -372.06), (TOTAL_MEALS_ORDERED, 5...
1       Lasso    0.6171   0.6402          0.0231          15  [(intercept, 36.9401), (TOTAL_MEALS_ORDERED, 5...
2         ARD    0.6309   0.6582          0.0273          24  [(intercept, 79.76), (CROSS_SELL_SUCCESS, -62....
