# In [1]:  (notebook cell marker left over from the export)
# timeit

# Student Name : Georgina Canela Ferre
# Cohort       : 3 - Castro

# Note: You are only allowed to submit ONE final model for this assignment.


################################################################################
# Import Packages
################################################################################

# use this space for all of your package imports

import pandas                  as pd                     # data science essentials
import seaborn                 as sns                    # enhanced graphical output
import matplotlib.pyplot       as plt                    # essential graphical output
import numpy                   as np                     # to make mathematical calculations
from matplotlib.ticker         import FormatStrFormatter # to customize x-axis

### ML packages
# Estimators
import statsmodels.formula.api as smf                    # linear regression (statsmodels)
from  sklearn.model_selection import train_test_split    # train/test split
from sklearn.linear_model     import LinearRegression    # linear regression (scikit-learn)
import sklearn.linear_model                              # linear model
from sklearn.ensemble         import GradientBoostingRegressor

#Preprocessing
from sklearn.preprocessing import RobustScaler

################################################################################
# Load Data
################################################################################

# use this space to load the original dataset
# MAKE SURE TO SAVE THE ORIGINAL FILE AS original_df
# Example: original_df = pd.read_excel('Apprentice Chef Dataset.xlsx')

# Read the raw Excel workbook into original_df
# (the file is looked up relative to the current working directory)
original_df = pd.read_excel(io = 'Apprentice_Chef_Dataset.xlsx')


################################################################################
# Feature Engineering and (optional) Dataset Standardization
################################################################################

# use this space for all of the feature engineering that is required for your
# final model

# Meal 4: flag customers whose median meal rating is exactly 4 (1 = yes, 0 = no).
# A boolean comparison cast to int is used instead of Series.replace() with a
# Series as `to_replace` and `inplace = True` on a column selection; that
# pattern depends on undocumented replace() behaviour and on chained in-place
# modification, neither of which is reliable across pandas versions.
med_meal_rat_4 = 4
condition      = original_df['MEDIAN_MEAL_RATING'] == med_meal_rat_4
original_df['med_meal_rat_4'] = condition.astype(int)

# Final Orders: meals ordered minus cancellations made before noon
original_df['final_orders'] = (original_df['TOTAL_MEALS_ORDERED']
                               - original_df['CANCELLATIONS_BEFORE_NOON'])

# Meals Options: unique meals purchased per meal ordered
original_df['meals_options'] = (original_df['UNIQUE_MEALS_PURCH']
                                / original_df['TOTAL_MEALS_ORDERED'])

# Total Orders Contact Cust Services: customer-service contacts per meal ordered
original_df['total_orders_custserv'] = (original_df['CONTACTS_W_CUSTOMER_SERVICE']
                                        / original_df['TOTAL_MEALS_ORDERED'])

# Natural log of revenue
# NOTE(review): assumes REVENUE is strictly positive and TOTAL_MEALS_ORDERED
# is never zero -- confirm against the dataset.
original_df['LOG_REVENUE'] = np.log(original_df['REVENUE'])

# if your final model requires dataset standardization, do this here as well

# Build the modelling frame: drop() returns a new DataFrame, so original_df
# keeps its raw values. The four name / e-mail columns are removed first.
df_scaled = original_df.drop(['NAME', 'EMAIL', 'FIRST_NAME', 'FAMILY_NAME'],
                             axis = 1)

#Scaling
# The response columns stay in their original units so that anything fitted
# on LOG_REVENUE keeps predicting on the log-revenue scale.
response_cols = ['REVENUE', 'LOG_REVENUE']
scale_cols    = [col for col in df_scaled.columns if col not in response_cols]

scaler = RobustScaler().fit(df_scaled[scale_cols])

# transform() returns a new array and does not modify its input, so the
# result has to be written back; it was previously discarded, which left
# df_scaled unscaled.
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split, so test rows influence the scaling statistics -- consider fitting
# on the training rows only.
df_scaled[scale_cols] = scaler.transform(df_scaled[scale_cols])


################################################################################
# Train/Test Split
################################################################################

# use this space to set up testing and validation sets using train/test split

# Note: Be sure to set test_size = 0.25

# Explanatory (feature) columns used by the final model.
# NOTE: despite its name, X holds column *names*, not the data itself.
X = ['TOTAL_MEALS_ORDERED',
     'UNIQUE_MEALS_PURCH',
     'CONTACTS_W_CUSTOMER_SERVICE',
     'AVG_TIME_PER_SITE_VISIT',
     'AVG_PREP_VID_TIME',
     'LARGEST_ORDER_SIZE',
     'MASTER_CLASSES_ATTENDED',
     'MEDIAN_MEAL_RATING',
     'TOTAL_PHOTOS_VIEWED',
     'meals_options',
     'med_meal_rat_4',
     'final_orders',
     'total_orders_custserv',
]

# Explanatory data and response variable.
# (The earlier `y = df_scaled.copy()` was removed: it copied the whole frame
# only to be overwritten here before any use.)
independent_data = df_scaled[X]
y                = df_scaled.loc[:, 'LOG_REVENUE']

# 75 / 25 train/test split; the fixed seed keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(independent_data,
                                                    y,
                                                    test_size    = 0.25,
                                                    random_state = 222)



################################################################################
# Final Model (instantiate, fit, and predict)
################################################################################

# use this space to instantiate, fit, and predict on your final model


#Gradient Boosting
# random_state is pinned (same seed as the train/test split) so that repeated
# runs build the same model and report the same scores.
gbt = GradientBoostingRegressor(
                                n_estimators     = 155,
                                max_depth        = 2,
                                min_samples_leaf = 114,
                                random_state     = 222
)

# fit() returns the fitted estimator, so gbt_model and gbt are the same object
gbt_model = gbt.fit(X_train, y_train)
y_pred    = gbt.predict(X_test)


# R^2 on the training and testing sets, rounded to three decimals.
# Built-in round() is used instead of the .round() method so this works
# whether score() returns a NumPy scalar or a plain Python float.
gbt_score_train = round(gbt.score(X_train, y_train), 3)
gbt_score_test  = round(gbt.score(X_test, y_test), 3)


################################################################################
# Final Model Score (score)
################################################################################

# use this space to score your final model on the testing set
# MAKE SURE TO SAVE YOUR TEST SCORE AS test_score
# Example: test_score = final_model.score(X_test, y_test)

# Unrounded score of the fitted gradient boosting model on the testing set
test_score = gbt.score(X = X_test, y = y_test)
