## Import ALS Model code

In [None]:
import als_model

In [58]:
%matplotlib inline

## Run dataprep code to prepare data; import geocoder and manually fix one zip code missing due to query limit

In [72]:
import geocoder

In [75]:
run dataprep.py

Carkeek Warmer Carkeek Park, Seattle, WA <[OVER_QUERY_LIMIT] Google - Geocode [empty]> None


In [76]:
#manually add the one missing zipcode
g = geocoder.google("Carkeek Park, Seattle, WA")

In [77]:
g

<[OK] Google - Geocode [Carkeek Park, 950 NW Carkeek Park Rd, Seattle, WA 98177, USA]>

In [91]:
cleaned_df['Venue_Zip2'] = cleaned_df[['Venue_Zip', 'Event_Name']].apply(lambda row: 98177 if row[1]=='Carkeek Warmer'
                                                                       else row[0], axis=1)

## Save cleaned_df for future use without having to rerun dataprep file

In [94]:
#pickle the cleaned_df for future use without having to re-run dataprep.py 
#(this will override the version automatically saved) by dataprep.py with the one with the corrected zipcode
with open ('cleaned_df.pkl', 'w') as f:
    pickle.dump(cleaned_df, f)
    
#to load this file in the future, run command:
#with open('cleaned_df.pkl', 'rb') as f:
    #dataframe = pickle.load(f)

## Instantiate Class Instance for ALS Model, Prep and Run:

In [106]:
als_model = als_model.implicit_als(cleaned_df)

In [107]:
als_model.prep_spark_full_df()

DataFrame[PersonID: bigint, EventID: bigint, Participated: bigint, Event_Date: bigint, SeriesID: double, EventTypeID: bigint, Total_Fee_Avg: bigint, Miles2_Avg: bigint, Venue_Zip: bigint]

In [108]:
als_model.spark_full_df.show(5)

+--------+-------+------------+-------------------+--------+-----------+-------------+----------+---------+
|PersonID|EventID|Participated|         Event_Date|SeriesID|EventTypeID|Total_Fee_Avg|Miles2_Avg|Venue_Zip|
+--------+-------+------------+-------------------+--------+-----------+-------------+----------+---------+
|       1|     11|           1|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       2|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       3|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       4|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       5|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
+--------+-------+------------+-------------------+--------+-----------+-------------+----------+---------+
only showing top 5 rows



In [109]:
als_model.train_val_test_split()

Train Size: 592450.0
Validation Size: 148113.0
Test Size: 185141.0


In [110]:
als_model.print_train_val_test_info("EventID")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


EventID in train: 56
EventID in validate: 14
EventID in test: 18


EventID in both train & validate: 1
EventID in both train & test: 0


In [111]:
als_model.create_participate_matrices("EventID")

In [112]:
als_model.run_ALS_TVS(event_param="EventID", scoring="r2")

Predictions on Training Data:
Predictions includes 592450 valid values and 0 nan values


Mean prediction is 0.0303024963926
Error of Type r2 = 0.317162022448
Predictions on Validation Data:
Predictions includes 10334 valid values and 137779 nan values


Mean prediction is 0.00201103141265
Error of Type r2 = -0.0142867802634


In [128]:
parammap = als_model.tvs_model.getEstimatorParamMaps()
avgmetrics = als_model.tvs_model.validationMetrics

zipped_tvs_model_info = zip(parammap, avgmetrics)

In [135]:
min(avgmetrics), max(avgmetrics)
#All combinations of parameters give the same validation number...how can this be?

(-0.14203634369376705, -0.14203634369376705)

## Save spark_full_df, bestmodel from ALS TVS run, and predictions on training data from that model

In [140]:
full_df_pd = als_model.spark_full_df.toPandas()
als_fitted_model_tvsr2 = als_model.tvs_bestmodel
als_trainpreds_tvsr2_pd = als_model.tvs_trainpreds.toPandas()

with open ("24OCT17_pipelinerun/full_df_pd.pkl", 'w') as f:
    pickle.dump(full_df_pd, f)

als_fitted_model_tvsr2.save("24OCT17_pipelinerun/als_fitted_model_tvsr2")

with open ("24OCT17_pipelinerun/als_trainpreds_tvsr2.pkl", 'w') as f:
    pickle.dump(als_trainpreds_tvsr2_pd, f)

#Note: to load the model saved above in future, run command:
#sameModel = GradientBoostedTreesModel.load("tvs_bestmodel_base")

In [206]:
als_train_df_pd = als_model.train.toPandas()
als_val_df_pd = als_model.validate.toPandas()
als_test_df_pd = als_model.test.toPandas()

In [207]:
with open ("24OCT17_pipelinerun/als_train_df_pd.pkl", 'w') as f:
    pickle.dump(als_train_df_pd, f)
    
with open ("24OCT17_pipelinerun/als_val_df_pd.pkl", 'w') as f:
    pickle.dump(als_val_df_pd, f)
    
with open ("24OCT17_pipelinerun/als_test_df_pd.pkl", 'w') as f:
    pickle.dump(als_test_df_pd, f)

## Import and run code to prepare training data for gradient boosted model

In [234]:
import prep_gbdata

In [235]:
gb_dataprep = prep_gbdata.RegressionDataPrep(spark_data_df=als_model.train, user_df=cleaned_df, datasplit='train', 
                                             als_predictions=als_model.tvs_trainpreds)

In [237]:
gb_dataprep.format_gb_data()

In [238]:
gb_dataprep.train_gb.head()

Unnamed: 0,Event_Date,Total_Fee_Avg,Miles2_Avg,AgeAvg,y_label,SeriesID_1.0,SeriesID_2.0,SeriesID_3.0,SeriesID_4.0,EventTypeID_2,...,Venue_Zip_98118,Venue_Zip_98177,Venue_Zip_98208,Venue_Zip_98239,Venue_Zip_98290,Venue_Zip_98332,Venue_Zip_98922,Venue_Zip_99032,Gender_Male,Gender_Other
0,1418428800000000000,39,8,36,0.000247,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
1,1418428800000000000,39,8,41,0.0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1418428800000000000,39,8,15,0.030626,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1418428800000000000,39,8,28,0.0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
4,1418428800000000000,39,8,20,0.0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [241]:
with open ("24OCT17_pipelinerun/gb_train_data.pkl", 'w') as f:
    pickle.dump(gb_dataprep.train_gb, f)

## Train Gradient Boosted Regressor out of the box using prepared data

In [242]:
from sklearn.ensemble import GradientBoostingRegressor

In [243]:
gb_model = GradientBoostingRegressor()

In [248]:
with open ("24OCT17_pipelinerun/gb_train_data.pkl", 'rb') as f:
    gb_train_data = pickle.load(f)

In [306]:
y = gb_train_data['y_label']
X = gb_train_data.drop('y_label', axis=1)

In [253]:
gb_model.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [309]:
gb_model.score(X, y)
#why is this so bad when run on the training data???  Note that this score method has 1.0 as the best possible score.
#could this be due to the imbalance in my data (vast majority of observations are close to zero)

0.024883492158110698

## Prepare Validation Data for Gradient Boosted Model

In [266]:
#prep validation data (note: need to go back and refit ALS model with train+validate once finalize it's tuning, then 
#will run the gb test on the final test data; for now using the validation data though...):
gb_valdataprep = prep_gbdata.RegressionDataPrep(spark_data_df=als_model.validate, user_df=cleaned_df, 
                                                datasplit='validate')

In [267]:
gb_valdataprep.format_gb_data()

In [269]:
with open ("24OCT17_pipelinerun/gb_val_data.pkl", 'w') as f:
    pickle.dump(gb_valdataprep.train_gb, f)

## Run Predict on Gradient Boosted Model Using Validation Data to Test Fit

In [270]:
y_val = gb_valdataprep.train_gb['y_label']
X_val = gb_valdataprep.train_gb.drop('y_label', axis=1)

In [287]:
cols_msg_in_val = set(X.columns) - set(X_val.columns)
cols_msg_in_val

set()

In [281]:
for col in cols_msg_in_val:
    X_val[col] = 0

In [289]:
set(X_val.columns) - set(X.columns)

{'Venue_Zip_98077'}

In [291]:
X_val.drop('Venue_Zip_98077', axis=1, inplace=True)

In [292]:
gb_predictions = gb_model.predict(X_val)

In [301]:
gb_model.score(X_val, y_val)

-0.032974955711363441