## Import ALS Model code

In [24]:
import als_model

In [2]:
%matplotlib inline

In [1]:
import cPickle as pickle

## Open cleaned_df

In [5]:
# Load the pickled output of the 24OCT17 pipeline run; it is the input handed to
# every als_model.implicit_als(...) instance created below.
# NOTE(review): pickle.load can execute arbitrary code, so only load files you trust.
with open ('24OCT17_pipelinerun/cleaned_df.pkl', 'rb') as f:
    cleaned_data = pickle.load(f)

## Instantiate Class Instances for ALS Model, Prep and Run for the Following:

#### For each of the combinations below, run for the repeatable event features:
- SeriesID
- EventTypeID
- Venue (based on zip code)
- Total Average Fee
- Average Mileage

#### Combinations:
- Do single fits (not TVS since it's giving me unclear results)
- Do the following variations for EventID and for SeriesID to compare the one-off versus a repeatable event feature
    - Vary alpha (10, 40, 80)
    - Vary factors (5, 10, 30)
    - Set coldStartStrategy="drop" for all
    - Calculate rank for each
    
#### Save Best Model and Predictions for Each

## SeriesID

In [9]:
als_model_series = als_model.implicit_als(cleaned_data)

In [11]:
als_model_series.prep_spark_full_df()

DataFrame[PersonID: bigint, EventID: bigint, Participated: bigint, Event_Date: bigint, SeriesID: double, EventTypeID: bigint, Total_Fee_Avg: bigint, Miles2_Avg: bigint, Venue_Zip: bigint]

In [12]:
als_model_series.spark_full_df.show(5)

+--------+-------+------------+-------------------+--------+-----------+-------------+----------+---------+
|PersonID|EventID|Participated|         Event_Date|SeriesID|EventTypeID|Total_Fee_Avg|Miles2_Avg|Venue_Zip|
+--------+-------+------------+-------------------+--------+-----------+-------------+----------+---------+
|       1|     11|           1|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       2|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       3|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       4|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
|       5|     11|           0|1423958400000000000|     0.0|          1|           46|        12|    98239|
+--------+-------+------------+-------------------+--------+-----------+-------------+----------+---------+
only showing top 5 rows



In [13]:
als_model_series.train_val_test_split()

Train Size: 592450.0
Validation Size: 148113.0
Test Size: 185141.0


In [14]:
als_model_series.print_train_val_test_info("SeriesID")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


SeriesID in train: 5
SeriesID in validate: 4
SeriesID in test: 5


SeriesID in both train & validate: 4
SeriesID in both train & test: 5


In [15]:
als_model_series.create_participate_matrices("SeriesID")

In [16]:
als_model_series.fit_ALS(itemCol="SeriesID", rank=10, alpha=40, coldStartStrategy="drop")

In [29]:
als_model_series.predict_ALS(als_model_series.base_model, "SeriesID")

Predictions includes 148113 valid values and 0 nan values
Model Rank = 10.2427872329 and Popular Rank = 14.6053110222


(DataFrame[PersonID: bigint, SeriesID: double, Participated: bigint, Event_Date: bigint, prediction: float],
 10.24278723287488,
 14.605311022189888)

In [31]:
# Grid search for the SeriesID model: refit once per (alpha, factor) pair and
# record the validation rank score next to the popularity-baseline rank score.
alphas = [10, 40, 80]
factors = [5, 10, 30]
results_list = []  # one (alpha, factor, val_rank, pop_rank) tuple per fit

for alpha in alphas:
    for factor in factors:
        # `factor` is handed to fit_ALS as its `rank` argument.
        als_model_series.fit_ALS(itemCol="SeriesID", rank=factor, alpha=alpha, coldStartStrategy="drop")
        # predict_ALS returns (predictions DataFrame, model rank score, popularity rank score).
        # After the loop, `predictions` holds only the last combination (alpha=80, factor=30).
        predictions, val_rank, pop_rank = als_model_series.predict_ALS(als_model_series.base_model, "SeriesID")
        results_list.append((alpha, factor, val_rank, pop_rank))

Predictions includes 148113 valid values and 0 nan values
Model Rank = 11.0127835595 and Popular Rank = 14.6053110222
Predictions includes 148113 valid values and 0 nan values
Model Rank = 12.3649384219 and Popular Rank = 14.6053110222
Predictions includes 148113 valid values and 0 nan values
Model Rank = 12.2174568933 and Popular Rank = 14.6053110222
Predictions includes 148113 valid values and 0 nan values
Model Rank = 8.50024480868 and Popular Rank = 14.6053110222
Predictions includes 148113 valid values and 0 nan values
Model Rank = 10.2427872329 and Popular Rank = 14.6053110222
Predictions includes 148113 valid values and 0 nan values
Model Rank = 11.6650748295 and Popular Rank = 14.6053110222
Predictions includes 148113 valid values and 0 nan values
Model Rank = 7.36118642851 and Popular Rank = 14.6053110222
Predictions includes 148113 valid values and 0 nan values
Model Rank = 8.67923627881 and Popular Rank = 14.6053110222
Predictions includes 148113 valid values and 0 nan value

In [32]:
results_list

[(10, 5, 11.012783559453879, 14.605311022189888),
 (10, 10, 12.36493842192254, 14.605311022189888),
 (10, 30, 12.217456893322337, 14.605311022189888),
 (40, 5, 8.5002448086761344, 14.605311022189888),
 (40, 10, 10.24278723287488, 14.605311022189888),
 (40, 30, 11.665074829468944, 14.605311022189888),
 (80, 5, 7.3611864285068007, 14.605311022189888),
 (80, 10, 8.6792362788095865, 14.605311022189888),
 (80, 30, 10.726324881342887, 14.605311022189888)]

#### The best model amongst these uses alpha = 80 and factors = 5 for a rank score of 7.36% as compared to a popularity-based rank score of 14.61% (our model is twice as good as recommending things based on popularity!).  Refit this model and save:

In [33]:
als_model_series.fit_ALS(itemCol="SeriesID", rank=5, alpha=80, coldStartStrategy="drop")

In [34]:
predictions, val_rank, pop_rank = als_model_series.predict_ALS(als_model_series.base_model, "SeriesID")

Predictions includes 148113 valid values and 0 nan values
Model Rank = 7.36118642851 and Popular Rank = 14.6053110222


In [36]:
# Persist the artifacts of the chosen SeriesID model (rank=5, alpha=80):
#   * the full Spark DataFrame, converted to pandas
#   * the fitted model itself (saved through its own .save())
#   * the validation-set predictions, converted to pandas
full_df_pd = als_model_series.spark_full_df.toPandas() #this only needs to be saved once today
als_fit_model_series = als_model_series.base_model
als_valpreds_pd_series = predictions.toPandas()

# Pickle streams are binary data: write with 'wb' so the files match the 'rb'
# mode used when they are loaded back later in this notebook.
with open("25OCT17_repeatablefeaturesrun/full_df_pd.pkl", 'wb') as f:
    pickle.dump(full_df_pd, f)

# NOTE(review): if base_model is a pyspark.ml model, save() raises when the target
# path already exists; use .write().overwrite().save(path) on reruns -- confirm.
als_fit_model_series.save("25OCT17_repeatablefeaturesrun/als_fit_model_series")

with open("25OCT17_repeatablefeaturesrun/als_valpreds_pd_series.pkl", 'wb') as f:
    pickle.dump(als_valpreds_pd_series, f)

#Note: to load the model saved above in future, use the loader of the same model
#class that was saved (presumably pyspark.ml.recommendation.ALSModel -- confirm
#against als_model.fit_ALS), e.g.:
#sameModel = ALSModel.load("filename")

In [37]:
#these only need to be saved once today:
als_train_df_pd = als_model_series.train.toPandas()
als_val_df_pd = als_model_series.validate.toPandas()
als_test_df_pd = als_model_series.test.toPandas()

In [38]:
# Persist the pandas copies of the train / validation / test splits.
# Pickle streams are binary data: write with 'wb' so the files match the 'rb'
# mode used when they are loaded back later in this notebook.
with open("25OCT17_repeatablefeaturesrun/als_train_df_pd.pkl", 'wb') as f:
    pickle.dump(als_train_df_pd, f)

with open("25OCT17_repeatablefeaturesrun/als_val_df_pd.pkl", 'wb') as f:
    pickle.dump(als_val_df_pd, f)

with open("25OCT17_repeatablefeaturesrun/als_test_df_pd.pkl", 'wb') as f:
    pickle.dump(als_test_df_pd, f)

## EventTypeID

In [42]:
als_model_eventtype = als_model.implicit_als(cleaned_data)

In [43]:
als_model_eventtype.prep_spark_full_df()

In [44]:
als_model_eventtype.train_val_test_split()

Train Size: 592450.0
Validation Size: 148113.0
Test Size: 185141.0


In [45]:
als_model_eventtype.print_train_val_test_info("EventTypeID")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


EventTypeID in train: 2
EventTypeID in validate: 1
EventTypeID in test: 2


EventTypeID in both train & validate: 1
EventTypeID in both train & test: 2


In [46]:
als_model_eventtype.create_participate_matrices("EventTypeID")

In [47]:
als_model_eventtype.fit_ALS(itemCol="EventTypeID", rank=10, alpha=40, coldStartStrategy="drop")

In [58]:
predictions, val_rank, pop_rank = als_model_eventtype.predict_ALS(als_model_eventtype.base_model, "EventTypeID")

Predictions includes 148113 valid values and 0 nan values
Model Rank = 100 and Popular Rank = 100


All predictions in this case are 0.0, meaning a very poor fit (rank function sets this to 100); realistically, there is no way to make top predictions for any user with this data.  It could be that this is just not a good feature to use; check grid search / cv below, but may throw this one out.

In [59]:
# Grid search for the EventTypeID model: refit once per (alpha, factor) pair and
# record the validation rank score next to the popularity-baseline rank score.
# The recorded runs only ever report 0.0 or 100 for both scores, so the ranking
# is degenerate for this feature.
alphas = [10, 40, 80]
factors = [5, 10, 30]
results_list = []  # one (alpha, factor, val_rank, pop_rank) tuple per fit

for alpha in alphas:
    for factor in factors:
        # `factor` is handed to fit_ALS as its `rank` argument.
        als_model_eventtype.fit_ALS(itemCol="EventTypeID", rank=factor, alpha=alpha, coldStartStrategy="drop")
        # After the loop, `predictions` holds only the last combination (alpha=80, factor=30);
        # the following cells inspect exactly that frame.
        predictions, val_rank, pop_rank = als_model_eventtype.predict_ALS(als_model_eventtype.base_model, "EventTypeID")
        results_list.append((alpha, factor, val_rank, pop_rank))

Predictions includes 148113 valid values and 0 nan values
Model Rank = 0.0 and Popular Rank = 0.0
Predictions includes 148113 valid values and 0 nan values
Model Rank = 0.0 and Popular Rank = 0.0
Predictions includes 148113 valid values and 0 nan values
Model Rank = 0.0 and Popular Rank = 0.0
Predictions includes 148113 valid values and 0 nan values
Model Rank = 0.0 and Popular Rank = 0.0
Predictions includes 148113 valid values and 0 nan values
Model Rank = 100 and Popular Rank = 100
Predictions includes 148113 valid values and 0 nan values
Model Rank = 0.0 and Popular Rank = 0.0
Predictions includes 148113 valid values and 0 nan values
Model Rank = 100 and Popular Rank = 100
Predictions includes 148113 valid values and 0 nan values
Model Rank = 100 and Popular Rank = 100
Predictions includes 148113 valid values and 0 nan values
Model Rank = 0.0 and Popular Rank = 0.0


In [60]:
results_list

[(10, 5, 0.0, 0.0),
 (10, 10, 0.0, 0.0),
 (10, 30, 0.0, 0.0),
 (40, 5, 0.0, 0.0),
 (40, 10, 100, 100),
 (40, 30, 0.0, 0.0),
 (80, 5, 100, 100),
 (80, 10, 100, 100),
 (80, 30, 0.0, 0.0)]

In [61]:
pd_test = predictions.toPandas()

In [68]:
pd_test['prediction'].describe()

count    148113.000000
mean          0.526357
std           0.494381
min           0.000000
25%           0.000000
50%           0.994047
75%           0.994047
max           1.005954
Name: prediction, dtype: float64

#### The best model amongst these is ambiguous given all of the zero ranks (and the fact that popularity-based model also gives a zero rank); choose alpha = 80 and factors = 30 for similarity to the SeriesID picks, amongst all of the zero rank models.  Note that this model looks to be potentially very overfit...keep in mind when doing gradient boosting - may still throw out (too closely correlated to 1/0 actuals?)
#### Refit this model and save:

In [69]:
als_model_eventtype.fit_ALS(itemCol="EventTypeID", rank=30, alpha=80, coldStartStrategy="drop")

In [70]:
predictions, val_rank, pop_rank = als_model_eventtype.predict_ALS(als_model_eventtype.base_model, "EventTypeID")

Predictions includes 148113 valid values and 0 nan values
Model Rank = 0.0 and Popular Rank = 0.0


In [71]:
# Persist the chosen EventTypeID model (rank=30, alpha=80) and its
# validation-set predictions (converted to pandas).
als_fit_model_eventtype = als_model_eventtype.base_model
als_valpreds_pd_eventtype = predictions.toPandas()

# NOTE(review): if base_model is a pyspark.ml model, save() raises when the target
# path already exists; use .write().overwrite().save(path) on reruns -- confirm.
als_fit_model_eventtype.save("25OCT17_repeatablefeaturesrun/als_fit_model_eventtype")

# Pickle streams are binary data: write with 'wb' to match the 'rb' mode used on load.
with open("25OCT17_repeatablefeaturesrun/als_valpreds_pd_eventtype.pkl", 'wb') as f:
    pickle.dump(als_valpreds_pd_eventtype, f)

## Venue (Based on Zipcode)

In [72]:
als_model_zipcode = als_model.implicit_als(cleaned_data)

In [73]:
als_model_zipcode.prep_spark_full_df()

In [74]:
als_model_zipcode.train_val_test_split()

Train Size: 592450.0
Validation Size: 148113.0
Test Size: 185141.0


In [76]:
als_model_zipcode.print_train_val_test_info("Venue_Zip")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


Venue_Zip in train: 24
Venue_Zip in validate: 10
Venue_Zip in test: 15


Venue_Zip in both train & validate: 8
Venue_Zip in both train & test: 15


In [77]:
als_model_zipcode.create_participate_matrices("Venue_Zip")

In [78]:
als_model_zipcode.fit_ALS(itemCol="Venue_Zip", rank=10, alpha=40, coldStartStrategy="drop")

In [79]:
predictions, val_rank, pop_rank = als_model_zipcode.predict_ALS(als_model_zipcode.base_model, "Venue_Zip")

Predictions includes 126585 valid values and 0 nan values
Model Rank = 14.7773883269 and Popular Rank = 34.8748473748


In [80]:
predictions.toPandas().head()

Unnamed: 0,PersonID,Venue_Zip,Participated,Event_Date,prediction
0,148,98177,0,1479513600000000000,0.902622
1,148,98177,0,1474675200000000000,0.902622
2,463,98177,0,1479513600000000000,0.027218
3,463,98177,0,1474675200000000000,0.027218
4,471,98177,0,1479513600000000000,0.100359


In [81]:
# Grid search for the Venue_Zip model: refit once per (alpha, factor) pair and
# record the validation rank score next to the popularity-baseline rank score.
alphas = [10, 40, 80]
factors = [5, 10, 30]
results_list = []  # one (alpha, factor, val_rank, pop_rank) tuple per fit

for alpha in alphas:
    for factor in factors:
        # `factor` is handed to fit_ALS as its `rank` argument.
        als_model_zipcode.fit_ALS(itemCol="Venue_Zip", rank=factor, alpha=alpha, coldStartStrategy="drop")
        # After the loop, `predictions` holds only the last combination (alpha=80, factor=30).
        predictions, val_rank, pop_rank = als_model_zipcode.predict_ALS(als_model_zipcode.base_model, "Venue_Zip")
        results_list.append((alpha, factor, val_rank, pop_rank))

Predictions includes 126585 valid values and 0 nan values
Model Rank = 14.5271803243 and Popular Rank = 34.8748473748
Predictions includes 126585 valid values and 0 nan values
Model Rank = 16.1116372646 and Popular Rank = 34.8748473748
Predictions includes 126585 valid values and 0 nan values
Model Rank = 17.0623408565 and Popular Rank = 34.8748473748
Predictions includes 126585 valid values and 0 nan values
Model Rank = 12.8025525165 and Popular Rank = 34.8748473748
Predictions includes 126585 valid values and 0 nan values
Model Rank = 14.7773883269 and Popular Rank = 34.8748473748
Predictions includes 126585 valid values and 0 nan values
Model Rank = 16.7087761862 and Popular Rank = 34.8748473748
Predictions includes 126585 valid values and 0 nan values
Model Rank = 11.991154793 and Popular Rank = 34.8748473748
Predictions includes 126585 valid values and 0 nan values
Model Rank = 13.827741567 and Popular Rank = 34.8748473748
Predictions includes 126585 valid values and 0 nan values


In [82]:
results_list

[(10, 5, 14.527180324319232, 34.874847374847377),
 (10, 10, 16.111637264590311, 34.874847374847377),
 (10, 30, 17.062340856530334, 34.874847374847377),
 (40, 5, 12.802552516462956, 34.874847374847377),
 (40, 10, 14.777388326934982, 34.874847374847377),
 (40, 30, 16.708776186221169, 34.874847374847377),
 (80, 5, 11.991154793030798, 34.874847374847377),
 (80, 10, 13.827741567025113, 34.874847374847377),
 (80, 30, 15.763569588440212, 34.874847374847377)]

#### The best model amongst these uses alpha = 80 and factors = 5 for a rank score of 11.99% as compared to a popularity-based rank score of 34.87% (our model is almost three times as good as recommending things based on popularity!).  Refit this model and save:

In [83]:
als_model_zipcode.fit_ALS(itemCol="Venue_Zip", rank=5, alpha=80, coldStartStrategy="drop")

In [84]:
predictions, val_rank, pop_rank = als_model_zipcode.predict_ALS(als_model_zipcode.base_model, "Venue_Zip")

Predictions includes 126585 valid values and 0 nan values
Model Rank = 11.991154793 and Popular Rank = 34.8748473748


In [85]:
# Persist the chosen Venue_Zip model (rank=5, alpha=80) and its
# validation-set predictions (converted to pandas).
als_fit_model_zipcode = als_model_zipcode.base_model
als_valpreds_pd_zipcode = predictions.toPandas()

# NOTE(review): if base_model is a pyspark.ml model, save() raises when the target
# path already exists; use .write().overwrite().save(path) on reruns -- confirm.
als_fit_model_zipcode.save("25OCT17_repeatablefeaturesrun/als_fit_model_zipcode")

# Pickle streams are binary data: write with 'wb' to match the 'rb' mode used on load.
with open("25OCT17_repeatablefeaturesrun/als_valpreds_pd_zipcode.pkl", 'wb') as f:
    pickle.dump(als_valpreds_pd_zipcode, f)

## Total Average Fee

In [86]:
als_model_avgfee = als_model.implicit_als(cleaned_data)

In [87]:
als_model_avgfee.prep_spark_full_df()

In [88]:
als_model_avgfee.train_val_test_split()

Train Size: 592450.0
Validation Size: 148113.0
Test Size: 185141.0


In [89]:
als_model_avgfee.print_train_val_test_info("Total_Fee_Avg")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


Total_Fee_Avg in train: 20
Total_Fee_Avg in validate: 11
Total_Fee_Avg in test: 12


Total_Fee_Avg in both train & validate: 8
Total_Fee_Avg in both train & test: 8


In [90]:
als_model_avgfee.create_participate_matrices("Total_Fee_Avg")

In [91]:
als_model_avgfee.fit_ALS(itemCol="Total_Fee_Avg", rank=10, alpha=40, coldStartStrategy="drop")

In [92]:
predictions, val_rank, pop_rank = als_model_avgfee.predict_ALS(als_model_avgfee.base_model, "Total_Fee_Avg")

Predictions includes 115821 valid values and 0 nan values
Model Rank = 17.0182112752 and Popular Rank = 21.788489558


In [93]:
predictions.toPandas().head()

Unnamed: 0,PersonID,Total_Fee_Avg,Participated,Event_Date,prediction
0,148,34,0,1483228800000000000,0.920413
1,463,34,0,1483228800000000000,0.095284
2,471,34,0,1483228800000000000,0.02315
3,496,34,0,1483228800000000000,0.949894
4,833,34,0,1483228800000000000,0.035321


In [94]:
# Grid search for the Total_Fee_Avg model: refit once per (alpha, factor) pair and
# record the validation rank score next to the popularity-baseline rank score.
alphas = [10, 40, 80]
factors = [5, 10, 30]
results_list = []  # one (alpha, factor, val_rank, pop_rank) tuple per fit

for alpha in alphas:
    for factor in factors:
        # `factor` is handed to fit_ALS as its `rank` argument.
        als_model_avgfee.fit_ALS(itemCol="Total_Fee_Avg", rank=factor, alpha=alpha, coldStartStrategy="drop")
        # After the loop, `predictions` holds only the last combination (alpha=80, factor=30).
        predictions, val_rank, pop_rank = als_model_avgfee.predict_ALS(als_model_avgfee.base_model, "Total_Fee_Avg")
        results_list.append((alpha, factor, val_rank, pop_rank))

Predictions includes 115821 valid values and 0 nan values
Model Rank = 19.5897912984 and Popular Rank = 21.788489558
Predictions includes 115821 valid values and 0 nan values
Model Rank = 21.0296417448 and Popular Rank = 21.788489558
Predictions includes 115821 valid values and 0 nan values
Model Rank = 21.4842829622 and Popular Rank = 21.788489558
Predictions includes 115821 valid values and 0 nan values
Model Rank = 17.6040700144 and Popular Rank = 21.788489558
Predictions includes 115821 valid values and 0 nan values
Model Rank = 17.0182112752 and Popular Rank = 21.788489558
Predictions includes 115821 valid values and 0 nan values
Model Rank = 20.0773289617 and Popular Rank = 21.788489558
Predictions includes 115821 valid values and 0 nan values
Model Rank = 14.5327851872 and Popular Rank = 21.788489558
Predictions includes 115821 valid values and 0 nan values
Model Rank = 15.1718778364 and Popular Rank = 21.788489558
Predictions includes 115821 valid values and 0 nan values
Model 

In [95]:
results_list

[(10, 5, 19.589791298439604, 21.788489558037881),
 (10, 10, 21.02964174478463, 21.788489558037881),
 (10, 30, 21.484282962217193, 21.788489558037881),
 (40, 5, 17.604070014440126, 21.788489558037881),
 (40, 10, 17.018211275202308, 21.788489558037881),
 (40, 30, 20.077328961664726, 21.788489558037881),
 (80, 5, 14.532785187242714, 21.788489558037881),
 (80, 10, 15.171877836403693, 21.788489558037881),
 (80, 30, 20.69396794064744, 21.788489558037881)]

#### The best model amongst these uses alpha = 80 and factors = 5 for a rank score of 14.53% as compared to a popularity-based rank score of 21.79% (our model is about 1.5 times as good as recommending things based on popularity!).  Refit this model and save:

In [96]:
als_model_avgfee.fit_ALS(itemCol="Total_Fee_Avg", rank=5, alpha=80, coldStartStrategy="drop")

In [97]:
predictions, val_rank, pop_rank = als_model_avgfee.predict_ALS(als_model_avgfee.base_model, "Total_Fee_Avg")

Predictions includes 115821 valid values and 0 nan values
Model Rank = 14.5327851872 and Popular Rank = 21.788489558


In [98]:
# Persist the chosen Total_Fee_Avg model (rank=5, alpha=80) and its
# validation-set predictions (converted to pandas).
als_fit_model_avgfee = als_model_avgfee.base_model
als_valpreds_pd_avgfee = predictions.toPandas()

# NOTE(review): if base_model is a pyspark.ml model, save() raises when the target
# path already exists; use .write().overwrite().save(path) on reruns -- confirm.
als_fit_model_avgfee.save("25OCT17_repeatablefeaturesrun/als_fit_model_avgfee")

# Pickle streams are binary data: write with 'wb' to match the 'rb' mode used on load.
with open("25OCT17_repeatablefeaturesrun/als_valpreds_pd_avgfee.pkl", 'wb') as f:
    pickle.dump(als_valpreds_pd_avgfee, f)

## Average Mileage

In [99]:
als_model_avgmile = als_model.implicit_als(cleaned_data)

In [100]:
als_model_avgmile.prep_spark_full_df()

In [101]:
als_model_avgmile.train_val_test_split()

Train Size: 592450.0
Validation Size: 148113.0
Test Size: 185141.0


In [102]:
als_model_avgmile.print_train_val_test_info("Miles2_Avg")

participants in train: 10764
participants in validate: 10764
participants in test: 10764


participants in both train & validate: 10764
participants in both train & test: 10764


Miles2_Avg in train: 11
Miles2_Avg in validate: 5
Miles2_Avg in test: 10


Miles2_Avg in both train & validate: 5
Miles2_Avg in both train & test: 7


In [103]:
als_model_avgmile.create_participate_matrices("Miles2_Avg")

In [104]:
als_model_avgmile.fit_ALS(itemCol="Miles2_Avg", rank=10, alpha=40, coldStartStrategy="drop")

In [105]:
predictions, val_rank, pop_rank = als_model_avgmile.predict_ALS(als_model_avgmile.base_model, "Miles2_Avg")

Predictions includes 148113 valid values and 0 nan values
Model Rank = 12.5909255867 and Popular Rank = 29.1960712987


In [106]:
predictions.toPandas().head()

Unnamed: 0,PersonID,Miles2_Avg,Participated,Event_Date,prediction
0,148,5,0,1483228800000000000,0.936979
1,148,5,0,1481932800000000000,0.936979
2,148,5,0,1478304000000000000,0.936979
3,463,5,0,1483228800000000000,0.105175
4,463,5,0,1481932800000000000,0.105175


In [107]:
# Grid search for the Miles2_Avg model: refit once per (alpha, factor) pair and
# record the validation rank score next to the popularity-baseline rank score.
alphas = [10, 40, 80]
factors = [5, 10, 30]
results_list = []  # one (alpha, factor, val_rank, pop_rank) tuple per fit

for alpha in alphas:
    for factor in factors:
        # `factor` is handed to fit_ALS as its `rank` argument.
        als_model_avgmile.fit_ALS(itemCol="Miles2_Avg", rank=factor, alpha=alpha, coldStartStrategy="drop")
        # After the loop, `predictions` holds only the last combination (alpha=80, factor=30).
        predictions, val_rank, pop_rank = als_model_avgmile.predict_ALS(als_model_avgmile.base_model, "Miles2_Avg")
        results_list.append((alpha, factor, val_rank, pop_rank))

Predictions includes 148113 valid values and 0 nan values
Model Rank = 15.270046008 and Popular Rank = 29.1960712987
Predictions includes 148113 valid values and 0 nan values
Model Rank = 15.8309030651 and Popular Rank = 29.1960712987
Predictions includes 148113 valid values and 0 nan values
Model Rank = 14.6571414199 and Popular Rank = 29.1960712987
Predictions includes 148113 valid values and 0 nan values
Model Rank = 12.9935240924 and Popular Rank = 29.1960712987
Predictions includes 148113 valid values and 0 nan values
Model Rank = 12.5909255867 and Popular Rank = 29.1960712987
Predictions includes 148113 valid values and 0 nan values
Model Rank = 13.6516023017 and Popular Rank = 29.1960712987
Predictions includes 148113 valid values and 0 nan values
Model Rank = 10.8299333015 and Popular Rank = 29.1960712987
Predictions includes 148113 valid values and 0 nan values
Model Rank = 12.582789844 and Popular Rank = 29.1960712987
Predictions includes 148113 valid values and 0 nan values


In [108]:
results_list

[(10, 5, 15.270046007997625, 29.196071298654054),
 (10, 10, 15.83090306511818, 29.196071298654054),
 (10, 30, 14.657141419873492, 29.196071298654054),
 (40, 5, 12.993524092357028, 29.196071298654054),
 (40, 10, 12.590925586746046, 29.196071298654054),
 (40, 30, 13.651602301725573, 29.196071298654054),
 (80, 5, 10.829933301479221, 29.196071298654054),
 (80, 10, 12.582789844046216, 29.196071298654054),
 (80, 30, 13.010303727315177, 29.196071298654054)]

#### The best model amongst these uses alpha = 80 and factors = 5 for a rank score of 10.83% as compared to a popularity-based rank score of 29.20% (our model is almost three times as good as recommending things based on popularity!).  Refit this model and save:

In [109]:
als_model_avgmile.fit_ALS(itemCol="Miles2_Avg", rank=5, alpha=80, coldStartStrategy="drop")

In [110]:
predictions, val_rank, pop_rank = als_model_avgmile.predict_ALS(als_model_avgmile.base_model, "Miles2_Avg")

Predictions includes 148113 valid values and 0 nan values
Model Rank = 10.8299333015 and Popular Rank = 29.1960712987


In [111]:
# Persist the chosen Miles2_Avg model (rank=5, alpha=80) and its
# validation-set predictions (converted to pandas).
als_fit_model_avgmile = als_model_avgmile.base_model
als_valpreds_pd_avgmile = predictions.toPandas()

# NOTE(review): if base_model is a pyspark.ml model, save() raises when the target
# path already exists; use .write().overwrite().save(path) on reruns -- confirm.
als_fit_model_avgmile.save("25OCT17_repeatablefeaturesrun/als_fit_model_avgmile")

# Pickle streams are binary data: write with 'wb' to match the 'rb' mode used on load.
with open("25OCT17_repeatablefeaturesrun/als_valpreds_pd_avgmile.pkl", 'wb') as f:
    pickle.dump(als_valpreds_pd_avgmile, f)

## Build Gradient Boost Ensemble Using ALS Models Predictions as Inputs to Determine Weights for Each

#### DO LATER - NEED TO ALTER CODE TO SUPPORT: Rerun Each ALS with Train + Validate Data for Final Fitted Model & Get Predictions on Dataset

#### Build X, y Matrices for GB Model (for now using predictions on validation data only)

In [63]:
with open ('25OCT17_repeatablefeaturesrun/als_val_df_pd.pkl', 'rb') as f:
    als_val_df_pd = pickle.load(f)

In [64]:
als_val_df_pd.head()

Unnamed: 0,PersonID,EventID,Participated,Event_Date,SeriesID,EventTypeID,Total_Fee_Avg,Miles2_Avg,Venue_Zip
0,4695,67,0,1486252800000000000,1.0,1,30,4,98102
1,4696,67,0,1486252800000000000,1.0,1,30,4,98102
2,2299,67,0,1486252800000000000,1.0,1,30,4,98102
3,4697,67,0,1486252800000000000,1.0,1,30,4,98102
4,4698,67,0,1486252800000000000,1.0,1,30,4,98102


In [65]:
with open ('25OCT17_repeatablefeaturesrun/als_valpreds_pd_series.pkl', 'rb') as f:
    als_valpreds_pd_series = pickle.load(f)

In [66]:
als_valpreds_pd_series.head()

Unnamed: 0,PersonID,SeriesID,Participated,Event_Date,prediction
0,148,1.0,0,1486252800000000000,0.984482
1,148,1.0,0,1484956800000000000,0.984482
2,148,1.0,0,1483228800000000000,0.984482
3,148,1.0,0,1481932800000000000,0.984482
4,148,1.0,0,1480723200000000000,0.984482


In [67]:
import pandas as pd
# Left-join the SeriesID model's validation predictions onto the validation rows,
# keep the score under the name `series_prediction`, and drop the item column
# that was only needed as a join key.
gb_data = (
    als_val_df_pd
    .merge(als_valpreds_pd_series, how='left',
           on=['PersonID', 'Participated', 'Event_Date', 'SeriesID'])
    .rename(columns={'prediction': 'series_prediction'})
    .drop('SeriesID', axis=1)
)

In [68]:
gb_data.head()

Unnamed: 0,PersonID,EventID,Participated,Event_Date,EventTypeID,Total_Fee_Avg,Miles2_Avg,Venue_Zip,series_prediction
0,4695,67,0,1486252800000000000,1,30,4,98102,0.215541
1,4696,67,0,1486252800000000000,1,30,4,98102,0.213122
2,2299,67,0,1486252800000000000,1,30,4,98102,0.963359
3,4697,67,0,1486252800000000000,1,30,4,98102,0.213122
4,4698,67,0,1486252800000000000,1,30,4,98102,0.213122


In [69]:
with open('25OCT17_repeatablefeaturesrun/als_valpreds_pd_eventtype.pkl', 'rb') as f:
    als_valpreds_pd_eventtype = pickle.load(f)

In [70]:
als_valpreds_pd_eventtype.head()

Unnamed: 0,PersonID,EventTypeID,Participated,Event_Date,prediction
0,148,1,0,1486252800000000000,1.003905
1,148,1,0,1484956800000000000,1.003905
2,148,1,0,1483228800000000000,1.003905
3,148,1,0,1481932800000000000,1.003905
4,148,1,0,1480723200000000000,1.003905


In [71]:
# Left-join the EventTypeID model's validation predictions, keep the score as
# `eventtype_prediction`, and drop the join-only EventTypeID column.
gb_data = (
    gb_data
    .merge(als_valpreds_pd_eventtype, how='left',
           on=['PersonID', 'Participated', 'Event_Date', 'EventTypeID'])
    .rename(columns={'prediction': 'eventtype_prediction'})
    .drop('EventTypeID', axis=1)
)

In [72]:
with open('25OCT17_repeatablefeaturesrun/als_valpreds_pd_zipcode.pkl', 'rb') as f:
    als_valpreds_pd_zipcode = pickle.load(f)

In [73]:
als_valpreds_pd_zipcode.head()

Unnamed: 0,PersonID,Venue_Zip,Participated,Event_Date,prediction
0,148,98177,0,1479513600000000000,0.881846
1,148,98177,0,1474675200000000000,0.881846
2,463,98177,0,1479513600000000000,0.329434
3,463,98177,0,1474675200000000000,0.329434
4,471,98177,0,1479513600000000000,0.888514


In [74]:
# Left-join the Venue_Zip model's validation predictions, keep the score as
# `venuezip_prediction`, and drop the join-only Venue_Zip column.
# Rows without a matching prediction end up as NaN here.
gb_data = (
    gb_data
    .merge(als_valpreds_pd_zipcode, how='left',
           on=['PersonID', 'Participated', 'Event_Date', 'Venue_Zip'])
    .rename(columns={'prediction': 'venuezip_prediction'})
    .drop('Venue_Zip', axis=1)
)

In [75]:
with open('25OCT17_repeatablefeaturesrun/als_valpreds_pd_avgfee.pkl', 'rb') as f:
    als_valpreds_pd_avgfee = pickle.load(f)

In [76]:
als_valpreds_pd_avgfee.head()

Unnamed: 0,PersonID,Total_Fee_Avg,Participated,Event_Date,prediction
0,148,34,0,1483228800000000000,0.925915
1,463,34,0,1483228800000000000,0.041237
2,471,34,0,1483228800000000000,0.008612
3,496,34,0,1483228800000000000,0.944936
4,833,34,0,1483228800000000000,0.011816


In [77]:
# Left-join the Total_Fee_Avg model's validation predictions, keep the score as
# `fee_prediction`, and drop the join-only Total_Fee_Avg column.
# Rows without a matching prediction end up as NaN here.
gb_data = (
    gb_data
    .merge(als_valpreds_pd_avgfee, how='left',
           on=['PersonID', 'Participated', 'Event_Date', 'Total_Fee_Avg'])
    .rename(columns={'prediction': 'fee_prediction'})
    .drop('Total_Fee_Avg', axis=1)
)

In [78]:
with open('25OCT17_repeatablefeaturesrun/als_valpreds_pd_avgmile.pkl', 'rb') as f:
    als_valpreds_pd_avgmile = pickle.load(f)

In [79]:
als_valpreds_pd_avgmile.head()

Unnamed: 0,PersonID,Miles2_Avg,Participated,Event_Date,prediction
0,148,5,0,1483228800000000000,0.933645
1,148,5,0,1481932800000000000,0.933645
2,148,5,0,1478304000000000000,0.933645
3,463,5,0,1483228800000000000,0.132972
4,463,5,0,1481932800000000000,0.132972


In [80]:
# Left-join the Miles2_Avg model's validation predictions, keep the score as
# `miles_prediction`, and drop the join-only Miles2_Avg column.
gb_data = (
    gb_data
    .merge(als_valpreds_pd_avgmile, how='left',
           on=['PersonID', 'Participated', 'Event_Date', 'Miles2_Avg'])
    .rename(columns={'prediction': 'miles_prediction'})
    .drop('Miles2_Avg', axis=1)
)

In [81]:
gb_data.sort_values(by='Event_Date', axis=0, ascending=True, inplace=True)

In [83]:
gb_data.reset_index(drop=True, inplace=True)

In [84]:
gb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148113 entries, 0 to 148112
Data columns (total 9 columns):
PersonID                148113 non-null int64
EventID                 148113 non-null int64
Participated            148113 non-null int64
Event_Date              148113 non-null int64
series_prediction       148113 non-null float64
eventtype_prediction    148113 non-null float64
venuezip_prediction     126585 non-null float64
fee_prediction          115821 non-null float64
miles_prediction        148113 non-null float64
dtypes: float64(5), int64(4)
memory usage: 10.2 MB


In [85]:
# venuezip_prediction and fee_prediction contain NaNs after the left merges: those
# ALS models returned fewer validation predictions (126585 and 115821 of 148113 rows)
# with coldStartStrategy="drop". Impute the gaps with each column's mean.
# The mean is computed once per column (NaNs are skipped by .mean()); the previous
# apply/lambda form recomputed it for every row.
# NOTE(review): the mean is taken over all rows, including the ones later used as the
# gradient-boosting test split -- consider computing it from the training rows only.
gb_data['venuezip_prediction'] = gb_data['venuezip_prediction'].fillna(
                            gb_data['venuezip_prediction'].mean())
gb_data['fee_prediction'] = gb_data['fee_prediction'].fillna(
                            gb_data['fee_prediction'].mean())

In [86]:
gb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148113 entries, 0 to 148112
Data columns (total 9 columns):
PersonID                148113 non-null int64
EventID                 148113 non-null int64
Participated            148113 non-null int64
Event_Date              148113 non-null int64
series_prediction       148113 non-null float64
eventtype_prediction    148113 non-null float64
venuezip_prediction     148113 non-null float64
fee_prediction          148113 non-null float64
miles_prediction        148113 non-null float64
dtypes: float64(5), int64(4)
memory usage: 10.2 MB


In [87]:
len(gb_data)*.8, len(gb_data)*.2

(118490.40000000001, 29622.600000000002)

Split data on first 118491 records for train (sorted by event_date as timestamp) and last 29622 records for test

In [88]:
X_gb_train = gb_data[['series_prediction', 'eventtype_prediction', 'venuezip_prediction', 'fee_prediction', 
                      'miles_prediction']].iloc[:118491].copy()

In [95]:
y_gb_train = gb_data['Participated'].iloc[:118491].copy()

In [96]:
len(X_gb_train), len(y_gb_train)

(118491, 118491)

In [97]:
X_gb_test = gb_data[['series_prediction', 'eventtype_prediction', 'venuezip_prediction', 'fee_prediction', 
                      'miles_prediction']].iloc[118491:].copy()

In [98]:
y_gb_test = gb_data['Participated'].iloc[118491:].copy()

In [99]:
len(X_gb_test), len(y_gb_test)

(29622, 29622)

In [100]:
# Persist the assembled gradient-boosting dataset (ALS predictions + labels).
# Pickle streams are binary data: write with 'wb' to match the 'rb' mode used
# elsewhere in this notebook when pickles are loaded.
with open('25OCT17_repeatablefeaturesrun/gb_data.pkl', 'wb') as f:
    pickle.dump(gb_data, f)

## Train Gradient Boosted Regressor out of the box using prepared data

In [101]:
from sklearn.ensemble import GradientBoostingRegressor

In [102]:
# "Out of the box" regressor: every hyper-parameter is left at its default.
# NOTE(review): random_state is not set (the fitted repr shows random_state=None),
# so refits are not pinned to a seed; consider passing one for reproducibility.
gb_model = GradientBoostingRegressor()

In [103]:
gb_model.fit(X_gb_train, y_gb_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [108]:
gb_model.score(X_gb_test, y_gb_test) #this uses r^2 for scoring....not helpful in my situation

0.028653648841104103

In [109]:
import rank_eval

The df needs to include columns for user, item, actual outcome, and predicted outcome
    user, item, actual, prediction - strings indicating the name of each column in
    the predictions file for use in the function

In [110]:
gb_predictions = gb_model.predict(X_gb_test)

In [121]:
gb_data.tail()

Unnamed: 0,PersonID,EventID,Participated,Event_Date,series_prediction,eventtype_prediction,venuezip_prediction,fee_prediction,miles_prediction
148108,3740,67,0,1486252800000000000,0.963359,0.994047,0.147309,0.639008,0.740542
148109,7650,67,0,1486252800000000000,0.0,0.0,0.0,0.0,0.0
148110,458,67,0,1486252800000000000,0.228323,0.994047,0.147309,0.14781,0.740542
148111,7643,67,0,1486252800000000000,0.0,0.0,0.0,0.0,0.0
148112,4695,67,0,1486252800000000000,0.215541,0.017411,0.0,0.336714,0.227744


In [122]:
X_gb_test.tail()

Unnamed: 0,series_prediction,eventtype_prediction,venuezip_prediction,fee_prediction,miles_prediction
148108,0.963359,0.994047,0.147309,0.639008,0.740542
148109,0.0,0.0,0.0,0.0,0.0
148110,0.228323,0.994047,0.147309,0.14781,0.740542
148111,0.0,0.0,0.0,0.0,0.0
148112,0.215541,0.017411,0.0,0.336714,0.227744


In [126]:
# Assemble the frame handed to rank_eval.RankEval: user (PersonID), item (EventID),
# actual outcome (Participated) and predicted outcome (prediction) for the test rows.
# The PersonID/EventID slices are Series that keep their index labels (118491 onward);
# y_gb_test.values and gb_predictions are passed as plain arrays and line up by position,
# which works because all four come from the same .iloc[118491:] slice of gb_data.
rankeval_testpreds_df = pd.DataFrame({'PersonID': gb_data['PersonID'].iloc[118491:], 'EventID': 
                                      gb_data['EventID'].iloc[118491:], 'Participated': y_gb_test.values, 
                                      'prediction': gb_predictions})

In [137]:
rankeval_testpreds_df.head()

Unnamed: 0,EventID,Participated,PersonID,prediction
118491,64,0,3501,0.004708
118492,64,0,3499,0.004708
118493,64,0,3498,0.004708
118494,64,0,3497,0.004708
118495,64,0,3514,0.001085


In [128]:
rank_processing = rank_eval.RankEval(rankeval_testpreds_df, 'PersonID', 'EventID', 'Participated', 'prediction')

In [129]:
rank_processing.calc_test_rank()

6.7685320254789945

In [130]:
rank_processing.calc_popular_rank()

30.009633911368006

In [131]:
gb_model.get_params()

{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

In [134]:
zip(gb_model.feature_importances_, X_gb_train.columns.values)

[(0.25867513383145935, 'series_prediction'),
 (0.24726754109597163, 'eventtype_prediction'),
 (0.21314398993566183, 'venuezip_prediction'),
 (0.12372515314074169, 'fee_prediction'),
 (0.15718818199616572, 'miles_prediction')]

## Next Steps:
- Alter Code to Support and Rerun Each ALS with Train + Validate Data for Final Fitted Model & Get Predictions on larger dataset; run another test on test data held out previously
- Rerun gradient boosted model using train+val larger dataset to train and test set to test
- Write code to look at events recommended most often in top 10 recommendations
- Write code to look at the users most likely to be interested in a given event (get list of users for which event is listed in their top 10 recommendations)
- Make graphs for presentation showing how I fit my ALS models and the value of the rank eval metric