In [1]:
from pymongo import MongoClient
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from ggplot import *
from sklearn.externals import joblib

In [2]:
import pyspark
from pyspark.sql.types import *
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Build our Spark Session and Context.
# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second session.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# Bare tuple as the last expression so the notebook displays both handles.
spark, sc

(<pyspark.sql.session.SparkSession at 0x7ff20fd8a950>,
 <pyspark.context.SparkContext at 0x7ff20fccd910>)

In [4]:
# Read the route and user tables from their tab-separated dumps.
# Each file carries an 'Unnamed: 0' column (presumably the index written out
# when the frame was saved -- confirm) which is discarded right after loading.
routes_df, users_df = (
    pd.read_csv(csv_name, sep='\t').drop('Unnamed: 0', axis=1)
    for csv_name in ("routes_df.csv", "users_df.csv")
)

In [5]:
# Recommendations are produced for blakeherrington, picked because he is an
# author of climbing books.
is_target_user = users_df['name'] == 'blakeherrington'

# Keep only that user's row(s), renumber from 0, and drop the columns that
# are identifiers rather than features ('index' is the column reset_index()
# creates from the old row labels).
user_info = (
    users_df[is_target_user]
    .reset_index()
    .drop(['index', 'name', 'id'], axis=1)
)
user_info.head()

Unnamed: 0,age,compliments,likes_gym,likes_sport,likes_tr,likes_trad,member_since,point_rank,total_points,aid_follows_ C0,...,state_ or,state_ pa,state_ ri,state_ tx,state_ ut,state_ va,state_ wa,state_ wy,male,female
0,30.0,0,1,1,1,1,2006-12-12,691.0,990,0,...,0,0,0,0,0,0,0,0,1,0


## ALS model predictions

In [6]:
# User id to score every route for.
# NOTE(review): presumably the ALS-side id of the user selected above --
# confirm against the id mapping used when the ALS model was trained.
TARGET_USER_ID = 560

# One (route_id, user_id) pair per route in routes_df: the same user paired
# with every route. Column order (route_id, user_id) is kept as before.
data = pd.DataFrame({'route_id': routes_df['id']})
data['user_id'] = TARGET_USER_ID  # scalar broadcasts to every row

# Despite its name, this Spark frame has one row per route, not a single row.
one_row_spark_df = spark.createDataFrame(data)

In [7]:
# Location of the saved ALS model, relative to the notebook's working directory.
path = '../data/alsmodel_val'
# Restore the fitted model from disk instead of re-training it here.
recommender = ALSModel.load(path)

In [8]:
# Get the recommender's prediction.
# transform() returns the input pairs with a 'prediction' column appended.
# Some predictions come back as NaN (visible in the output below) --
# presumably ids the model never saw during training; confirm.
recs = recommender.transform(one_row_spark_df)
# Print a preview (first 20 rows) of the scored pairs.
recs.show()

+--------+-------+----------+
|route_id|user_id|prediction|
+--------+-------+----------+
|       0|    560| 0.7462977|
|       1|    560| 0.7462977|
|       2|    560| 1.2191802|
|       3|    560| 2.0401955|
|       4|    560| 1.4703317|
|       5|    560| 1.7931719|
|       6|    560| 1.4951665|
|       7|    560| 2.2380855|
|       8|    560|0.99331856|
|       9|    560| 3.3571277|
|      10|    560|   1.67003|
|      11|    560| 2.1294944|
|      12|    560| 1.3065122|
|      13|    560|       NaN|
|      14|    560| 1.5387775|
|      15|    560| 2.2380855|
|      16|    560|  1.282417|
|      17|    560| 1.6528205|
|      18|    560| 2.7041388|
|      19|    560|       NaN|
+--------+-------+----------+
only showing top 20 rows



## Gradient Boosting Predictions

In [12]:
# Load the saved model used for the gradient-boosting predictions below
# (presumably a fitted GradientBoostingRegressor -- confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load dumps that come from a trusted source.
gb = joblib.load('../pickle/gb_model_val.pkl') 

In [13]:
# Score every route for the selected user with the gradient-boosting model.
#
# A feature row is the user's columns followed by the route's columns
# (without 'id'), with 'member_since' removed before prediction.
#
# All feature rows are assembled up front and scored with a single predict()
# call, rather than re-filtering routes_df and calling predict() once per
# route (which scanned the whole frame for every route).

# For each id use the first row carrying that id, then look that row up once
# per entry of routes_df['id'] so the result stays positionally aligned with
# routes_df even when an id occurs more than once.
first_route_by_id = routes_df.drop_duplicates(subset='id').set_index('id')
route_features = first_route_by_id.loc[routes_df['id'].values].reset_index(drop=True)

# Repeat the user's feature row once per route.
# NOTE(review): assumes user_info holds exactly one row (one matching user).
user_features = user_info.iloc[[0] * len(route_features)].reset_index(drop=True)

gb_features = user_features.join(route_features).drop('member_since', axis=1)

# Keep the historical shape of gb_predictions -- a list with one 1-element
# array per route -- so any cell that indexes it as gb_predictions[i][0] or
# wraps it in pd.DataFrame(...) keeps working.
if len(gb_features):
    gb_predictions = list(gb.predict(gb_features).reshape(-1, 1))
else:
    gb_predictions = []

In [14]:
# Blend the ALS score, the gradient-boosting score and a popularity bonus
# into one ranking, then show the six best routes.

# Share of the blended score given to each model.
ALS_WEIGHT = 0.8
GB_WEIGHT = 0.2
# weight based on model
# NOTE(review): where this value was derived is not shown in this notebook.
weight1 = 0.16161616161616163

recs_df = recs.toPandas().drop_duplicates(subset='route_id')

# Key the two extra signals by route id. Combining them with recs_df by row
# position is only right when Spark returns rows in routes_df order and no
# duplicate was dropped above; neither is guaranteed, so look values up by
# route_id instead.
route_ids = routes_df['id'].values

# gb_predictions holds one prediction per entry of routes_df['id'], in order.
gb_by_route = pd.Series(np.ravel(gb_predictions), index=route_ids)
gb_by_route = gb_by_route[~gb_by_route.index.duplicated()]  # map() needs unique keys

# Review count scaled to [0, 1] by the most-reviewed route.
normalized_rating_count = routes_df['num_reviews'] / float(routes_df['num_reviews'].max())
popularity_by_route = pd.Series(normalized_rating_count.values, index=route_ids)
popularity_by_route = popularity_by_route[~popularity_by_route.index.duplicated()]

recs_df['weighted'] = (
    (ALS_WEIGHT * recs_df['prediction'])
    + (GB_WEIGHT * recs_df['route_id'].map(gb_by_route))
)
recs_df['weighted2'] = (
    recs_df['weighted']
    + (weight1 * recs_df['route_id'].map(popularity_by_route))
)

# Rows whose ALS prediction is NaN get a NaN score and sort to the bottom.
recs_df.sort_values('weighted2', ascending=False).head(6)

Unnamed: 0,route_id,user_id,prediction,weighted,weighted2
3370,3370,560,4.476171,4.060048,4.146356
1979,1979,560,4.476171,4.071746,4.086977
1022,1022,560,4.476171,4.079655,4.085578
595,595,560,4.476171,4.001966,4.04512
2539,2539,560,4.476171,4.02743,4.035891
3588,3588,560,4.150783,3.896087,3.900318
