In [20]:
import os

from pymongo import MongoClient
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from ggplot import *
from sklearn.externals import joblib

In [7]:
import pyspark
from pyspark.sql.types import *
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator

In [8]:
# Reuse the active Spark session if there is one, otherwise start it,
# and keep a handle on the session's SparkContext as well
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()
sc = spark.sparkContext
(spark, sc)

(<pyspark.sql.session.SparkSession at 0x7f8b5f4167d0>,
 <pyspark.context.SparkContext at 0x7f8b8c7cc8d0>)

In [11]:
# Read the tab-separated route and user tables, discarding the
# 'Unnamed: 0' column from each of them
routes_df, users_df = [
    pd.read_csv(csv_name, sep='\t').drop('Unnamed: 0', axis=1)
    for csv_name in ("routes_df.csv", "users_df.csv")
]

In [16]:
# Reference user: blake herrington, picked because he's an author of climbing books
is_reference_user = users_df['name'] == 'blakeherrington'
# Keep only the feature columns: renumber the rows from 0 and drop the name/id identifiers
user_info = (
    users_df[is_reference_user]
    .reset_index(drop=True)
    .drop(['name', 'id'], axis=1)
)
user_info.head()

Unnamed: 0,age,compliments,likes_gym,likes_sport,likes_tr,likes_trad,member_since,point_rank,total_points,aid_follows_ C0,...,state_ or,state_ pa,state_ ri,state_ tx,state_ ut,state_ va,state_ wa,state_ wy,male,female
0,30.0,0,1,1,1,1,2006-12-12,691.0,990,0,...,0,0,0,0,0,0,0,0,1,0


## ALS model predictions

In [14]:
# User id the ALS model is asked to score routes for.
# NOTE(review): presumably this is blakeherrington's `id` in users_df — confirm; the `id`
# column is dropped from user_info, so it cannot be cross-checked from that frame.
als_user_id = 560

# One (route_id, user_id) row per route, so the recommender scores every route for this user.
# The scalar user id is broadcast over the route index by the DataFrame constructor, which
# replaces the `0 * routes_df['id'] + 560` arithmetic trick (that trick also made user_id
# inherit the dtype and any missing values of the route id column).
data = pd.DataFrame(
    {'route_id': routes_df['id'], 'user_id': als_user_id},
    columns=['route_id', 'user_id'],
)
one_row_spark_df = spark.createDataFrame(data)

In [15]:
# Location of the persisted ALS model. The default is the original absolute path, which only
# exists on the author's machine; set the ALS_MODEL_PATH environment variable to load the
# model from somewhere else without editing this cell.
path = os.environ.get(
    'ALS_MODEL_PATH',
    '/home/david/work/project/Rock-Climbing-Route-Recommender/src/alsmodel2',
)
recommender = ALSModel.load(path)

In [17]:
# Run the loaded ALS model over every (route_id, user_id) pair in one_row_spark_df;
# the result carries an added `prediction` column. Then print the first 20 rows.
# NOTE(review): the NaN predictions presumably belong to routes the model never saw
# during training — confirm against the model's cold-start setting.
recs = recommender.transform(one_row_spark_df)
recs.show(n=20)

+--------+-------+----------+
|route_id|user_id|prediction|
+--------+-------+----------+
|       0|    560|0.66706514|
|       1|    560|0.66706514|
|       2|    560| 1.0090904|
|       3|    560|  2.061585|
|       4|    560| 1.3083812|
|       5|    560| 1.7657901|
|       6|    560| 1.4877522|
|       7|    560| 2.1482205|
|       8|    560| 0.9607993|
|       9|    560| 3.2223313|
|      10|    560| 1.4838066|
|      11|    560|  2.020525|
|      12|    560|  1.420278|
|      13|    560|       NaN|
|      14|    560| 1.2512287|
|      15|    560| 2.1482205|
|      16|    560| 1.5004058|
|      17|    560| 1.7346239|
|      18|    560| 2.7071037|
|      19|    560|       NaN|
+--------+-------+----------+
only showing top 20 rows



## Gradient Boosting Predictions

In [22]:
# Load the saved model used for the gradient boosting predictions below.
# NOTE(review): joblib files are pickles — only load them from a trusted source.
gb_model_file = '../pickle/gb_model.pkl'
gb = joblib.load(gb_model_file)

In [23]:
# Predict the reference user's rating for every route with the gradient boosting model.
#
# The whole feature matrix is built once and the model is called once, instead of filtering
# routes_df and calling predict() separately for each route (quadratic in the number of routes).

# Every route is paired with the same single user row; fail loudly if that assumption is broken.
if len(user_info) != 1:
    raise ValueError(
        'expected exactly one row in user_info, found %d' % len(user_info))

# For each row of routes_df (in order): the features of the first route carrying that id,
# without the id column. The per-route loop this replaces also used the first match per id.
route_features = (
    routes_df.drop_duplicates(subset='id')
    .set_index('id')
    .loc[routes_df['id'].values]
    .reset_index(drop=True)
)

# The single user row repeated once per route.
user_features = user_info.iloc[[0] * len(route_features)].reset_index(drop=True)

# User columns first, then route columns (the column order the per-route join produced);
# member_since is removed before predicting, as before.
gb_features = pd.concat([user_features, route_features], axis=1).drop('member_since', axis=1)

# Stored as a list of one-element arrays, one per route, so that
# pd.DataFrame(gb_predictions)[0] keeps yielding the per-route predictions.
if len(gb_features):
    gb_predictions = list(gb.predict(gb_features).reshape(-1, 1))
else:
    gb_predictions = []

In [25]:
# Blend the ALS and gradient boosting predictions, then add a popularity bonus.

# Relative weight of the ALS prediction vs. the gradient boosting prediction.
als_weight = 0.8
gb_weight = 0.2
# weight based on model
# NOTE(review): presumably picked by an earlier tuning run that is not shown here — confirm.
weight1 = 0.16161616161616163

# Review count of each route, scaled by the largest review count in routes_df.
normalized_rating_count = routes_df['num_reviews'] / float(routes_df['num_reviews'].max())

# Per-route values keyed by route id. gb_predictions and normalized_rating_count are both in
# routes_df row order, whereas the rows coming back from Spark are in no guaranteed order and
# lose rows in drop_duplicates. Combining them by index label or by position can therefore pair
# a route with another route's values (or fail on a length mismatch); looking them up by
# route_id cannot.
route_scores = pd.DataFrame({
    'route_id': routes_df['id'].values,
    'gb_prediction': pd.DataFrame(gb_predictions)[0].values,
    'rating_count': normalized_rating_count.values,
}).drop_duplicates(subset='route_id').set_index('route_id')

# .copy() so the column assignments below act on an independent frame.
recs_df = recs.toPandas().drop_duplicates(subset='route_id').copy()
recs_df['weighted'] = (
    (als_weight * recs_df['prediction'])
    + (gb_weight * recs_df['route_id'].map(route_scores['gb_prediction']))
)
recs_df['weighted2'] = (
    recs_df['weighted']
    + (weight1 * recs_df['route_id'].map(route_scores['rating_count']))
)
recs_df.sort_values('weighted2', ascending=False).head(6)

Unnamed: 0,route_id,user_id,prediction,weighted,weighted2
3369,3369,560,4.296441,4.111942,4.380576
595,595,560,4.296441,3.931984,4.066301
3390,3390,560,3.834976,3.746004,4.014638
1978,1978,560,4.296441,3.954001,4.001407
1384,1384,560,4.146386,3.871374,3.987256
2538,2538,560,4.296441,3.957066,3.983403
