In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('recommender').getOrCreate()
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [None]:
# RS with pyspark

In [2]:
import pandas as pd
#pandas dataframe
reviews = pd.read_csv('data/upload/all_reviews_through_SA_class.csv')
#split the dataframe
datatest, datatrain  = reviews.sample(frac =.2), reviews.sample(frac=.8)
#convert to pyspark dataframe
DataTrain = spark.createDataFrame(datatrain)
DataTest = spark.createDataFrame(datatest)

In [3]:
def RS(thecolumn, datatrain, datatest):
    als = ALS(userCol='userId', itemCol='itemId', 
              ratingCol=thecolumn, nonnegative=True,
              )
    #Tune model using ParamGridBuilder
    param_grid = ParamGridBuilder()\
                .addGrid(als.rank,[10,11,12])\
                .addGrid(als.maxIter,[22,21,20])\
                .addGrid(als.regParam, [.07, .05, .1])\
                .build()
    #Define evaluator as RMSE
    evaluator_ = RegressionEvaluator(metricName="rmse",labelCol=thecolumn,
                                   predictionCol = "prediction")
    #Build cross validation using TrainvalidationSplit
    tvs = TrainValidationSplit(
        estimator = als,
        estimatorParamMaps = param_grid,
        evaluator=evaluator_)
    #Fit ALS model to training data
    model = tvs.fit(datatrain)
    #Extract best model from the tuning exercise using ParamGridBuilder
    bmodel = model.bestModel
    # Generate Predictions and evaluate using RMSE
    predictions = bmodel.transform(datatest)
    predictions = predictions.na.drop()
    rank = bmodel.rank
    rmse = evaluator_.evaluate(predictions)
    mIter = bmodel._java_obj.parent().getMaxIter()
    rParam = bmodel._java_obj.parent().getRegParam()
    return (predictions, bmodel, rmse, rank, mIter, rParam)

In [4]:
prediction_rating, bmodel_rat, rmse_rat, rank_rat, mIter_rat, rParam_rat = RS('rating', 
                                                                              DataTrain, DataTest)

In [5]:
prediction_tes, bmodel_tes, rmse_tes, rank_tes, mIter_tes, rParam_tes  = RS('testimony_class', 
                                                                            DataTrain, DataTest)

In [6]:
print("RMSE of prediction rating is " + str(round(rmse_rat, 3)))
print("best model :")
print("rank = " + str(rank_rat))
print("maxIter = " + str(mIter_rat))
print("regParam = " + str(rParam_rat))
print("RMSE of prediction testimony class is " + str(round(rmse_tes, 3)))
print("best model :")
print("rank = " + str(rank_tes))
print("maxIter = " + str(mIter_tes))
print("regParam = " + str(rParam_tes))

RMSE of prediction rating is 0.325
best model :
rank = 10
maxIter = 22
regParam = 0.07
RMSE of prediction testimony class is 0.267
best model :
rank = 10
maxIter = 22
regParam = 0.07


In [7]:
user_recs_tc_top5 = bmodel_tes.recommendForAllUsers(5)
user_recs_rat_top5 = bmodel_rat.recommendForAllUsers(5)

In [8]:
user_recs_tc_top5.show(3)
user_recs_rat_top5.show(3)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  1580|[[18, 3.1919055],...|
|   471|[[39, 3.2942615],...|
|  1591|[[18, 3.3876348],...|
+------+--------------------+
only showing top 3 rows

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  1580|[[3, 5.130658], [...|
|   471|[[2, 6.361953], [...|
|  1591|[[2, 4.6105685], ...|
+------+--------------------+
only showing top 3 rows



In [33]:
user_recs_tc_top5.select(['recommendations']).filter(
    user_recs_tc_top5.userId == 1580).first()

Row(recommendations=[Row(itemId=18, rating=3.1919054985046387), Row(itemId=16, rating=2.926283597946167), Row(itemId=17, rating=2.884603261947632), Row(itemId=35, rating=2.770015239715576), Row(itemId=7, rating=2.668339967727661)])

In [34]:
user_recs_rat_top5.select(['recommendations']).filter(
    user_recs_rat_top5.userId == 1580).first()

Row(recommendations=[Row(itemId=3, rating=5.130658149719238), Row(itemId=13, rating=4.974915981292725), Row(itemId=18, rating=4.963583469390869), Row(itemId=2, rating=4.9153289794921875), Row(itemId=16, rating=4.903608798980713)])

In [11]:
import numpy as np

In [35]:
TopRecBasedOnTestimony = pd.DataFrame(np.array([[18, 5], [16, 4], [17,3], [35,2], [7,1]]),
                                              columns=['itemId', 'points_tes'])
TopRecBasedOnRating= pd.DataFrame(np.array([[3, 5], [13, 4], [18,3], [2,2], [16,1]]),
                                              columns=['itemId', 'points_rat'])

In [36]:
topRecommendations = pd.merge(
    TopRecBasedOnTestimony, TopRecBasedOnRating,
    on='itemId', how='outer').fillna(0)

In [37]:
# calculate the result
topRecommendations['Result'] = topRecommendations.apply(
    lambda row: (row['points_rat']+row['points_tes'])/2,
    axis=1
)
topRecommendations = topRecommendations.sort_values(by=['Result'],ascending=[0])
topRecommendations

Unnamed: 0,itemId,points_tes,points_rat,Result
0,18,5.0,3.0,4.0
1,16,4.0,1.0,2.5
5,3,0.0,5.0,2.5
6,13,0.0,4.0,2.0
2,17,3.0,0.0,1.5
3,35,2.0,0.0,1.0
7,2,0.0,2.0,1.0
4,7,1.0,0.0,0.5


In [38]:
item = pd.read_csv('data/modal/all_item.csv')

In [39]:
rec_item_df = item.loc[item.itemId.isin(topRecommendations.itemId)]

In [40]:
topRecommendationsTitle= pd.merge(
    topRecommendations, rec_item_df,
    on='itemId', how='outer').fillna(0)
topRecommendationsTitle

Unnamed: 0,itemId,points_tes,points_rat,Result,itemname,Category
0,18,5.0,3.0,4.0,Kindle Oasis E-reader with Leather Charging Co...,Kindle
1,16,4.0,1.0,2.5,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",Tablet
2,3,0.0,5.0,2.5,Amazon 9W PowerFast Official OEM USB Charger a...,Adapter
3,13,0.0,4.0,2.0,"All-New Fire HD 8 Tablet, 8"" HD Display, Wi-Fi...",Tablet
4,17,3.0,0.0,1.5,Kindle Oasis E-reader with Leather Charging Co...,Kindle
5,35,2.0,0.0,1.0,Roll over image to zoom in Ultimate Ears BOOM ...,Speaker
6,2,0.0,2.0,1.0,Amazon Fire TV with 4K Ultra HD and Alexa Voic...,TV
7,7,1.0,0.0,0.5,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",Tablet


In [42]:
print("Top recommendation for user", 1580)
topRecommendationsTitle[['itemId', 'itemname','Category']].head(5)


Top recommendation for user 1580


Unnamed: 0,itemId,itemname,Category
0,18,Kindle Oasis E-reader with Leather Charging Co...,Kindle
1,16,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",Tablet
2,3,Amazon 9W PowerFast Official OEM USB Charger a...,Adapter
3,13,"All-New Fire HD 8 Tablet, 8"" HD Display, Wi-Fi...",Tablet
4,17,Kindle Oasis E-reader with Leather Charging Co...,Kindle
