In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

import json
import ast

from pyspark.sql import SparkSession 

In [2]:
# Get the active Spark session, or create one, with the master set to YARN
# and Hive support enabled.
spark = (
    SparkSession.builder
    .master('yarn')
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Weight assigned to each page type.
DICT_W_FOR_PAGE_TYPE = {"Card": 3,
                        "CardJK": 2,
                        "Listing": 1,
                        "ListingFavorites": 5}

# Weight assigned to each event type.
DICT_W_FOR_EVENT_TYPE = {"card_show": 3,
                         "phone_show": 10}

# One lookup row per (page_type, event_type) combination; `value` is the
# product of the page-type weight and the event-type weight.
data = [
    Row(page_type=page_type, event_type=event_type, value=page_weight * event_weight)
    for event_type, event_weight in DICT_W_FOR_EVENT_TYPE.items()
    for page_type, page_weight in DICT_W_FOR_PAGE_TYPE.items()
]

# Build the DataFrame straight from the local list through the session.
# The previous `sc.parallelize(data)` relied on a global `sc` that this
# notebook never defines, which fails with NameError on a fresh kernel.
dfdict = spark.createDataFrame(data)

# Register the lookup table so it can be referenced by name from Spark SQL.
dfdict.createOrReplaceTempView("dfdict")

In [4]:
# NOTE(review): the commented-out line below looks like a leftover; `df_sopr`
# is not defined in the visible cells and the query reads `prod.mles_sopr`
# directly — confirm and delete.
#df_sopr.createOrReplaceTempView("df_sopr")

# Build the interaction table that is later passed to ALS.
#
# Result columns:
#   row_num   - sequential integer from row_number() over the distinct
#               user_id values of prod.mles_sopr, ordered by user_id
#               (used as the ALS user column further down)
#   user_id   - user_id as stored in prod.mles_sopr
#   timestamp - taken from prod.mles_sopr
#   offer_id  - taken from prod.mles_sopr
#   rating    - `value` from the `dfdict` temp view for the event's
#               (page_type, event_type) pair
#
# Both joins are inner joins written as comma joins with WHERE conditions, so
# events whose (page_type, event_type) pair has no row in `dfdict` do not
# appear in the result.
#
# NOTE(review): the window has ORDER BY but no PARTITION BY, which makes Spark
# evaluate row_number() in a single partition; this may be slow if the number
# of distinct users is large — confirm it is acceptable.
# NOTE(review): the alias `a` is reused at three nesting levels; each use is
# scoped to its own subquery, but it makes the query harder to read.
sqlDF = spark.sql("""select row_num, a.user_id as user_id, timestamp, offer_id, value as rating
                  from (select row_number() OVER (ORDER BY a.user_id) as row_num, a.user_id 
                            from (select distinct user_id from prod.mles_sopr) as a) as a, 
                       (select user_id, timestamp, offer_id, value
                            from dfdict as a, prod.mles_sopr as b 
                            where a.page_type = b.page_type and a.event_type = b.event_type) as b 
                  where a.user_id = b.user_id 
                  """)

In [8]:
# Fit an ALS recommendation model on the full interaction table `sqlDF`
# (no train/test split is made here, so there is no held-out data).
# coldStartStrategy="drop" removes rows with NaN predictions at transform time,
# so any later evaluation metric does not come out as NaN.
# A fixed seed is set so that repeated runs on the same data start from the
# same random factor initialisation and the fit is repeatable.
als = ALS(maxIter=5, regParam=0.01, userCol="row_num", itemCol="offer_id", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(sqlDF)