In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

import json
import ast

import tqdm

from pyspark.sql import SparkSession 

from datetime import datetime

import numpy as np

import random

MONTHTEST = 3
DAY = 2018

In [2]:
spark = SparkSession \
       .builder \
       .master('yarn') \
       .enableHiveSupport() \
       .getOrCreate()

In [3]:
DICT_W_FOR_PAGE_TYPE = {"Card" : 3,
                        "CardJK" : 2,
                        "Listing" : 1,
                        "ListingFavorites" : 5}

DICT_W_FOR_EVENT_TYPE = {"card_show" : 3,
                        "phone_show" : 10}


data = [Row(page_type='Card', event_type='card_show', value = DICT_W_FOR_PAGE_TYPE["Card"] * DICT_W_FOR_EVENT_TYPE["card_show"]), 
        Row(page_type='CardJK', event_type='card_show', value = DICT_W_FOR_PAGE_TYPE["CardJK"] * DICT_W_FOR_EVENT_TYPE["card_show"]), 
        Row(page_type='Listing', event_type='card_show', value = DICT_W_FOR_PAGE_TYPE["Listing"] * DICT_W_FOR_EVENT_TYPE["card_show"]), 
        Row(page_type='ListingFavorites', event_type='card_show', value = DICT_W_FOR_PAGE_TYPE["ListingFavorites"] * DICT_W_FOR_EVENT_TYPE["card_show"]), 
        Row(page_type='Card', event_type='phone_show', value = DICT_W_FOR_PAGE_TYPE["Card"] * DICT_W_FOR_EVENT_TYPE["phone_show"]), 
        Row(page_type='CardJK', event_type='phone_show', value = DICT_W_FOR_PAGE_TYPE["CardJK"] * DICT_W_FOR_EVENT_TYPE["phone_show"]), 
        Row(page_type='Listing', event_type='phone_show', value = DICT_W_FOR_PAGE_TYPE["Listing"] * DICT_W_FOR_EVENT_TYPE["phone_show"]), 
        Row(page_type='ListingFavorites', event_type='phone_show', value = DICT_W_FOR_PAGE_TYPE["ListingFavorites"] * DICT_W_FOR_EVENT_TYPE["phone_show"])] 

dfdict = spark.createDataFrame(sc.parallelize(data))
dfdict.createOrReplaceTempView("dfdict")

In [4]:
user_item = spark.sql("""select a.user_id, a.offer_id from 
                            (select user_id, offer_id
                                from prod.mles_sopr
                                where ptn_dadd between '2019-06-01' and '2019-06-21') as a
                        inner join 
                            (select user_id, offer_id
                                from prod.mles_sopr
                                where ptn_dadd between '2019-06-22' and '2019-07-03') as b
                        on a.user_id = b.user_id 
                        where a.user_id != 'noid'
                        limit 10000
                  """).repartition(100).createOrReplaceTempView("user_item")



#df_sopr.createOrReplaceTempView("df_sopr")

In [5]:
sqlDF = spark.sql("""select distinct a.offer_num, a.offer_id, b.user_num, b.user_id, c.value, ptn_dadd
                    from (select user_id, offer_id, value, ptn_dadd
                            from dfdict as a, prod.mles_sopr as b 
                            where a.page_type = b.page_type and a.event_type = b.event_type) as c
                         INNER JOIN (select row_number() OVER (ORDER BY a.user_id) as user_num, a.user_id 
                            from (select distinct user_id from user_item) as a) as b on c.user_id = b.user_id
                         INNER JOIN (select row_number() OVER (ORDER BY a.offer_id) as offer_num, a.offer_id 
                            from (select distinct offer_id from user_item) as a) as a on c.offer_id = a.offer_id 
                  """).repartition(100)

In [6]:
sqlDFtrain = sqlDF.where("ptn_dadd between '2019-06-01' and '2019-06-21'")

In [7]:
sqlDFtest = sqlDF.where("ptn_dadd between '2019-06-22' and '2019-07-03'")

In [8]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="user_num", itemCol="offer_num", ratingCol="value",
          coldStartStrategy="drop")
model = als.fit(sqlDFtrain)

In [9]:
# Generate top 10 movie recommendations for each user
k = 10

userRecs = model.recommendForAllUsers(k)

In [10]:
num_users = spark.sql("""select count(*) from (select distinct user_id from user_item)
                  """).collect()[0]['count(1)']

In [13]:
sum = 0
with tqdm.tqdm(total=20) as progress:
    for j in range(20):
        progress.update(1)
        user = int(random.random() * num_users)
        t = sqlDFtest.where("user_num = " + str(user)).select("offer_num").collect()
        viewed = [i['offer_num'] for i in t]
        if len(viewed) == 0:
            continue

        t = userRecs.where('user_num = ' + str(user)).collect()
        if len(t) == 0:
            continue

        t = t[0]['recommendations']
        recom = [i['offer_num'] for i in t]


        sum += len(np.intersect1d(recom, viewed))/k
        #print("    ", user, " sum =", sum)


100%|██████████| 20/20 [16:58<00:00, 51.68s/it]
