In [1]:
from __future__ import print_function

import argparse
import codecs
import logging
import time

import numpy as np
import tqdm

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.datasets.movielens import get_movielens
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

In [2]:
import json
import ast

from pyspark.sql import Row
from pyspark.sql import SparkSession 

In [3]:
# Obtain (or reuse) a Hive-enabled Spark session whose master is YARN.
spark = (
    SparkSession.builder
    .master('yarn')
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Load the prod.mles_sopr table as a DataFrame; later cells read the
# user_id, offer_id, page_type and event_type columns from it.
# To inspect the full schema, run: df_sopr.printSchema()
df_sopr = spark.read.table('prod.mles_sopr')

In [5]:
# Distinct (user_id, offer_id, page_type, event_type) combinations,
# exposed as an RDD of Row objects.
rdd_sopr = (
    df_sopr
    .select("user_id", "offer_id", 'page_type', 'event_type')
    .dropDuplicates()
    .rdd
)

In [6]:
# Interaction weights: the weight of an event is the product of the weight of
# the page it happened on and the weight of the event type itself.
DICT_W_FOR_PAGE_TYPE = {"Card" : 3,
                        "CardJK" : 2,
                        "Listing" : 1,
                        "ListingFavorites" : 5}

DICT_W_FOR_EVENT_TYPE = {"card_show" : 3,
                        "phone_show" : 10}

# One lookup row per (page_type, event_type) pair, generated from the two
# dictionaries above so that a new page or event type only has to be added in
# one place instead of in a hand-written list of Row(...) literals.
data = [Row(page_type=page_type, event_type=event_type, value=page_w * event_w)
        for event_type, event_w in DICT_W_FOR_EVENT_TYPE.items()
        for page_type, page_w in DICT_W_FOR_PAGE_TYPE.items()]

# Build the DataFrame straight from the local list through the `spark` session.
# The previous version went through `sc.parallelize`, but no `sc` is defined in
# the visible cells of this notebook, so it failed with NameError unless the
# environment happened to inject one.
dfdict = spark.createDataFrame(data)
# Register the lookup table under the name "dfdict" so SQL queries can join on it.
dfdict.createOrReplaceTempView("dfdict")

In [7]:
# Build the interaction table used for matrix factorisation.
#   * sub-query `a`: numbers every distinct offer_id (row_number ordered by
#     offer_id) -> offer_num
#   * sub-query `b`: numbers every distinct user_id the same way -> user_num
#   * sub-query `c`: events from prod.mles_sopr joined to the "dfdict" temp view
#     on (page_type, event_type), which attaches the interaction weight `value`
# The outer query joins `c` to `a` and `b` by id, drops rows whose user_id is
# 'noid', and keeps distinct rows.
# row_number() numbering starts at 1, so offer_num / user_num are 1-based.
# The 'noid' filter is applied after numbering, so if a 'noid' user exists it
# still occupies a user_num that never appears in the result.
# NOTE(review): both windows have ORDER BY without PARTITION BY; Spark normally
# evaluates such a window on a single partition - confirm this scales.
sqlDF = spark.sql("""select distinct a.offer_num, a.offer_id, b.user_num, b.user_id, c.timestamp, c.value
                    from (select row_number() OVER (ORDER BY a.offer_id) as offer_num, a.offer_id 
                            from (select distinct offer_id from prod.mles_sopr) as a) as a,
                         (select row_number() OVER (ORDER BY a.user_id) as user_num, a.user_id 
                            from (select distinct user_id from prod.mles_sopr) as a) as b,
                         (select user_id, timestamp, offer_id, value
                            from dfdict as a, prod.mles_sopr as b 
                            where a.page_type = b.page_type and a.event_type = b.event_type) as c
                         where c.offer_id = a.offer_id 
                               and c.user_id = b.user_id
                               and b.user_id != 'noid'""")

In [8]:
import datetime
# Weight contributed by the page on which the event happened.
DICT_W_FOR_PAGE_TYPE = {
    "Card": 3,
    "CardJK": 2,
    "Listing": 1,
    "ListingFavorites": 5,
}

# Weight contributed by the kind of event.
DICT_W_FOR_EVENT_TYPE = {
    "card_show": 3,
    "phone_show": 10,
}


# NOTE(review): the comment that used to sit here read "split into 9 parts by
# time", which does not describe the function below - presumably stale.
def lambdaForArr(x):
    """Map one event record to ``(user_id, [offer_id, weight])``.

    ``x`` must support lookup by the keys 'user_id', 'offer_id', 'page_type'
    and 'event_type'. The weight is the page-type weight multiplied by the
    event-type weight; a page or event type missing from the weight
    dictionaries raises KeyError.
    """
    user = x['user_id']
    offer = x['offer_id']
    weight = DICT_W_FOR_PAGE_TYPE[x['page_type']] * DICT_W_FOR_EVENT_TYPE[x['event_type']]
    return (user, [offer, weight])

In [None]:
# Materialise every row of sqlDF on the driver as a Python list of Rows.
# To experiment on a small random split instead, use:
#   arr = sqlDF.randomSplit([1, 500])[0].collect()
arr = sqlDF.collect()

In [None]:
# Peek at ten rows of the interaction table.
sqlDF.take(10)

In [None]:
# Spot-check: offer_num of the second collected row (index 1).
arr[1]['offer_num']

In [None]:
# Number of collected interaction rows.
len(arr)

In [None]:
import numpy as np
from scipy.sparse import csr_matrix

def get_mtrx(arr):
    """Build a sparse CSR matrix from a sequence of interaction records.

    Each record must support lookup by the keys 'user_num', 'offer_num' and
    'value'. 'user_num' is used as the row index, 'offer_num' as the column
    index and 'value' as the stored entry. The matrix shape is inferred by
    scipy from the largest indices seen. A tqdm progress bar is advanced once
    per record.
    """
    user_idx, offer_idx, weights = [], [], []

    with tqdm.tqdm(total=len(arr)) as bar:
        for rec in arr:
            user_idx.append(rec['user_num'])
            offer_idx.append(rec['offer_num'])
            weights.append(rec['value'])
            bar.update(1)

    # Coordinate-style constructor: (values, (row indices, column indices)).
    coords = (user_idx, offer_idx)
    return csr_matrix((weights, coords))

In [None]:
# Sparse matrix of interaction weights: rows indexed by user_num, columns by offer_num.
mtrx = get_mtrx(arr)

In [None]:
# ALS model from the implicit library, created with the library's default hyper-parameters.
model = AlternatingLeastSquares()

In [None]:
# Train on the matrix built by get_mtrx (rows = user_num, columns = offer_num).
# NOTE(review): the orientation fit() expects depends on the installed implicit
# version - releases before 0.5 appear to document an item-by-user matrix,
# 0.5+ a user-by-item matrix. Confirm the version and transpose if required.
model.fit(mtrx)