In [1]:
from pyspark.mllib.linalg.distributed import RowMatrix

from pyspark.sql import Row
from pyspark.sql import SparkSession 

from pyspark.mllib.linalg import Vectors

import tqdm

from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.linalg import SparseVector

from scipy.sparse import find

import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Obtain (or reuse) the notebook-wide SparkSession: YARN master, Hive support
# enabled so that spark.sql() can read Hive tables.
spark = (
    SparkSession.builder
    .master('yarn')
    .enableHiveSupport()
    .getOrCreate()
)

#.master('yarn') \

In [3]:
# Numeric factor attached to each page type (presumably an importance weight).
DICT_W_FOR_PAGE_TYPE = {
    "Card": 3,
    "CardJK": 2,
    "Listing": 1,
    "ListingFavorites": 5,
}

# Numeric factor attached to each event type (presumably an importance weight).
DICT_W_FOR_EVENT_TYPE = {
    "card_show": 3,
    "phone_show": 10,
}

# One (page_type, event_type, value) row per combination, where value is the
# product of the two factors. The event type is the outer loop so rows come out
# grouped by event, with page types in dict insertion order.
data = [
    (page_type, event_type, int(page_factor * event_factor))
    for event_type, event_factor in DICT_W_FOR_EVENT_TYPE.items()
    for page_type, page_factor in DICT_W_FOR_PAGE_TYPE.items()
]

#spark.createDataFrame(data, ['page_type', 'event_type', 'value']).collect() 

In [4]:
# Turn the (page_type, event_type, value) rows into a DataFrame and register it
# as the temp view "dfdict" so it can be referenced by name from spark.sql().
dfdict = spark.createDataFrame(data, schema=['page_type', 'event_type', 'value'])
dfdict.createOrReplaceTempView("dfdict")

In [5]:
def compareRdd(x,y):
    """Return whichever of two records has the larger ``'offer_num'``.

    Both arguments are indexed with ``['offer_num']``. When the two values are
    equal (or ``x`` is not strictly greater) ``y`` is returned.
    """
    return x if x['offer_num'] > y['offer_num'] else y

In [6]:
def createVector(x,y):
    """Concatenate two operands into one new list.

    Each operand must be exactly a ``list`` (used as-is) or exactly a ``tuple``
    (treated as a single element and wrapped in a one-item list). ``x`` is
    validated before ``y``.

    Returns:
        A new list: the elements contributed by ``x`` followed by those of ``y``.

    Raises:
        BaseException: ``'Wrong type of x'`` / ``'Wrong type of y'`` when the
            corresponding operand is neither a list nor a tuple.
    """
    pieces = []
    for label, operand in (('x', x), ('y', y)):
        kind = type(operand)
        if kind is list:
            pieces.append(operand)
        elif kind is tuple:
            pieces.append([operand])
        else:
            raise BaseException('Wrong type of ' + label)
    return pieces[0] + pieces[1]

In [7]:
def sortVector(a):
    """Convert one ``(key, entries)`` record into ``(key, (indices, values))``.

    ``a[1]`` is either a single ``(index, value)`` tuple or a list of such
    tuples. For a list, the result holds every distinct index exactly once, in
    ascending order, paired with the largest value seen for that index.

    The previous pop-while-scanning loop never emitted the first entry, could
    emit the smaller of two duplicate values, returned nothing for an
    all-duplicate list and raised IndexError on a one-element list. Unlike that
    loop, this version does not modify ``a[1]``.

    Args:
        a: Two-item sequence ``(key, entries)``.

    Returns:
        ``(key, (indices, values))`` where ``indices`` and ``values`` are
        parallel lists; ``indices`` is strictly increasing.

    Raises:
        BaseException: if ``a[1]`` is neither exactly a list nor exactly a
            tuple (exception type and message kept as callers saw them before).
    """
    entries = a[1]
    if type(entries) is list:
        # Keep the largest value for every index; on a tie the first one stays.
        best = {}
        for entry in entries:
            index, value = entry[0], entry[1]
            if index not in best or best[index] < value:
                best[index] = value
        indices = sorted(best)
        values = [best[index] for index in indices]
        return (a[0], (indices, values))
    elif type(entries) is tuple:
        # A lone (index, value) pair: wrap each half in a one-item list.
        return (a[0], ([entries[0]], [entries[1]]))
    else:
        raise BaseException('Wrong type of a: ' + str(type(entries)))

In [8]:
# Build the (offer, user, value) interaction rows used by the rest of the notebook.
#   * subquery c: rows of prod.mles_sopr matched to the dfdict temp view on
#     page_type and event_type, which attaches the per-combination `value`.
#   * subquery b: at most 10000 distinct user_ids, numbered with row_number()
#     as user_num. The LIMIT has no ORDER BY in front of it, so which users are
#     kept is not pinned down by the query itself.
#   * subquery a: every distinct offer_id, numbered with row_number() as
#     offer_num (row_number() starts at 1).
#   * rows whose user_id equals 'noid' are dropped by the final WHERE; this
#     filter is applied after the 10000-user LIMIT.
# `select distinct` is over all five columns, so one (user, offer) pair can
# still appear with several different values.
# The result is converted to an RDD of Rows and repartitioned into 100 partitions.
rdd = spark.sql("""select distinct a.offer_num, a.offer_id, b.user_num, b.user_id, c.value
                    from (select user_id, offer_id, value
                            from dfdict as a, prod.mles_sopr as b 
                            where a.page_type = b.page_type and a.event_type = b.event_type) as c
                         INNER JOIN (select row_number() OVER (ORDER BY a.user_id) as user_num, a.user_id 
                            from (select distinct user_id from prod.mles_sopr limit 10000) as a) as b on c.user_id = b.user_id
                         LEFT JOIN (select row_number() OVER (ORDER BY a.offer_id) as offer_num, a.offer_id 
                            from (select distinct offer_id from prod.mles_sopr) as a) as a on c.offer_id = a.offer_id 
                         where b.user_id != 'noid'""") \
              .rdd.repartition(100)

In [1]:
# Largest offer_num present in rdd: reduce with compareRdd keeps the row with
# the bigger 'offer_num', then that field is read from the surviving row.
maxOfferNum = rdd.reduce(compareRdd)['offer_num']

NameError: name 'rdd' is not defined

In [None]:
# One sparse vector per user_num:
#   1. key every row by user_num with an (offer_num, value) pair,
#   2. merge all pairs of a user via createVector,
#   3. let sortVector turn them into parallel (indices, values) lists,
#   4. wrap those in a SparseVector of size maxOfferNum + 1 (the user key is
#      dropped at this point).
target = (
    rdd.map(lambda row: (row['user_num'], (row['offer_num'], row['value'])))
    .reduceByKey(createVector)
    .map(sortVector)
    .map(lambda keyed: Vectors.sparse(maxOfferNum + 1, keyed[1][0], keyed[1][1]))
)

In [None]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [None]:
# Number of vectors in `target`. count() is a Spark action, so this line
# triggers evaluation of the upstream pipeline.
targetSize = target.count()

In [None]:
# Draw a random sample of roughly SAMPLE_SIZE vectors from `target` for the
# k-means experiments below.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42

# Fraction of `target` to keep. Clamped to 1.0 so that a `target` smaller than
# SAMPLE_SIZE no longer produces a negative split weight, and guarded against
# an empty `target` (previously a ZeroDivisionError).
sampleFraction = min(1.0, SAMPLE_SIZE / float(targetSize)) if targetSize else 0.0

# randomSplit assigns each element independently, so the sample size is only
# approximately SAMPLE_SIZE. The explicit seed keeps the split repeatable for a
# given partitioning of `target`. X is reused by every KMeans.train /
# computeCost call, so it is cached to avoid recomputing the full lineage.
X = target.randomSplit([1.0 - sampleFraction, sampleFraction], seed=SAMPLE_SEED)[1].cache()

In [None]:
# Display how many vectors actually ended up in the sample X (the random split
# is probabilistic, so this is not necessarily exactly 500).
X.count()

In [None]:
# NOTE(review): MAX_K and STEP_K are not read anywhere in this cell; they are
# kept with their original values in case other cells rely on them.
MAX_K = 0
STEP_K = 100

# Number of cluster counts to try; trial i uses k = i * 10 + 1 (1, 11, ..., 91).
NUM_K_TRIALS = 10
# Fixed seed so that the k-means|| initialisation, and therefore the cost
# curve, is repeatable between runs.
KMEANS_SEED = 42

# cost[i] holds the value returned by computeCost for trial i.
cost = np.zeros(NUM_K_TRIALS)

with tqdm.tqdm(total=NUM_K_TRIALS) as progress:
    for i in range(NUM_K_TRIALS):
        k = i * 10 + 1
        model = KMeans.train(X,
                             k,
                             initializationMode="k-means||",
                             initializationSteps=5,
                             epsilon=1e-4,
                             seed=KMEANS_SEED)

        # Evaluated on the same sample X the model was trained on.
        cost[i] = model.computeCost(X)
        print(k, cost[i])
        progress.update(1)

In [None]:
# Plot the k-means cost against the number of clusters tried (1, 11, ..., 91).
fig = plt.figure(figsize=(12, 10))
axes = fig.gca()

axes.plot(1 + 10 * np.arange(10), cost)

axes.set_title('cost from num_klasters')
axes.set_xlabel('num_klasters')
axes.set_ylabel('cost')
axes.grid(True)

plt.show()