In [1]:
import findspark
import os
spark_home = r"C:\Program Files\spark-3.2.0-bin-hadoop3.2"
python_path = r"C:\Users\fywud\anaconda3\envs\pyspark_python\python"
findspark.init(spark_home,python_path)
import pyspark 
from pyspark import SparkContext, SparkConf

In [2]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.mllib.feature import Word2Vec
from pyspark.ml.linalg import Vectors
import random
from collections import defaultdict
import numpy as np
from pyspark.sql import functions as F

In [3]:
conf = SparkConf().setAppName('ctrModel').setMaster('local')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
class UdfFunction:
    @staticmethod
    def sortF(movie_list, timestamp_list):
        """
        sort by time and return the corresponding movie sequence
        eg:
            input: movie_list:[1,2,3]
                   timestamp_list:[1112486027,1212546032,1012486033]
            return [3,1,2]
        """
        pairs = []
        for m, t in zip(movie_list, timestamp_list):
            pairs.append((m, t))
        # sort by time
        pairs = sorted(pairs, key=lambda x: x[1])
        return [x[0] for x in pairs]


def processItemSequence(spark, rawSampleDataPath):
    # rating data
    ratingSamples = spark.read.format("csv").option("header", "true").load(rawSampleDataPath)
    # ratingSamples.show(5)
    # ratingSamples.printSchema()
    sortUdf = udf(UdfFunction.sortF, ArrayType(StringType()))
    userSeq = ratingSamples \
        .where(F.col("rating") >= 3.5) \
        .groupBy("userId") \
        .agg(sortUdf(F.collect_list("movieId"), F.collect_list("timestamp")).alias('movieIds')) \
        .withColumn("movieIdStr", array_join(F.col("movieIds"), " "))
    # userSeq.select("userId", "movieIdStr").show(10, truncate = False)
    return userSeq.select('movieIdStr').rdd.map(lambda x: x[0].split(' '))

def trainItem2vec(spark, samples, embLength, embOutputPath, saveToRedis, redisKeyPrefix):
    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(5).setNumIterations(10)
    model = word2vec.fit(samples)
    synonyms = model.findSynonyms("158", 20)
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)
    embOutputDir = '/'.join(embOutputPath.split('/')[:-1])
    if not os.path.exists(embOutputDir):
        os.makedirs(embOutputDir)
    with open(embOutputPath, 'w') as f:
        for movie_id in model.getVectors():
            vectors = " ".join([str(emb) for emb in model.getVectors()[movie_id]])
            f.write(movie_id + ":" + vectors + "\n")
    embeddingLSH(spark, model.getVectors())
    return model

def embeddingLSH(spark, movieEmbMap):
    movieEmbSeq = []
    for key, embedding_list in movieEmbMap.items():
        embedding_list = [np.float64(embedding) for embedding in embedding_list]
        movieEmbSeq.append((key, Vectors.dense(embedding_list)))
    movieEmbDF = spark.createDataFrame(movieEmbSeq).toDF("movieId", "emb")
    bucketProjectionLSH = BucketedRandomProjectionLSH(inputCol="emb", outputCol="bucketId", bucketLength=0.1,
                                                      numHashTables=3)
    bucketModel = bucketProjectionLSH.fit(movieEmbDF)
    embBucketResult = bucketModel.transform(movieEmbDF)
    print("movieId, emb, bucketId schema:")
    embBucketResult.printSchema()
    print("movieId, emb, bucketId data result:")
    embBucketResult.show(10, truncate=False)
    print("Approximately searching for 5 nearest neighbors of the sample embedding:")
    sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174, -0.839, -0.0633, 0.249, 0.673, -0.237)
    bucketModel.approxNearestNeighbors(movieEmbDF, sampleEmb, 5).show(truncate=False)

In [5]:
file_path = r'C:\Working\GitHub\SparrowRecSys\src\main\resources'
rawSampleDataPath = file_path + "/webroot/sampledata/ratings.csv"
embLength = 10


In [6]:
ratingSamples = spark.read.format("csv").option("header", "true").load(rawSampleDataPath)
ratingSamples.show(10)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
|     1|    112|   3.5|1094785740|
|     1|    151|   4.0|1094785734|
|     1|    223|   4.0|1112485573|
|     1|    253|   4.0|1112484940|
|     1|    260|   4.0|1112484826|
+------+-------+------+----------+
only showing top 10 rows



In [7]:
class UdfFunction:
    @staticmethod
    def sortF(movie_list, timestamp_list):
        """
        sort by time and return the corresponding movie sequence
        eg:
            input: movie_list:[1,2,3]
                   timestamp_list:[1112486027,1212546032,1012486033]
            return [3,1,2]
        """
        pairs = []
        for m, t in zip(movie_list, timestamp_list):
            pairs.append((m, t))
        # sort by time
        pairs = sorted(pairs, key=lambda x: x[1])
        return [x[0] for x in pairs]

In [8]:
sortUdf = udf(UdfFunction.sortF, ArrayType(StringType()))
userSeq = ratingSamples \
        .where(F.col("rating") >= 3.5) \
        .groupBy("userId") \
        .agg(sortUdf(F.collect_list("movieId"), F.collect_list("timestamp")).alias('movieIds')) \
        .withColumn("movieIdStr", array_join(F.col("movieIds"), " "))
userSeq.show(10)
samples = userSeq.select('movieIdStr').rdd.map(lambda x: x[0].split(' '))
samples.take(10)

+------+--------------------+--------------------+
|userId|            movieIds|          movieIdStr|
+------+--------------------+--------------------+
|     1|[924, 919, 337, 1...|924 919 337 151 1...|
|    10|[527, 912, 260, 8...|527 912 260 858 9...|
|   100|[296, 593, 288, 5...|296 593 288 50 35...|
|  1000|               [145]|                 145|
| 10000|[150, 380, 592, 1...|150 380 592 165 3...|
| 10001|[296, 858, 608, 5...|296 858 608 593 7...|
| 10002|[150, 296, 380, 5...|150 296 380 590 3...|
| 10003|[17, 141, 58, 640...|17 141 58 640 26 ...|
| 10004|[153, 316, 593, 1...|153 316 593 110 1...|
| 10005|[296, 457, 316, 3...|296 457 316 356 4...|
+------+--------------------+--------------------+
only showing top 10 rows



[['924',
  '919',
  '337',
  '151',
  '112',
  '50',
  '541',
  '593',
  '29',
  '293',
  '47',
  '296',
  '318',
  '32',
  '260',
  '253',
  '589',
  '223',
  '367',
  '2'],
 ['527', '912', '260', '858', '969', '11', '25', '1'],
 ['296',
  '593',
  '288',
  '50',
  '356',
  '235',
  '293',
  '265',
  '527',
  '25',
  '223',
  '608',
  '342',
  '610',
  '162',
  '708',
  '535',
  '538',
  '673',
  '680',
  '728',
  '562',
  '260',
  '778'],
 ['145'],
 ['150',
  '380',
  '592',
  '165',
  '344',
  '588',
  '231',
  '356',
  '595',
  '480',
  '10',
  '185',
  '339',
  '377',
  '364',
  '500',
  '34',
  '587',
  '597',
  '19',
  '539',
  '39',
  '266',
  '317',
  '11',
  '2',
  '1',
  '508',
  '736',
  '780',
  '62',
  '196',
  '252',
  '368',
  '158',
  '261',
  '648',
  '370',
  '48',
  '594',
  '520',
  '31',
  '277',
  '468',
  '5',
  '708',
  '7',
  '27',
  '616',
  '361',
  '46',
  '237',
  '107',
  '289',
  '585',
  '719',
  '104',
  '351',
  '552',
  '637',
  '135',
  '248',
  '94

In [9]:
word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(5).setNumIterations(10)
model = word2vec.fit(samples)
synonyms = model.findSynonyms("158", 20)
for synonym, cosineSimilarity in synonyms:
    print(synonym, cosineSimilarity)

256 0.9552728533744812
48 0.9530795812606812
186 0.9236512780189514
31 0.9094273447990417
252 0.866692304611206
277 0.8644675612449646
355 0.8633851408958435
168 0.8555343151092529
276 0.8513758778572083
552 0.8314193487167358
236 0.8073710799217224
520 0.7977898120880127
432 0.7975228428840637
455 0.7934821248054504
44 0.783719003200531
169 0.7653695344924927
2 0.765035092830658
333 0.7566123008728027
237 0.7321105003356934
368 0.7295622825622559


In [10]:
embOutputDir =  r'C:\Working\GitHub\SparrowRecSys\src\main\resources' + r"/webroot/modeldata2"
if not os.path.exists(embOutputDir):
    os.makedirs(embOutputDir)
model.getVectors()

{'710': [1.1889405, -0.7620199, -0.815866, -1.3322642, -1.4549716, -0.79412335, -0.7304687, -0.93377864, 0.03819812, -0.047834165], '205': [0.193592, -0.51778346, -0.21647744, 0.105278276, -0.48911607, -0.07111606, 0.19604881, -0.8898711, 0.5834694, -0.25911775], '45': [-0.21414319, -0.51845336, -0.25278765, 0.47342923, -0.20595376, 0.49308327, 0.30135408, -0.48917723, 0.11762793, -0.543826], '515': [-0.13251178, -0.25639167, -0.016346203, 1.0205768, 0.11816909, 0.12240034, 0.2226877, -0.7096785, -0.049481366, -0.46599394], '574': [0.060444687, -0.6409353, -0.099303454, -0.15613183, -0.4736273, 0.5575293, 0.22735983, -0.8689255, 0.20649548, -0.29318124], '858': [-0.160359, 0.29165512, 0.21874523, -0.30082446, -0.21685793, -0.06856815, -0.0027046874, -0.23580065, -0.6988469, 0.18436927], '619': [0.6916597, -0.50561684, -0.43609518, -0.6458255, -0.95440125, -0.56582767, -0.42994735, -0.65652055, 0.043810386, -0.16419798], '507': [-0.07992124, -0.30779102, -0.39863032, 0.24702363, -0.1443

In [11]:
embOutputPath = r'C:\Working\GitHub\SparrowRecSys\src\main\resources' + r"/webroot/modeldata2/item2vecEmb.csv"
with open(embOutputPath, 'w') as f:
    for movie_id in model.getVectors():
        vectors = " ".join([str(emb) for emb in model.getVectors()[movie_id]])
        f.write(movie_id + ":" + vectors + "\n")

In [12]:
movieEmbSeq = []
movieEmbMap = model.getVectors()
for key, embedding_list in movieEmbMap.items():
    embedding_list = [np.float64(embedding) for embedding in embedding_list]
    movieEmbSeq.append((key, Vectors.dense(embedding_list)))
movieEmbSeq

[('710',
  DenseVector([1.1889, -0.762, -0.8159, -1.3323, -1.455, -0.7941, -0.7305, -0.9338, 0.0382, -0.0478])),
 ('205',
  DenseVector([0.1936, -0.5178, -0.2165, 0.1053, -0.4891, -0.0711, 0.196, -0.8899, 0.5835, -0.2591])),
 ('45',
  DenseVector([-0.2141, -0.5185, -0.2528, 0.4734, -0.206, 0.4931, 0.3014, -0.4892, 0.1176, -0.5438])),
 ('515',
  DenseVector([-0.1325, -0.2564, -0.0163, 1.0206, 0.1182, 0.1224, 0.2227, -0.7097, -0.0495, -0.466])),
 ('574',
  DenseVector([0.0604, -0.6409, -0.0993, -0.1561, -0.4736, 0.5575, 0.2274, -0.8689, 0.2065, -0.2932])),
 ('858',
  DenseVector([-0.1604, 0.2917, 0.2187, -0.3008, -0.2169, -0.0686, -0.0027, -0.2358, -0.6988, 0.1844])),
 ('619',
  DenseVector([0.6917, -0.5056, -0.4361, -0.6458, -0.9544, -0.5658, -0.4299, -0.6565, 0.0438, -0.1642])),
 ('507',
  DenseVector([-0.0799, -0.3078, -0.3986, 0.247, -0.1443, 0.3723, -0.0589, -0.1946, 0.289, 0.385])),
 ('113',
  DenseVector([0.974, -0.3896, -0.1727, -0.3336, -1.1538, -0.7446, -0.6998, -1.0031, 0.2095

In [13]:
movieEmbDF = spark.createDataFrame(movieEmbSeq).toDF("movieId", "emb")
bucketProjectionLSH = BucketedRandomProjectionLSH(inputCol="emb", outputCol="bucketId", bucketLength=0.1,
                                                  numHashTables=3)
bucketModel = bucketProjectionLSH.fit(movieEmbDF)
embBucketResult = bucketModel.transform(movieEmbDF)
print("movieId, emb, bucketId schema:")
embBucketResult.printSchema()
print("movieId, emb, bucketId data result:")
embBucketResult.show(10, truncate=False)
print("Approximately searching for 5 nearest neighbors of the sample embedding:")
sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174, -0.839, -0.0633, 0.249, 0.673, -0.237)
bucketModel.approxNearestNeighbors(movieEmbDF, sampleEmb, 5).show(truncate=False)

movieId, emb, bucketId schema:
root
 |-- movieId: string (nullable = true)
 |-- emb: vector (nullable = true)
 |-- bucketId: array (nullable = true)
 |    |-- element: vector (containsNull = true)

movieId, emb, bucketId data result:
+-------+----------------------------------------------------------------------------------------------------------------------+------------------------+
|movieId|emb                                                                                                                   |bucketId                |
+-------+----------------------------------------------------------------------------------------------------------------------+------------------------+
|710    |[1.1889405,-0.7620199,-0.815866,-1.3322642,-1.4549716,-0.79412335,-0.7304687,-0.93377864,0.03819812,-0.047834165]     |[[4.0], [2.0], [-17.0]] |
|205    |[0.193592,-0.51778346,-0.21647744,0.105278276,-0.48911607,-0.07111606,0.19604881,-0.8898711,0.5834694,-0.25911775]    |[[-2.0], [7.0], [-6.0]



+-------+------------------------------------------------------------------------------------------------------------------+----------------------+------------------+
|movieId|emb                                                                                                               |bucketId              |distCol           |
+-------+------------------------------------------------------------------------------------------------------------------+----------------------+------------------+
|17     |[0.23109004,0.5215586,0.70344085,0.7726257,-0.17208754,0.048366632,-0.13422923,-0.48887217,0.26885578,-0.35862553]|[[3.0], [3.0], [6.0]] |1.4615353353021405|
|105    |[0.30033252,0.20595297,-0.21237604,0.8982644,-0.09205772,0.0683547,0.028714016,-0.23991285,0.173825,-0.63341165]  |[[-1.0], [0.0], [6.0]]|1.926736939951792 |
|350    |[0.56325704,0.5609271,-0.40829957,0.45993444,0.34602553,0.28407016,-0.32575956,-0.18179427,0.39605764,-0.60833174]|[[-3.0], [0.0], [6.0]]|2.074120332100455 

In [14]:
usrEmbOutputPath=r'C:\Working\GitHub\SparrowRecSys\src\main\resources' + r"/webroot/modeldata2/userEmb.csv"
Vectors_list = []
for key, value in model.getVectors().items():
    Vectors_list.append((key, list(value)))
fields = [
    StructField('movieId', StringType(), False),
    StructField('emb', ArrayType(FloatType()), False)
]
schema = StructType(fields)
Vectors_df = spark.createDataFrame(Vectors_list, schema=schema)


In [15]:
ratingSamples = ratingSamples.join(Vectors_df, on='movieId', how='inner')
ratingSamples.show(10)

+-------+------+------+----------+--------------------+
|movieId|userId|rating| timestamp|                 emb|
+-------+------+------+----------+--------------------+
|    296|     1|   4.0|1112484767|[-0.5079685, 0.26...|
|    296|     8|   5.0| 833973081|[-0.5079685, 0.26...|
|    296|    11|   3.5|1230858799|[-0.5079685, 0.26...|
|    296|    13|   5.0| 849082366|[-0.5079685, 0.26...|
|    296|    15|   3.0| 840206642|[-0.5079685, 0.26...|
|    296|    18|   4.0|1195573677|[-0.5079685, 0.26...|
|    296|    21|   5.0| 992188845|[-0.5079685, 0.26...|
|    296|    22|   5.0| 994638043|[-0.5079685, 0.26...|
|    296|    23|   5.0| 914457789|[-0.5079685, 0.26...|
|    296|    24|   5.0| 994071115|[-0.5079685, 0.26...|
+-------+------+------+----------+--------------------+
only showing top 10 rows



In [16]:
result = ratingSamples.select('userId', 'emb').rdd.map(lambda x: (x[0], x[1])) \
    .reduceByKey(lambda a, b: [a[i] + b[i] for i in range(len(a))]).collect()

In [17]:
with open(usrEmbOutputPath, 'w') as f:
    for row in result:
        vectors = " ".join([str(emb) for emb in row[1]])
        f.write(row[0] + ":" + vectors + "\n")