# Embedding

## Data Preparation
### 1. Download sample data

In [1]:
import requests

def download_file(url, save_path, timeout=30):
    """Download ``url`` and write the response body to ``save_path``.

    Args:
        url: Address fetched with an HTTP GET request.
        save_path: Local file path; opened in binary write mode, so an
            existing file at that path is overwritten.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` so a
            stalled connection cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk so an error page is never saved as data.
    response.raise_for_status()
    with open(save_path, 'wb') as file:
        file.write(response.content)

# Where the sample ratings live remotely and where they are stored locally.
save_path = 'ratings.csv'
url = 'https://raw.githubusercontent.com/dream-365/SparrowRecSys/master/src/main/resources/webroot/sampledata/ratings.csv'

# Fetch the CSV into the working directory.
download_file(url=url, save_path=save_path)

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
import Embedding

# Inputs shared by the embedding cells below: the ratings file and the
# embedding length handed to both training calls.
embLength = 10
rawSampleDataPath = "ratings.csv"

# Spark session configured with master 'local[4]' and app name 'ctrModel';
# getOrCreate() reuses an already-running session if there is one.
conf = SparkConf().setMaster('local[4]').setAppName('ctrModel')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Item Embedding

In [3]:
# Build the item sequences from the ratings file through the project's Embedding module.
# NOTE(review): presumably one sequence of movie ids per user — confirm in Embedding.processItemSequence.
user_watching_seqs = Embedding.processItemSequence(spark, rawSampleDataPath)

# Preview: fetch the first 100 sequences and render them as a pandas table.
pd.DataFrame(user_watching_seqs.take(100))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,150,380,592,165,344,588,231,356,595,480,...,,,,,,,,,,
1,145,296,593,318,110,457,380,589,527,780,...,,,,,,,,,,
2,919,356,527,593,47,,,,,,...,,,,,,,,,,
3,555,318,593,356,296,110,778,50,858,527,...,,,,,,,,,,
4,842,296,413,368,267,260,145,593,165,377,...,410,252,440,161,367,858,18,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,736,648,62,608,733,260,786,79,628,671,...,,,,,,,,,,
96,900,922,942,594,899,914,918,919,938,945,...,,,,,,,,,,
97,661,107,60,1,919,223,260,899,480,592,...,,,,,,,,,,
98,29,912,593,592,500,50,36,21,,,...,,,,,,,,,,


### Item2Vec

In [4]:
# Fit the Item2Vec model on the watch sequences using the shared embedding length.
# NOTE(review): presumably the embeddings are also written to embOutputPath — confirm in Embedding.trainItem2vec.
item_emb_model = Embedding.trainItem2vec(
    spark,
    user_watching_seqs,
    embLength,
    embOutputPath="emb/item2vecEmb.csv",
    saveToRedis=False,
    redisKeyPrefix="i2vEmb",
)

In [5]:
# Look up the 5 movies closest to movie id "99" in the Item2Vec space and
# print each neighbour's id next to its cosine similarity.
synonyms = item_emb_model.findSynonyms("99", 5)
for synonym, cosineSimilarity in synonyms:
    print(synonym, cosineSimilarity)

766 0.9405038952827454
627 0.9352996945381165
78 0.9332966208457947
96 0.9319654107093811
639 0.922230064868927


### Graph Embedding

In [6]:
# Fit the graph-embedding model on the same watch sequences.
# NOTE(review): saveToRedis=True here, unlike the Item2Vec cell (False) —
# presumably this needs a reachable Redis instance; confirm in Embedding.graphEmb.
item_graphemb_model = Embedding.graphEmb(
    user_watching_seqs,
    spark,
    embLength,
    embOutputFilename="emb/itemGraphEmb.csv",
    saveToRedis=True,
    redisKeyPrefix="graphEmb",
)

In [7]:
# Same query as for Item2Vec: the 5 nearest neighbours of movie id "99",
# this time in the graph-embedding space.
synonyms = item_graphemb_model.findSynonyms("99", 5)
for synonym, cosineSimilarity in synonyms:
    print(synonym, cosineSimilarity)

896 0.9658063054084778
627 0.9565341472625732
116 0.922584593296051
68 0.9199870228767395
43 0.9190088510513306


## User Embedding 

In [None]:
from pyspark.sql.types import *

# Read the raw ratings; the CSV header row supplies the column names.
user_ratting_samples = spark.read.format("csv").option("header", "true").load(rawSampleDataPath)

# Schema of the movie-embedding lookup table. It does not depend on the
# vectors, so it is built once here rather than inside the loop; this also
# keeps `fields` defined when the model returns no vectors at all.
fields = [
    StructField('movieId', StringType(), False),
    StructField('emb', ArrayType(FloatType()), False)
]
schema = StructType(fields)

# One (movieId, embedding-as-list) tuple per vector held by the Item2Vec model.
vectors_list = [(key, list(value)) for key, value in item_emb_model.getVectors().items()]
vectors_df = spark.createDataFrame(vectors_list, schema=schema)

# Inner join: ratings for movies without an embedding are dropped.
user_ratting_samples = user_ratting_samples.join(vectors_df, on='movieId', how='inner')

pd.DataFrame(user_ratting_samples.take(5))

In [None]:
# Keep only the userId and emb columns and turn each row into a
# (userId, emb) tuple on the RDD API.
user_ratting_samples_pair = (
    user_ratting_samples
    .select('userId', 'emb')
    .rdd
    .map(lambda row: (row[0], row[1]))
)
pd.DataFrame(user_ratting_samples_pair.take(5))

In [None]:
# Accumulate the embeddings of every movie a user rated into a single
# user embedding: vectors sharing a userId are summed element by element.
user_emb = user_ratting_samples_pair.reduceByKey(
    lambda left, right: [left[idx] + right[idx] for idx in range(len(left))]
)

pd.DataFrame(user_emb.take(5))