In [1]:
import findspark
findspark.init()
from typing import *
from pyspark.sql import *
from pyspark.sql.types import  IntegerType,ArrayType,StringType
from pyspark.ml import Pipeline
from pyspark.ml.feature import Word2Vec,Word2VecModel
from pyspark.sql.functions import *
import os

In [2]:
# Create (or reuse) the Spark session for this notebook, running locally.
spark = SparkSession.builder.appName('enbbeding').master('local[*]').getOrCreate()

In [3]:
# Toy corpus: "a b" repeated 100 times followed by "a c" repeated 10 times, split into tokens.
# NOTE: the text ends with a space, so split(" ") leaves one empty-string token at the end.
sent = ("a b " * 100 + "a c " * 10).split(" ")
# Two identical rows, each holding the token list in a single "sentence" column.
doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"])
doc

DataFrame[sentence: array<string>]

In [5]:
# Preview up to 10 rows of the toy corpus.
doc.show(10)

+--------------------+
|            sentence|
+--------------------+
|[a, b, a, b, a, b...|
|[a, b, a, b, a, b...|
+--------------------+



In [24]:
# Fit a Word2Vec model with 5-dimensional vectors on the "sentence" column; the seed is fixed at 42.
word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence")
model = word2Vec.fit(doc)

In [25]:
# Show the learned (word, vector) table.
model.getVectors().show()

+----+--------------------+
|word|              vector|
+----+--------------------+
|   a|[0.09461779892444...|
|   b|[1.15474212169647...|
|   c|[-0.3794820010662...|
+----+--------------------+



In [26]:
# Print the schema of the toy corpus.
doc.printSchema()

root
 |-- sentence: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [28]:
# Check the concrete class of the fitted model.
type(model) 

pyspark.ml.feature.Word2VecModel

In [49]:
# Keep the (word, vector) table in a variable for the inspection cells that follow.
df=model.getVectors()

In [57]:
# Collect all rows to the driver and inspect the second one (index 1).
df.collect()[1]

Row(word='b', vector=DenseVector([1.1547, -0.5933, -0.8722, 0.4669, 0.5515]))

In [58]:
# Print every word with its vector; collect() brings all rows to the driver first.
for row in df.collect():
    print(row['word'],row['vector'])

a [0.0946177989244461,-0.4951631426811218,0.06406556069850922,-0.37930983304977417,0.21593928337097168]
b [1.1547421216964722,-0.593326210975647,-0.8721810579299927,0.4669361710548401,0.551497220993042]
c [-0.3794820010662079,0.34077689051628113,0.06388652324676514,0.0352821946144104,-0.24136029183864594]


In [12]:
import findspark
findspark.init()
from typing import *
from pyspark.sql import *
from pyspark.sql.types import  IntegerType,ArrayType,StringType
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import *
import os,random
from collections import defaultdict
import redis

# Redis connection settings, used by trainItem2vec when saveToRedis=True.
HOST ='localhost'
PORT = 6379
def sortByTime(movieid_time_list:List):
    '''
    Order (movieId, timestamp) pairs chronologically and return only the movie ids.

    Timestamps are compared numerically. The ratings CSV is loaded without a
    schema, so timestamps arrive as strings; comparing them as text would put
    '1000000000' before '999999999'. The input list is left unmodified.

    :param movieid_time_list: sequence of (movieId, timestamp) pairs; each timestamp
        may be a number or a numeric string
    :return: list of movie ids, oldest timestamp first (ties keep their input order)
    '''
    ordered = sorted(movieid_time_list, key=lambda pair: float(pair[1]))
    return [pair[0] for pair in ordered]

def processItemSequence(spark:SparkSession, ratingsPath='./data/ratings.csv', minRating=3.5):
    '''
    Build one time-ordered sequence of well-rated movie ids per user.

    Reads the ratings CSV (with header), keeps ratings >= minRating, groups by
    userId and orders each user's movies by rating timestamp.

    :param spark: active SparkSession
    :param ratingsPath: path of the ratings CSV; expected columns are
        userId, movieId, rating, timestamp
    :param minRating: lowest rating that is kept
    :return: DataFrame with a single array<string> column 'movieIds'
    '''

    df = spark.read.format('csv').option('header', 'true').load(ratingsPath)
    df.printSchema()

    sortUdf = udf(f=sortByTime,returnType=ArrayType(StringType()))

    # The CSV is read without a schema, so every column is a string. Cast
    # explicitly so the rating filter and the per-user ordering are numeric
    # rather than lexicographic.
    highRatings = df.where(col('rating').cast('double') >= minRating)
    ratedAt = struct(col('movieId'), col('timestamp').cast('long').alias('timestamp'))

    # Ordering happens inside the UDF: collect_list gives no ordering guarantee
    # after a shuffle, so sorting the rows before groupby would not be reliable.
    userSeq = highRatings.groupby('userId') \
        .agg(sortUdf(collect_list(ratedAt)).alias('movieIds')) \
        .withColumn('movieIdStr', array_join('movieIds', ' '))
    userSeq.show(5)

    userSeq.printSchema()
    dataset = userSeq.select('movieIds')
    moviesCount = dataset.select(explode(col('movieIds')).alias('movieId')).distinct().count()
    print('unique high rating movies:{}'.format(moviesCount))
    return dataset

def trainItem2vec(dataset,filename,saveToRedis=False,redisKeyPrefix=None,synonymWord='158'):
    '''
    Train Word2Vec item embeddings and export them.

    The vectors are written to ./modeldata/<filename>, one line per item in the
    form "<movieId>:<v1>,<v2>,...". When saveToRedis is true the same strings
    are stored under "<redisKeyPrefix>:<movieId>" with a 10 minute expiry.

    :param dataset: DataFrame with an array<string> column named 'movieIds'
    :param filename: output file name inside ./modeldata
    :param saveToRedis: also write the embeddings to Redis at HOST:PORT
    :param redisKeyPrefix: key prefix for Redis; required when saveToRedis is true
    :param synonymWord: movie id whose nearest neighbours are printed as a sanity
        check; skipped when the id is not in the trained vocabulary
    :return: the fitted Word2VecModel
    :raises ValueError: if saveToRedis is true and redisKeyPrefix is None
    '''
    # Fail before the expensive fit rather than writing keys prefixed with "None:".
    if saveToRedis and redisKeyPrefix is None:
        raise ValueError('redisKeyPrefix is required when saveToRedis=True')

    word2vec = Word2Vec(vectorSize=10,windowSize=5,maxIter=10,inputCol='movieIds')
    model = word2vec.fit(dataset)
    print('model fitted')

    # Collect the vectors once and reuse them for the file and for Redis.
    embeddings = [
        (row['word'], ','.join(str(value) for value in row['vector']))
        for row in model.getVectors().collect()
    ]

    # Print the most similar movies (cosine similarity). Looking up a word that
    # is not in the vocabulary raises inside Spark, so check membership first.
    if synonymWord in {word for word, _ in embeddings}:
        for moveid,similarity in model.findSynonymsArray(synonymWord,20):
            print('{}:{}'.format(moveid,similarity))
    else:
        print('{} is not in the vocabulary, synonym lookup skipped'.format(synonymWord))

    outputPath = './modeldata/{}'.format(filename)
    os.makedirs(os.path.dirname(outputPath) or '.', exist_ok=True)
    with open(outputPath,'w',encoding='utf-8') as f:
        for word, vectorStr in embeddings:
            f.write('{}:{}\n'.format(word,vectorStr))

    # To bulk-delete keys on Windows:
    # redis-cli eval "redis.call('del', unpack(redis.call('keys','*')))" 0
    if saveToRedis:
        pool = redis.ConnectionPool(host=HOST,port=PORT)
        try:
            r = redis.Redis(connection_pool=pool)
            # key time-to-live in seconds
            ex = 60 * 10
            # Batch all writes into one pipeline instead of one round trip per key.
            pipe = r.pipeline(transaction=False)
            for word, vectorStr in embeddings:
                pipe.set('{}:{}'.format(redisKeyPrefix,word),vectorStr,ex=ex)
            pipe.execute()
        finally:
            pool.disconnect()
    return model


def dealPairMovie(movies:Row)->List:
    '''
    flatMap helper: turn one row's movie sequence into adjacent pairs.

    :param movies: row exposing the sequence under the key 'movieIds'
    :return: list of (current, next) tuples; empty when the sequence has fewer
        than two items
    '''
    sequence = movies['movieIds']
    return list(zip(sequence, sequence[1:]))


def generateTransitionMatrix(dataset:DataFrame):
    '''
    Build the item transition probabilities from per-user movie sequences.

    Every adjacent pair (a, b) in a sequence counts as one a -> b transition.

    :param dataset: DataFrame whose rows expose the sequence as 'movieIds'
    :return: (transitionMatrix, itemDistribution) where
        transitionMatrix[a][b] = count(a -> b) / count(a -> anything) and
        itemDistribution[a] = count(a -> anything) / count(all transitions).
        Both are defaultdicts keyed by the source item only.
    '''
    pairSamples=dataset.rdd.flatMap(dealPairMovie)
    pairSamples.cache()
    try:
        print(pairSamples.take(10))
        print('pairSamples over')
        # {(mid,mid2):count,...}
        pairCountMap = pairSamples.countByValue()
    finally:
        # The pairs are not needed once counted; release the cached partitions.
        pairSamples.unpersist()

    print('pairCountMap_{}'.format(len(pairCountMap)))
    # raw transition counts
    transitionCountMatrix = defaultdict(dict)
    itemCountMap = defaultdict(int)
    all_count=0
    for (src,dst),count in pairCountMap.items():
        transitionCountMatrix[src][dst] = count
        itemCountMap[src] +=count
        all_count+=count
    print('transitionCountMatrix over')

    # normalise counts into probabilities
    transitionMatrix = defaultdict(dict)
    itemDistribution = defaultdict(int)
    for src,cmap in transitionCountMatrix.items():
        for dst,count in cmap.items():
            transitionMatrix[src][dst] = float(count /itemCountMap[src])

    for src,count in itemCountMap.items():
        itemDistribution[src] = float(count / all_count)

    # Debug output for one sample movie. Use .get() so that printing does not
    # insert an empty '858' entry into the returned defaultdicts.
    print('transitionMatrix_{}'.format(len(transitionMatrix)))
    print(transitionMatrix.get('858', {}))
    print('itemDistribution_{}'.format(len(itemDistribution)))
    print(itemDistribution.get('858', 0))
    return transitionMatrix,itemDistribution

def oneRandomWalk(transitionMatrix, itemDistribution, sampleLength):
    '''
    Generate one random walk over the item transition graph.

    The start item is drawn from itemDistribution; each following item is drawn
    from the current item's row of transitionMatrix in proportion to its
    probability. The walk stops early at an item with no outgoing transitions
    or no entry in itemDistribution.

    :param transitionMatrix: mapping item -> {nextItem: probability}
    :param itemDistribution: mapping item -> probability of starting there
    :param sampleLength: maximum number of items in the walk
    :return: list of item ids; empty when itemDistribution has no positive weight
    '''
    startItems = list(itemDistribution.keys())
    startWeights = list(itemDistribution.values())
    if not startItems or sum(startWeights) <= 0:
        return []

    # Weighted draw of the first movie. random.choices normalises the weights,
    # so rounding error in the distribution cannot leave the start item unset.
    curItem = random.choices(startItems, weights=startWeights)[0]
    sample = [curItem]

    for _ in range(1, sampleLength):
        # .get() keeps lookups from inserting empty entries into defaultdicts.
        neighbours = transitionMatrix.get(curItem)
        if not neighbours or not itemDistribution.get(curItem):
            break
        candidates = list(neighbours.keys())
        weights = list(neighbours.values())
        if sum(weights) <= 0:
            break
        # Sample the next item in proportion to its transition probability.
        # Comparing the draw against each single probability would not follow
        # the distribution, and a sole successor with probability 1.0 would
        # never be taken.
        curItem = random.choices(candidates, weights=weights)[0]
        sample.append(curItem)
    return sample

def randomWalk(transitionMatrix,itemDistribution,sampleCount,sampleLength):
    '''
    Generate several independent random walks.

    :param transitionMatrix: passed through to oneRandomWalk
    :param itemDistribution: passed through to oneRandomWalk
    :param sampleCount: number of walks to generate
    :param sampleLength: passed through to oneRandomWalk
    :return: list with sampleCount walks, each as returned by oneRandomWalk
    '''
    return [
        oneRandomWalk(transitionMatrix, itemDistribution, sampleLength)
        for _ in range(sampleCount)
    ]

def oneNode2vec(transitionMatrix, itemDistribution, sampleLength, p=0.1, q=0.2):
    '''
    Generate one node2vec-style biased walk over the item transition graph.

    The start item is drawn from itemDistribution and the first step follows
    the plain transition probabilities. From the second step on, with t the
    previous item and v the current item, each candidate x reachable from v is
    weighted as:
        x == t                      -> prob / p   (step back to t)
        x in transitionMatrix[t]    -> prob       (also reachable from t)
        otherwise                   -> prob / q   (moves away from t)
    and the next item is drawn in proportion to those weights.

    :param transitionMatrix: mapping item -> {nextItem: probability}
    :param itemDistribution: mapping item -> probability of starting there
    :param sampleLength: maximum number of items in the walk
    :param p: return parameter, must be > 0
    :param q: in-out parameter, must be > 0
    :return: list of item ids; empty when itemDistribution has no positive weight
    :raises ValueError: if p or q is not positive
    '''
    if p <= 0 or q <= 0:
        raise ValueError('p and q must be positive')

    startItems = list(itemDistribution.keys())
    startWeights = list(itemDistribution.values())
    if not startItems or sum(startWeights) <= 0:
        return []

    # Weighted draw of the first movie.
    curItem = random.choices(startItems, weights=startWeights)[0]
    sample = [curItem]
    # nodeT is always the item visited just before curItem; there is none yet.
    nodeT = None

    for _ in range(1, sampleLength):
        # .get() keeps lookups from inserting empty entries into defaultdicts.
        neighbours = transitionMatrix.get(curItem)
        if not neighbours or not itemDistribution.get(curItem):
            break
        candidates = list(neighbours.keys())

        if nodeT is None:
            # First step: no previous item, so no bias is applied.
            weights = list(neighbours.values())
        else:
            prevNeighbours = transitionMatrix.get(nodeT, {})
            weights = []
            for item in candidates:
                prob = neighbours[item]
                if item == nodeT:
                    # jump back to the previous node
                    weights.append(prob / p)
                elif item in prevNeighbours:
                    # distance 1 from the previous node
                    weights.append(prob)
                else:
                    # distance 2 from the previous node
                    weights.append(prob / q)

        if sum(weights) <= 0:
            break
        # The biased weights are unnormalised; random.choices normalises them.
        nextItem = random.choices(candidates, weights=weights)[0]
        nodeT = curItem
        curItem = nextItem
        sample.append(curItem)

    return sample


def node2vec(transitionMatrix,itemDistribution,sampleCount,sampleLength):
    '''
    Generate several independent node2vec walks.

    :param transitionMatrix: passed through to oneNode2vec
    :param itemDistribution: passed through to oneNode2vec
    :param sampleCount: number of walks to generate
    :param sampleLength: passed through to oneNode2vec
    :return: list with sampleCount walks, each as returned by oneNode2vec
    '''
    return [
        oneNode2vec(transitionMatrix, itemDistribution, sampleLength)
        for _ in range(sampleCount)
    ]

def graphEmb(dataset:DataFrame,spark:SparkSession,embOutputFilename,saveToRedis=False,redisKeyPrefix=None):
    '''
    Graph embedding: sample walks from the item transition graph.

    Builds the transition matrix from the dataset, draws 20000 node2vec walks
    of length up to 10, converts them to a DataFrame with a 'movieIds' column
    and prints previews of each stage. The training call at the end is
    commented out, so embOutputFilename, saveToRedis and redisKeyPrefix are
    currently unused and nothing is returned.

    :param dataset: DataFrame of per-user movie sequences ('movieIds')
    :param spark: active SparkSession
    :param embOutputFilename: output file name intended for trainItem2vec
    :param saveToRedis: intended for trainItem2vec
    :param redisKeyPrefix: intended for trainItem2vec
    :return: None
    '''
    sampleCount, sampleLength = 20000, 10
    transitionMatrix, itemDistribution = generateTransitionMatrix(dataset)

    # Unbiased alternative:
    # newSamples=randomWalk(transitionMatrix, itemDistribution, sampleCount, sampleLength)
    newSamples = node2vec(transitionMatrix, itemDistribution, sampleCount, sampleLength)

    # wrap each walk in a Row and distribute as an RDD
    walkRows = [Row(movieIds=walk) for walk in newSamples]
    rddSamples = spark.sparkContext.parallelize(walkRows)
    print(newSamples[:10])
    print(rddSamples.take(10))

    # convert to a DataFrame
    dataFrameSamples = spark.createDataFrame(rddSamples)
    print(type(dataFrameSamples))
    print(dataFrameSamples.take(10))

    # Training on the sampled walks is disabled:
    # trainItem2vec(dataFrameSamples,embOutputFilename,saveToRedis,redisKeyPrefix)

In [3]:
# Create (or reuse) the local Spark session and load the ratings CSV with its header row.
# No schema is supplied, so every column is read as a string.
spark = SparkSession.builder.appName('enbbeding').master('local[*]').getOrCreate()
df = spark.read.format('csv').option('header', 'true').load('./data/ratings.csv')

In [11]:
# Collect every movieId per user into a 'movieIds' list (no rating filter, no ordering).
# NOTE(review): the saved output below shows a 'lengthMov' column that this code does
# not produce — the cell appears to have been edited after it last ran.
um=df.groupBy('userId').agg(collect_list(col('movieId')).alias('movieIds'))
um.show()

+---------+
|lengthMov|
+---------+
|    29776|
+---------+



In [13]:
# Build the per-user sequences of well-rated movies.
dataset=processItemSequence(spark)
print(type(dataset))
# Train item embeddings; they are written to ./modeldata/item2vecEmb1.txt and to Redis under 'i2vEmb:<movieId>'.
model=trainItem2vec(dataset,'item2vecEmb1.txt',saveToRedis=True,redisKeyPrefix='i2vEmb')
# Bring all vectors to the driver and index them by movie id as plain Python lists.
rows =model.getVectors().collect()
movdict={}
for row in rows:
    movdict[row['word']] = list(row['vector'])

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+--------------------+--------------------+
|userId|            movieIds|          movieIdStr|
+------+--------------------+--------------------+
| 10096| [858, 50, 593, 457]|      858 50 593 457|
| 10351|[1, 25, 32, 6, 60...|1 25 32 6 608 52 ...|
| 10436|[661, 107, 60, 1,...|661 107 60 1 919 ...|
|  1090|[356, 597, 919, 986]|     356 597 919 986|
| 11078|[232, 20, 296, 59...|232 20 296 593 45...|
+------+--------------------+--------------------+
only showing top 5 rows

root
 |-- userId: string (nullable = true)
 |-- movieIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- movieIdStr: string (nullable = true)

unique high rating movies:959
<class 'pyspark.sql.dataframe.DataFrame'>
model fitted
48:0.9619194865226746
256:0.9318743348121643
31:0.9131985902786255
186:0.9042068719863892
355:0.

In [10]:
def getUserEmb(movdict,movieids:Row,embDim=10):
    '''
    Sum the embeddings of a user's movies into a single user vector.

    Movies without an entry in movdict are skipped.

    :param movdict: mapping movie id -> embedding (list of numbers, length embDim)
    :param movieids: row exposing the user's movie ids under the key 'movieIds'
    :param embDim: embedding dimension; 10 matches the vectorSize used in trainItem2vec
    :return: list of embDim numbers; all zeros when no movie has an embedding
    '''
    useremb=[0] * embDim
    for movieid in movieids['movieIds']:
        # Look up by the loop variable, not by the literal string 'movieid'.
        movEmb = movdict.get(movieid)
        if movEmb:
            useremb=[useremb[i]+movEmb[i] for i in range(embDim)]
    return useremb

                
    

29776