In [1]:
sc

In [2]:
sc.master

'yarn'

In [3]:
from dateutil.parser import parse

In [4]:
import pyspark

In [5]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
import numpy as np

In [7]:
import pandas as pd

In [8]:
from pyspark.sql import Row

In [9]:
import numpy as np
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

# 探索ml_100k数据

### 探索用户数据

In [10]:
user_data = sc.textFile('hdfs://zh:9000/ml-100k/u.user')

In [11]:
user_data.first()

'1|24|M|technician|85711'

### 探索电影数据

In [12]:
movie_data = sc.textFile('hdfs://zh:9000/ml-100k/u.item')

In [13]:
movie_data.first()

'1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0'

In [14]:
movie_data.count()

1682

In [15]:
movie_fields = movie_data.map(lambda x:x.split('|'))

In [16]:
def parse_exception(x):
    try:
        return str(parse(x).year)
    except Exception as e:
        return '1900'

In [17]:
years = movie_fields.map(lambda x:x[2]).map(parse_exception)

In [18]:
years_filtered = years.filter(lambda x:x!='1900')

In [19]:
years_filtered.take(1)

['1995']

### 探索评级数据

In [20]:
rating_data = sc.textFile('hdfs://zh:9000/ml-100k/u.data')

In [21]:
rating_data.first()

'196\t242\t3\t881250949'

In [22]:
num_ratings = rating_data.count()

In [23]:
num_ratings

100000

In [24]:
rating_data = rating_data.map(lambda x:x.split('\t'))

### 提取有效特征

In [25]:
rating_data_raw = sc.textFile('hdfs://zh:9000/ml-100k/u.data')

In [26]:
rating_data = rating_data_raw.map(lambda x:x.split('\t')[0:3])

In [27]:
rating_data.take(2)

[['196', '242', '3'], ['186', '302', '3']]

In [28]:
#rating_data.map(lambda x:np.str(x[0])).take(10)
#验证必须worker和driver有着相同的python包才能跑

In [29]:
rating_data_row = rating_data.map(lambda x:Row(user = int(x[0]),movie = int(x[1]), rating = float(x[2])))

In [30]:
rating_df = spark.createDataFrame(rating_data_row)

### 训练和测试

In [31]:
training_set, test_set =  rating_df.randomSplit([0.8, 0.2])

In [32]:
als = ALS(rank=50, maxIter=15, regParam=0.01, userCol='user', itemCol='movie', ratingCol='rating', seed=None,)

In [33]:
model_fited = als.fit(training_set)

In [34]:
predictions = model_fited.transform(test_set)
predictions

DataFrame[movie: bigint, rating: double, user: bigint, prediction: float]

In [35]:
predictions.take(10)

[Row(movie=148, rating=4.0, user=580, prediction=3.0657060146331787),
 Row(movie=148, rating=5.0, user=332, prediction=4.369940757751465),
 Row(movie=148, rating=2.0, user=244, prediction=3.302136182785034),
 Row(movie=148, rating=3.0, user=234, prediction=1.8688479661941528),
 Row(movie=148, rating=5.0, user=403, prediction=2.57193922996521),
 Row(movie=148, rating=4.0, user=621, prediction=2.678412914276123),
 Row(movie=148, rating=2.0, user=592, prediction=4.078207492828369),
 Row(movie=148, rating=4.0, user=825, prediction=3.6678569316864014),
 Row(movie=148, rating=3.0, user=894, prediction=2.8020336627960205),
 Row(movie=148, rating=4.0, user=393, prediction=4.32522439956665)]

In [36]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [37]:
RMSE = evaluator.evaluate(predictions)

In [38]:
RMSE

nan

* 测试集中存在不存在于训练集的用户或者产品，所谓的冷启动问题

In [39]:
predictions.where("prediction == 'NaN' ").show()

+-----+------+----+----------+
|movie|rating|user|prediction|
+-----+------+----+----------+
| 1645|   4.0| 655|       NaN|
| 1339|   1.0| 181|       NaN|
|  857|   3.0|  13|       NaN|
| 1201|   5.0|  90|       NaN|
| 1574|   1.0| 405|       NaN|
| 1493|   1.0| 279|       NaN|
| 1659|   1.0| 747|       NaN|
|  599|   1.0|   7|       NaN|
| 1626|   1.0| 648|       NaN|
| 1122|   5.0|  60|       NaN|
| 1627|   3.0| 650|       NaN|
| 1341|   1.0| 181|       NaN|
| 1486|   1.0| 279|       NaN|
| 1603|   3.0| 450|       NaN|
| 1321|   4.0| 279|       NaN|
| 1321|   1.0| 181|       NaN|
| 1576|   1.0| 405|       NaN|
| 1649|   3.0| 655|       NaN|
| 1520|   3.0| 314|       NaN|
| 1575|   1.0| 405|       NaN|
+-----+------+----+----------+
only showing top 20 rows



In [40]:
predictions_excep_nan = predictions.where("prediction != 'NaN' ")

In [41]:
RMSE = evaluator.evaluate(predictions_excep_nan)

In [42]:
RMSE

1.2168774745407576

### MAPK

In [43]:
k = 10

In [44]:
rating_data.count()

100000

#### 计算预测结果

In [45]:
pred_top_k = model_fited.recommendForAllUsers(k).rdd.map(lambda x:(x.user, x.recommendations)).\
mapValues(lambda x_list:[(x.movie, x.rating) for x in x_list])

In [46]:
pred_top_k.take(2)  

[(471,
  [(520, 6.291160583496094),
   (479, 6.118393421173096),
   (532, 6.076312065124512),
   (809, 5.879485607147217),
   (134, 5.865858554840088),
   (242, 5.837247848510742),
   (448, 5.803979873657227),
   (191, 5.771254539489746),
   (205, 5.599753379821777),
   (127, 5.5867767333984375)]),
 (463,
  [(716, 5.706809997558594),
   (724, 5.4069695472717285),
   (22, 5.2687764167785645),
   (1062, 5.240994453430176),
   (488, 5.217062950134277),
   (248, 5.166277885437012),
   (19, 5.059110641479492),
   (9, 5.045923233032227),
   (116, 4.999324321746826),
   (660, 4.988350868225098)])]

In [47]:
pred_top_k.persist()

PythonRDD[440] at RDD at PythonRDD.scala:48

In [53]:
pred_top_k.count()

943

#### 实际的排序结果

In [54]:
rating_data.take(2)

[['196', '242', '3'], ['186', '302', '3']]

In [55]:
actual_rating_data = rating_data.map(lambda x:(x[0], (x[1], x[2])))

In [56]:
actual_rating_data.take(2)

[('196', ('242', '3')), ('186', ('302', '3'))]

In [57]:
def list_append(x,y):
    x.append(y)
    return x

In [58]:
actual_rating_data_by_user = actual_rating_data.sortBy(lambda x:x[1][1], ascending=False\
).sortByKey().aggregateByKey([], lambda x, y:list_append(x, y), lambda x, y:x+y)

In [59]:
actual_rating_data_by_user.take(1)

[('190',
  [('117', '4'),
   ('147', '4'),
   ('751', '4'),
   ('300', '4'),
   ('628', '4'),
   ('269', '4'),
   ('222', '4'),
   ('405', '4'),
   ('333', '4'),
   ('354', '4'),
   ('15', '4'),
   ('245', '4'),
   ('273', '4'),
   ('591', '4'),
   ('276', '4'),
   ('544', '4'),
   ('310', '4'),
   ('326', '4'),
   ('148', '4'),
   ('100', '4'),
   ('7', '4'),
   ('717', '3'),
   ('742', '3'),
   ('282', '3'),
   ('118', '3'),
   ('294', '3'),
   ('291', '3'),
   ('748', '3'),
   ('895', '3'),
   ('125', '3'),
   ('281', '3'),
   ('989', '3'),
   ('685', '3'),
   ('121', '3'),
   ('508', '3'),
   ('696', '3'),
   ('546', '3'),
   ('328', '3'),
   ('258', '3'),
   ('826', '3'),
   ('24', '3'),
   ('977', '2'),
   ('974', '2'),
   ('363', '2'),
   ('1313', '2'),
   ('539', '2'),
   ('898', '2'),
   ('823', '2'),
   ('597', '2'),
   ('327', '2'),
   ('930', '2'),
   ('340', '1'),
   ('9', '1'),
   ('237', '5'),
   ('288', '5'),
   ('302', '5'),
   ('471', '5'),
   ('313', '5'),
   ('272',

* sortByKey之后用aggregateByKey导致rating5的在不同machine的结果没有按照顺序来。
* 排序应该是最后，否则中间的操作可能会破坏顺序，可能跟存储位置有关。

In [60]:
actual_rating_data_by_user = actual_rating_data.aggregateByKey([], lambda x, y:list_append(x,y), lambda x, y:x+y)

In [61]:
def list_sorted(a_list, reverse=True):
    a_list.sort(key=lambda x:x[1],  reverse=True)
    return a_list

In [62]:
actual_rating_data_by_user_sorted = actual_rating_data_by_user.mapValues(lambda x:list_sorted(x))

In [63]:
actual_rating_data_by_user_sorted.take(1)

[('190',
  [('302', '5'),
   ('471', '5'),
   ('313', '5'),
   ('272', '5'),
   ('237', '5'),
   ('288', '5'),
   ('117', '4'),
   ('147', '4'),
   ('751', '4'),
   ('300', '4'),
   ('628', '4'),
   ('269', '4'),
   ('222', '4'),
   ('405', '4'),
   ('333', '4'),
   ('354', '4'),
   ('15', '4'),
   ('245', '4'),
   ('273', '4'),
   ('591', '4'),
   ('276', '4'),
   ('544', '4'),
   ('310', '4'),
   ('326', '4'),
   ('148', '4'),
   ('100', '4'),
   ('7', '4'),
   ('717', '3'),
   ('742', '3'),
   ('282', '3'),
   ('118', '3'),
   ('294', '3'),
   ('291', '3'),
   ('748', '3'),
   ('895', '3'),
   ('125', '3'),
   ('281', '3'),
   ('989', '3'),
   ('685', '3'),
   ('121', '3'),
   ('508', '3'),
   ('696', '3'),
   ('546', '3'),
   ('328', '3'),
   ('258', '3'),
   ('826', '3'),
   ('24', '3'),
   ('977', '2'),
   ('974', '2'),
   ('363', '2'),
   ('1313', '2'),
   ('539', '2'),
   ('898', '2'),
   ('823', '2'),
   ('597', '2'),
   ('327', '2'),
   ('930', '2'),
   ('340', '1'),
   ('9',

In [64]:
actual_top_k = actual_rating_data_by_user_sorted.mapValues(lambda a_list:a_list[:k])

* a[:k] 如果a的length<k, return a

In [65]:
actual_top_k.take(100)

[('190',
  [('237', '5'),
   ('288', '5'),
   ('302', '5'),
   ('471', '5'),
   ('313', '5'),
   ('272', '5'),
   ('100', '4'),
   ('7', '4'),
   ('117', '4'),
   ('147', '4')]),
 ('263',
  [('133', '5'),
   ('357', '5'),
   ('199', '5'),
   ('135', '5'),
   ('378', '5'),
   ('141', '5'),
   ('176', '5'),
   ('419', '5'),
   ('416', '5'),
   ('523', '5')]),
 ('110',
  [('313', '5'),
   ('161', '5'),
   ('333', '4'),
   ('12', '4'),
   ('54', '4'),
   ('939', '4'),
   ('651', '4'),
   ('739', '4'),
   ('569', '4'),
   ('307', '4')]),
 ('695',
  [('1024', '5'),
   ('346', '5'),
   ('242', '5'),
   ('991', '5'),
   ('268', '5'),
   ('358', '5'),
   ('319', '5'),
   ('260', '4'),
   ('995', '4'),
   ('270', '4')]),
 ('473',
  [('246', '5'),
   ('475', '5'),
   ('127', '5'),
   ('150', '5'),
   ('273', '5'),
   ('1142', '5'),
   ('268', '5'),
   ('116', '5'),
   ('275', '5'),
   ('9', '5')]),
 ('198',
  [('186', '5'),
   ('127', '5'),
   ('531', '5'),
   ('50', '5'),
   ('474', '5'),
   ('8

#### 准备计算MAPK

In [66]:
def avgPrecisionK(actual, predict, k):
    if len(actual) == 0:
        return 1
    if len(actual)<k:
        k = len(actual)
    score = 0
    numHits = 0
    predict_k = predict[:k]
    for s,i in enumerate(predict_k):
        if i in actual:
            numHits+=1
        score = score + numHits/(s+1)
    return score/k

In [67]:
actual_top_k.count()

943

In [68]:
pred_top_k.count()

943

In [None]:
actual_top_k.join(pred_top_k).take(1)

* 总是报错：Container exited with a non-zero exit code 50 大概是资源不足