# Sistema de recomendação

- Também chamados de filtros colaborativos
- Analisa dados passados para compreender comportamento de pessoas
- A recomendação é feita por similaridade de comportamento
- Recomendação baseada em usuários ou em itens
- Algoritmos de recomendação esperam receber os dados em um formato específico (user_ID, item_ID, score)
- Score, também chamado de rating, indica a preferência de um usuário sobre um item. Pode ser valores booleanos, ratings ou mesmo volume de vendas

In [2]:
# imports
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

In [3]:
# Create (or reuse) a local Spark session named 'SpSession' for this notebook.
spSession = (
    SparkSession.builder
    .master('local')
    .appName('SpSession')
    .getOrCreate()
)

In [4]:
# Load the raw "user,item,rating" text lines as an RDD of strings.
# The SparkContext is taken from the session created above instead of a
# global `sc`, which only exists when the notebook is started through the
# pyspark shell and is never defined in this notebook.
ratingsRDD = spSession.sparkContext.textFile('5-Arquivos-Cap11/data/user-item.txt')
ratingsRDD.collect()

['1001,9001,10',
 '1001,9002,1',
 '1001,9003,9',
 '1002,9001,3',
 '1002,9002,5',
 '1002,9003,1',
 '1002,9004,10',
 '1003,9001,2',
 '1003,9002,6',
 '1003,9003,2',
 '1003,9004,9',
 '1003,9005,10',
 '1003,9006,8',
 '1003,9007,9',
 '1004,9001,9',
 '1004,9002,2',
 '1004,9003,8',
 '1004,9004,3',
 '1004,9010,10',
 '1004,9011,9',
 '1004,9012,8',
 '1005,9001,8',
 '1005,9002,3',
 '1005,9003,7',
 '1005,9004,1',
 '1005,9010,9',
 '1005,9011,10',
 '1005,9012,9',
 '1005,9013,8',
 '1005,9014,1',
 '1005,9015,1',
 '1006,9001,7',
 '1006,9002,4',
 '1006,9003,8',
 '1006,9004,1',
 '1006,9010,7',
 '1006,9011,6',
 '1006,9012,9']

In [11]:
# Parse each "user,item,rating" text line into a typed tuple:
# (user as int, item as int, rating as float).
ratingRDD2 = (
    ratingsRDD
    .map(lambda line: line.split(','))
    .map(lambda fields: (int(fields[0]), int(fields[1]), float(fields[2])))
)

In [21]:
# Turn the parsed tuples into a DataFrame with named columns.
# ALS is later built without explicit column arguments, so these names
# must match the columns it looks for.
ratingsDF = spSession.createDataFrame(
    ratingRDD2,
    schema=['user', 'item', 'rating'],
)

In [22]:
# Print a preview of the ratings table (the output below is cut at 20 rows).
ratingsDF.show()

+----+----+------+
|user|item|rating|
+----+----+------+
|1001|9001|  10.0|
|1001|9002|   1.0|
|1001|9003|   9.0|
|1002|9001|   3.0|
|1002|9002|   5.0|
|1002|9003|   1.0|
|1002|9004|  10.0|
|1003|9001|   2.0|
|1003|9002|   6.0|
|1003|9003|   2.0|
|1003|9004|   9.0|
|1003|9005|  10.0|
|1003|9006|   8.0|
|1003|9007|   9.0|
|1004|9001|   9.0|
|1004|9002|   2.0|
|1004|9003|   8.0|
|1004|9004|   3.0|
|1004|9010|  10.0|
|1004|9011|   9.0|
+----+----+------+
only showing top 20 rows



In [23]:
# ALS (Alternating Least Squares) --> recommender-system algorithm.
# rank=10 gives 10 factors per user/item (see the 10-value feature vectors
# below); maxIter=5 caps the number of training iterations.
als = ALS(rank=10,maxIter=5)
# Fit the factorization model on the (user, item, rating) DataFrame.
modelo = als.fit(ratingsDF)

In [24]:
# Inspect the learned factor vector of each user, sorted by user id.
modelo.userFactors.orderBy('id').collect()

[Row(id=1001, features=[-1.020338773727417, 0.15884457528591156, 0.43435177206993103, 0.42108067870140076, -0.08316300064325333, -0.22315619885921478, 0.43449509143829346, -0.016361376270651817, -0.09652017056941986, 1.2523633241653442]),
 Row(id=1002, features=[-0.7278647422790527, -0.3039030432701111, -1.4372807741165161, 0.4101669490337372, -0.20219333469867706, 0.30689162015914917, 0.2394382208585739, -0.31496909260749817, -0.42265287041664124, -0.5618031024932861]),
 Row(id=1003, features=[-0.2754066288471222, -0.02016931213438511, -1.2622171640396118, 0.6222057938575745, -0.3387295603752136, 0.6792862415313721, -0.059917986392974854, 0.23600462079048157, -0.24518680572509766, -0.4790261387825012]),
 Row(id=1004, features=[-1.1808589696884155, -0.198384091258049, 0.1094837412238121, 0.714041531085968, -0.03430582955479622, -0.05891014635562897, 0.40418773889541626, -0.05229669436812401, -0.19879166781902313, 0.6823390126228333]),
 Row(id=1005, features=[-0.520517110824585, 0.53832

In [25]:
# (user, item) pairs to score for user 1001: item 9003 appears among this
# user's ratings in the input file, while 9004 and 9005 do not.
testeDF = spSession.createDataFrame(
    [
        (1001, 9003),
        (1001, 9004),
        (1001, 9005),
    ],
    schema=['user', 'item'],
)

In [27]:
# Score the test pairs with the fitted model; each returned Row carries the
# user, the item and the predicted rating.
# NOTE(review): the collected rows come back in a different order than the
# input pairs (see output below) — sort explicitly if order matters.
previsoes = modelo.transform(testeDF).collect()
previsoes

[Row(user=1001, item=9004, prediction=-0.6660881042480469),
 Row(user=1001, item=9005, prediction=-2.7070765495300293),
 Row(user=1001, item=9003, prediction=9.008316993713379)]