In [6]:
## O seguinte código só serve para importar o Spark.
## Se der para mudar o kernel para o Spark, pode ser retirado.

import findspark
findspark.init()
findspark.find()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
%config IPCompleter.greedy=True

# Local Spark configuration for this notebook (master 'local', app name 'appName').
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')

# getOrCreate() reuses an already-running context/session instead of
# constructing a second one, so this cell can be re-run in a live kernel
# (constructing SparkContext twice raises an error).
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Imports

In [7]:
from pyspark.sql.types import IntegerType
# NOTE: the star import shadows Python builtins with Spark column functions of
# the same name (e.g. min, max, sum) for the rest of the notebook.
from pyspark.sql.functions import *
from pyspark.sql.functions import broadcast
from pyspark.ml.recommendation import ALS
# Standard-library random module. The previous `from Lib import random` only
# resolves on installs where the interpreter's install directory is importable
# (typically Windows). Kept after the star import, as before, so that `random`
# always refers to this module.
import random

## <a id='2'> Ler Datasets </a>

In [8]:
# Directory holding the three raw text files; change it here if the data lives elsewhere.
DATA_DIR = "../../profiledata_06-May-2005"

# Each file is loaded as an RDD of text lines; parsing happens in the helper functions.
rawUserArtistData = sc.textFile(DATA_DIR + "/user_artist_data.txt")
rawArtistData = sc.textFile(DATA_DIR + "/artist_data.txt")
rawArtistAlias = sc.textFile(DATA_DIR + "/artist_alias.txt")

## <a id='helper-functions'> Helper Functions </a>

As seguintes funções são usadas depois pelas Main Functions, as principais responsáveis pelo funcionamento do programa.
Por isso, mudámos a ordem em que as funções são apresentadas, colocando as Helper Functions antes das outras.

## buildArtistByID

In [9]:
# The book uses flatMap for this parser; flatMap is used here as well so that
# malformed lines are skipped instead of being mapped to None and filtered.

def buildArtistByID(rawArtistData):
    """Parse the raw artist lines into a DataFrame with columns ("id", "name").

    Each line is split on its first tab into an id part and a name part.

    Skipped input:
      * lines without a tab,
      * lines whose name is empty after stripping surrounding whitespace,
      * lines whose id cannot be cast to an integer (the cast yields null).

    Parameters:
        rawArtistData: RDD of text lines of the form "<id>\\t<name>".

    Returns:
        DataFrame with an integer "id" column and a string "name" column.
    """
    def splitIdAndName(line):
        # partition() splits once on the first tab; `sep` is empty when the
        # line has no tab at all.
        artistId, sep, name = line.partition("\t")
        name = name.strip()
        return [(artistId, name)] if sep and name else []

    artistDF = rawArtistData.flatMap(splitIdAndName).toDF(["id", "name"])
    artistDF = artistDF.withColumn("id", artistDF["id"].cast(IntegerType()))
    # A non-numeric id becomes null after the cast; such rows can never be
    # matched by id, so they are dropped.
    return artistDF.dropna(subset=["id"])

## buildArtistAlias

In [10]:
def buildArtistAlias(rawArtistAlias):
    """Parse the raw alias lines into a DataFrame with columns ("artist_id", "alias").

    Each line is split on its first tab. Lines without a tab or with an empty
    first field are skipped, as are rows where either id cannot be cast to an
    integer.

    Parameters:
        rawArtistAlias: RDD of text lines of the form "<artist_id>\\t<alias>".

    Returns:
        DataFrame with integer columns "artist_id" and "alias".
    """
    def splitAliasLine(line):
        # `sep` is empty when the line has no tab; emitting nothing for such
        # lines avoids producing a one-field row for a two-column frame.
        artistId, sep, aliasId = line.partition("\t")
        return [(artistId, aliasId)] if sep and artistId != "" else []

    artistAliasDF = rawArtistAlias.flatMap(splitAliasLine).toDF(["artist_id", "alias"])

    artistAliasDF = artistAliasDF.withColumn("artist_id", artistAliasDF["artist_id"].cast(IntegerType()))
    artistAliasDF = artistAliasDF.withColumn("alias", artistAliasDF["alias"].cast(IntegerType()))

    # Values that are not valid integers become null after the cast; a row
    # with a null on either side cannot serve as an alias mapping.
    return artistAliasDF.dropna(subset=["artist_id", "alias"])

## buildCounts

In [11]:
def buildCounts(rawUserArtistData, bArtistAlias):
    """Build the (user, artist, count) DataFrame with artist ids replaced by their alias.

    Parameters:
        rawUserArtistData: RDD of space-separated text lines.
        bArtistAlias: DataFrame with columns "artist_id" and "alias".

    Returns:
        DataFrame with integer columns "user", "artist" and "count", where
        "artist" holds the alias whenever the alias table has a match.
    """
    columnNames = ["user", "artist", "count"]
    playsDF = rawUserArtistData.map(lambda line: line.split(" ")).toDF(columnNames)

    # All three fields arrive as strings; cast each one to an integer column.
    for columnName in columnNames:
        playsDF = playsDF.withColumn(columnName, playsDF[columnName].cast(IntegerType()))

    # The left outer join keeps every play row, including artists that have no alias entry.
    withAliasDF = playsDF.join(
        bArtistAlias,
        playsDF["artist"] == bArtistAlias["artist_id"],
        "left_outer",
    )

    # Use the alias when the join found one, otherwise keep the original artist id.
    resolvedArtist = when(withAliasDF["alias"].isNotNull(), withAliasDF["alias"]) \
        .otherwise(withAliasDF["artist"])

    return withAliasDF.withColumn("artist", resolvedArtist).select("user", "artist", "count")

## <a id='main-functions'> Main Functions </a>

Principais funções

## Preparation

In [12]:
def preparation(rawUserArtistData, rawArtistData, rawArtistAlias, showStats=False):
    """Sanity-check the raw inputs by displaying a sample alias pair by name.

    Takes the first row of the alias table and shows the artist rows for both
    of its ids ("artist_id" and "alias").

    Parameters:
        rawUserArtistData: RDD of space-separated text lines (user, artist, ...).
        rawArtistData: RDD of artist lines, parsed by buildArtistByID.
        rawArtistAlias: RDD of alias lines, parsed by buildArtistAlias.
        showStats: when True, also show the min/max of the user and artist
            ids. Off by default because it scans the whole play data.

    Returns:
        None; results are printed with show().
    """
    if showStats:
        # Imported explicitly so the aggregates do not depend on the builtins
        # min/max having been shadowed by a star import.
        from pyspark.sql import functions as F

        userArtistDF = rawUserArtistData.map(lambda line: line.split(" ")).toDF(["user", "artist"]).select("user", "artist")
        userArtistDF = userArtistDF.withColumn("user", userArtistDF["user"].cast(IntegerType()))
        userArtistDF = userArtistDF.withColumn("artist", userArtistDF["artist"].cast(IntegerType()))
        userArtistDF.select(F.min("user"), F.max("user"), F.min("artist"), F.max("artist")).show()

    artistByID = buildArtistByID(rawArtistData)
    artistAlias = buildArtistAlias(rawArtistAlias)

    # head() returns None for an empty DataFrame; nothing to display then.
    firstAlias = artistAlias.head()
    if firstAlias is None:
        return

    # Column 0 is "artist_id" (the id that gets replaced), column 1 is "alias".
    badID, goodID = firstAlias[0], firstAlias[1]

    artistByID.filter(artistByID["id"].isin(badID, goodID)).show()

In [None]:
# Build the alias table at notebook scope so it can be inspected interactively.
artistAlias = buildArtistAlias(rawArtistAlias)

In [None]:
# Show the alias rows whose "alias" column equals 6969903.
artistAlias.filter(artistAlias["alias"].isin(6969903)).show()

## Model

In [13]:
def model(rawUserArtistData, rawArtistData, rawArtistAlias, seed=None):
    """Fit an implicit-feedback ALS model on the alias-resolved play counts.

    Parameters:
        rawUserArtistData: RDD of play lines, parsed by buildCounts.
        rawArtistData: unused here; kept so the signature matches the other
            main functions.
        rawArtistAlias: RDD of alias lines, parsed by buildArtistAlias.
        seed: optional integer seed for ALS. When None, a random 19-digit
            seed is drawn; pass a fixed value for reproducible factors.

    Returns:
        The fitted ALS model.
    """
    if seed is None:
        # The upper bound is capped at 2**63 - 1 so the seed always fits in a
        # signed 64-bit integer; the previous bound (9999999999999999999)
        # could exceed it.
        seed = random.randint(1000000000000000000, 2**63 - 1)

    bArtistAlias = broadcast(buildArtistAlias(rawArtistAlias))
    # Cached so the parsed and joined counts can be reused rather than
    # recomputed from the text files; released in the finally block below.
    trainData = buildCounts(rawUserArtistData, bArtistAlias).cache()

    try:
        alsModel = (
            ALS()
            .setSeed(seed)
            .setImplicitPrefs(True)
            .setRank(10)
            .setRegParam(0.01)
            .setAlpha(1.0)
            .setMaxIter(5)
            .setUserCol("user")
            .setItemCol("artist")
            .setRatingCol("count")
            .setPredictionCol("prediction")
            .fit(trainData)
        )
    finally:
        trainData.unpersist()

    return alsModel

In [14]:
# Fit the recommender; stored as `model_` because `model` is the name of the function.
model_ = model(rawUserArtistData, rawArtistData, rawArtistAlias)

In [15]:
# Print the first 2 rows of the user factors without truncating the feature vectors.
model_.userFactors.show(2,False)

+---+-------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                       |
+---+-------------------------------------------------------------------------------------------------------------------------------+
|90 |[-0.6391638, 0.12591726, -0.6211719, -0.22671117, -0.7001853, -0.5480162, -0.34811047, -0.14221708, -0.0373149, 0.6197387]     |
|120|[0.1910231, 0.076923296, 0.053284623, -0.084045075, -0.038614605, -0.099791445, -0.10314017, 0.18801978, 0.22673032, 0.2317887]|
+---+-------------------------------------------------------------------------------------------------------------------------------+
only showing top 2 rows



Test model:

In [17]:
# User whose listening history is inspected.
userID = 2093760
bArtistAlias = broadcast(buildArtistAlias(rawArtistAlias))
trainData = buildCounts(rawUserArtistData, bArtistAlias)
artistByID = buildArtistByID(rawArtistData)

# collect() returns Row objects; unwrap them to plain values so the list can
# be passed to isin() below.
existingArtistIDs = [
    row["artist"]
    for row in trainData.filter(trainData["user"] == userID).select("artist").collect()
]

# Names of the artists this user has played.
artistByID.filter(artistByID["id"].isin(existingArtistIDs)).show()

In [19]:
# It works! Show the names for a hard-coded list of artist ids.
artistByID.filter(artistByID["id"].isin(1180, 1255340, 378, 813, 942)).show()

+-------+---------------+
|     id|           name|
+-------+---------------+
|   1180|     David Gray|
|    378|  Blackalicious|
|    813|     Jurassic 5|
|1255340|The Saw Doctors|
|    942|         Xzibit|
+-------+---------------+



## Evaluate

## Recommend