# Import Modules

In [1]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# User Defined Functions

In [45]:
def not_tab(x):
    """Return True when ``x`` is anything other than a single tab character."""
    is_tab = (x == '\t')
    return not is_tab

In [46]:
def span(input_list, predicate):
    """Split ``input_list`` at the first item that fails ``predicate``.

    Args:
        input_list: A string, or a list/tuple of strings (the pieces are
            joined with ``''.join``, so every item must be a string).
        predicate: Callable taking one item and returning a truthy value
            while the item still belongs to the prefix.

    Returns:
        A ``(prefix, rest)`` tuple of strings. ``prefix`` is the longest
        leading run of items satisfying ``predicate``; ``rest`` is everything
        after it, starting with the first item that failed the predicate.
    """
    # Count how many leading items satisfy the predicate.
    split_at = 0
    for item in input_list:
        if not predicate(item):
            break
        split_at += 1
    # Slice by position rather than filtering by membership: filtering with
    # "x not in prefix" would also drop later items that merely happen to
    # equal something in the prefix (e.g. digits of the ID inside the name).
    prefix = ''.join(input_list[:split_at])
    rest = ''.join(input_list[split_at:])
    return (prefix, rest)

In [47]:
def artist_id_map(line):
    """Parse one line of artist data into an ``(id, name)`` pair.

    The line is split with ``span`` at the first tab: the part before the
    tab is the ID, and the remainder (which still begins with the tab) is the
    name.

    Args:
        line: A string of the form ``"<id>\\t<name>"``.

    Returns:
        ``(int_id, stripped_name)``, or ``None`` when the line has nothing
        after the ID or the ID is not a valid integer.
    """
    (artist_id, name) = span(line, predicate=not_tab)
    # An empty remainder means the line contained no tab at all.
    if name in ['', ' ', None]:
        return None
    try:
        # strip() removes the leading tab left in place by span().
        return (int(artist_id), name.strip())
    except ValueError:
        # Non-numeric ID: skip the line instead of failing the whole job.
        return None

In [3]:
def alias_map(line):
    """Parse one line of artist-alias data into a pair of integer IDs.

    Args:
        line: A string of the form ``"<id>\\t<id>"``.

    Returns:
        ``(int(first), int(second))`` for a well-formed line, or ``None``
        when the first field is blank, the second field is missing, or either
        field is not a valid integer.
    """
    # str.split with an explicit separator always yields at least one
    # element, so tokens[0] is safe to read.
    tokens = line.split('\t')
    if tokens[0] in ['', ' ', None]:
        return None
    try:
        return (int(tokens[0]), int(tokens[1]))
    except (IndexError, ValueError):
        # Malformed line (no second field, or non-numeric IDs): skip it,
        # the same way artist_id_map skips unparsable lines.
        return None

In [4]:
def prepare_training_data(line):
    """Turn one space-separated ``"user artist count"`` line into a Rating.

    The artist ID is looked up in the broadcast alias dictionary
    (``bArtistAlias.value``); when it has no entry there, the ID is used
    unchanged.
    """
    fields = [int(token) for token in line.split(' ')]
    user, artist, plays = fields
    aliases = bArtistAlias.value
    resolved_artist = aliases.get(artist, artist)
    return Rating(user, resolved_artist, plays)

# Prepare the Data

In [5]:
# Read user_artist_data.txt as lines of text; each line is later parsed by
# prepare_training_data as three space-separated integers.
# NOTE(review): `sc` is not defined in this file -- presumably the
# SparkContext supplied by the PySpark shell/notebook. The path is relative.
rawUserArtistData = sc.textFile("vagrant/profiledata_06-May-2005/user_artist_data.txt")

In [6]:
# Read artist_data.txt as lines of text; each line is parsed by
# artist_id_map as "<id>\t<name>".
rawArtistData = sc.textFile("vagrant/profiledata_06-May-2005/artist_data.txt")

In [48]:
# Map every line to (id, name); lines artist_id_map cannot parse become None
# and must be filtered out before use.
artistByID = rawArtistData.map(artist_id_map)

In [8]:
# Read artist_alias.txt as lines of text; each line is parsed by alias_map
# as two tab-separated integer IDs.
rawArtistAlias = sc.textFile("vagrant/profiledata_06-May-2005/artist_alias.txt")

In [9]:
# Parse the alias lines, drop anything that did not parse into a complete
# pair, and collect the surviving (id, id) pairs into a local dict.
artistAlias = (
    rawArtistAlias
    .map(alias_map)
    .filter(lambda pair: pair is not None and None not in pair)
    .collectAsMap()
)

# Building a First Model

In [10]:
# Broadcast the alias dict; prepare_training_data reads it through
# bArtistAlias.value.
bArtistAlias = sc.broadcast(artistAlias)

In [11]:
# Convert every input line into a Rating and mark the resulting RDD as
# cached so it is not recomputed each time it is used.
trainData = rawUserArtistData.map(prepare_training_data).cache()

In [12]:
# Train an ALS model on the ratings via the implicit-feedback entry point,
# with rank 10, 5 iterations, lambda_ 0.01 and alpha 1.0.
model = ALS.trainImplicit(ratings=trainData, rank=10, iterations=5, lambda_=0.01, alpha=1.0)

# Spot Checking Recommendations

In [16]:
# Request 10 recommendations for user 2093760 by invoking the model's
# "recommendProducts" method by name through the generic call() entry point.
# NOTE(review): newer PySpark releases expose
# model.recommendProducts(user, num) directly -- confirm against the Spark
# version in use before switching.
recommendations = model.call("recommendProducts", 2093760, 10)

In [20]:
# Collect the recommended product IDs into a real list. A list comprehension
# is used instead of map(): on Python 3 map() returns a one-shot iterator,
# which would be exhausted after the first membership test that uses it.
recommendedProductIDs = [rating.product for rating in recommendations]

In [56]:
# Keep only the successfully parsed (id, name) pairs whose ID is among the
# recommended product IDs, and bring them back to the driver.
recommendedProductNames = (
    artistByID
    .filter(lambda entry: entry is not None)
    .filter(lambda entry: entry[0] in recommendedProductIDs)
    .collect()
)

In [57]:
# Notebook display of the collected (id, name) pairs; the output follows.
recommendedProductNames

[(2814, u'50 Cent'),
 (4605, u'Snoop Dogg'),
 (829, u'Nas'),
 (1007614, u'Jay-Z'),
 (1037970, u'Kanye West'),
 (1811, u'Dr. Dre'),
 (1003249, u'Ludacris'),
 (1001819, u'2Pac'),
 (1300642, u'The Game'),
 (6914803, u'Jay-Z and Linkin Park')]