This project consists of 4 questions:  

1. Create an RDD with _kaggle_visible_evaluation_triplets.txt_ and replace the song name with the song index from _kaggle_songs.txt_. Identify the number of songs that do not have any rating. 
2. Generate song ratings based on the song play count as a normalized score between 0 and 1. 
3. Identify popular songs based on this rating and, given a user id, recommend songs to that user using the algorithm from the movie recommender system covered in class. 
4. Using the cosine similarity function, compute the pair-wise similarity between every pair of users and report the top 5 most similar pairs such that no user appears in more than one pair. 

The above list is a higher-level overview of the questions. 

In [None]:
# Initialise findspark with the local Spark installation path before pyspark is imported.
import findspark
findspark.init('C:\\apachespark')
from pyspark import SparkConf, SparkContext
# Build a configuration with master "local[*]" and application name "Songs",
# then create the SparkContext `sc` that the rest of the notebook reads files with.
conf = SparkConf().setMaster("local[*]").setAppName("Songs")
sc = SparkContext(conf = conf)

In [None]:
# Load the visible-evaluation triplets file into an RDD.
# Each record is the tab-split line: [user, song, play count] (all strings).
triplet_lines = sc.textFile(r"datasets\\kaggle_visible_evaluation_triplets.txt")
triplet_rdd = triplet_lines.map(lambda raw_line: raw_line.split("\t"))
triplet_rdd.take(5)

[['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SOBONKR12A58A7A7E0', '1'],
 ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SOEGIYH12A6D4FC0E3', '1'],
 ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SOFLJQZ12A6D4FADA6', '1'],
 ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SOHTKMO12AB01843B0', '1'],
 ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SODQZCY12A6D4F9D11', '1']]

# Step 1: 
Replace song name with song index and identify the number of songs without user history

In [None]:
# Load the song list into an RDD of (song name, song index) string pairs.
def _song_name_and_index(raw_line):
    """Split a space-separated line and keep its first two fields."""
    fields = raw_line.split(" ")
    return (fields[0], fields[1])


songs_rdd = sc.textFile(r"datasets\\kaggle_songs.txt").map(_song_name_and_index)
songs_rdd.take(5)

[('SOAAADD12AB018A9DD', '1'),
 ('SOAAADE12A6D4F80CC', '2'),
 ('SOAAADF12A8C13DF62', '3'),
 ('SOAAADZ12A8C1334FB', '4'),
 ('SOAAAFI12A6D4F9C66', '5')]

In [None]:
# Load the users file into an RDD; each record is the tab-split line,
# i.e. a list such as [user].
user_lines = sc.textFile(r"datasets\\kaggle_users.txt")
users = user_lines.map(lambda raw_line: raw_line.split("\t"))
users.take(5)

[['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d'],
 ['d7083f5e1d50c264277d624340edaaf3dc16095b'],
 ['d68dc6fc25248234590d7668a11e3335534ae4b4'],
 ['9be82340a8b5ef32357fe5af957ccd54736ece95'],
 ['841b2394ae3a9febbd6b06497b4a8ee8eb24b7f8']]

In [None]:
# Re-key the triplets by song: (song, [user, play_count as int]).
def _key_by_song(triplet):
    """Turn [user, song, count] into (song, [user, int(count)])."""
    return (triplet[1], [triplet[0], int(triplet[2])])


hacked_triplet_rdd = triplet_rdd.map(_key_by_song)
hacked_triplet_rdd.take(5)

[('SOBONKR12A58A7A7E0', ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 1]),
 ('SOEGIYH12A6D4FC0E3', ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 1]),
 ('SOFLJQZ12A6D4FADA6', ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 1]),
 ('SOHTKMO12AB01843B0', ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 1]),
 ('SODQZCY12A6D4F9D11', ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 1])]

In [None]:
# Swap each song name for its index from songs_rdd.
# The join yields (song, ([user, play_count], song_id)); only the value part
# is needed, reordered into (song_id, [user, play_count]).
joined_on_song = hacked_triplet_rdd.join(songs_rdd)
hacked_triplet_rdd2 = joined_on_song.values().map(lambda pair: (pair[1], pair[0]))
hacked_triplet_rdd2.take(10)

[('25150', ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 1]),
 ('25150', ['c34670d9c1718361feb93068a853cead3c95b76a', 1]),
 ('25150', ['c5006d9f41f68ccccbf5ee29212b6af494110c5e', 1]),
 ('25150', ['e4332e11f4df6dd26673bb6b085e9a2bbdc9b8a5', 2]),
 ('25150', ['baf2fe5885ab93fbbdb7fecc6691788e70afb6c8', 4]),
 ('25150', ['f6e34f0a68d5ea1344511e33486f956de361db78', 1]),
 ('25150', ['e326c4b9fe3659ec1dc3af53fd7e0893809dafbc', 25]),
 ('25150', ['00f7c493ee64884998ea98d9f5bed87bc4a0afcf', 5]),
 ('25150', ['daa9e7e53ae787ab4f1b5518b695198947d821a2', 1]),
 ('25150', ['cd4321d8fd42ba44996e7f34c2f6404cf5884696', 1])]

In [None]:
# Songs keyed by index, minus every index that occurs in the triplets,
# leaves the songs that nobody has listened to.
hacked_songs_rdd = songs_rdd.keyBy(lambda song: song[1]).mapValues(lambda song: song[0])
triplet_subtracted = hacked_songs_rdd.subtractByKey(hacked_triplet_rdd2)
triplet_subtracted.count()


223007

## Step 2:
Generate song ratings based on the play_count. For example, if (song_1, 5; song_2, 10; song_3, 5) i.e., song_1 is played 5 times, song_2 is played 10 times and song_3 is played 5 times, the normalized rating score should be 0.25, 0.5 and 0.25 respectively. 
Similarly, generate the rating for all the songs. You may notice that based on all songs, the rating is almost always very low. So, think of the best way to convert song count to ratings. (Hint: Try generating ratings based on each user's song play history)

In [None]:
# Re-key by user: (user, [play_count, song_id]).
def _key_by_user(record):
    """Turn (song_id, [user, count]) into (user, [count, song_id])."""
    song_id, user_and_count = record
    return (user_and_count[0], [user_and_count[1], song_id])


hacked_triplet_rdd3 = hacked_triplet_rdd2.map(_key_by_user)
hacked_triplet_rdd3.take(5)

[('fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', [1, '25150']),
 ('c34670d9c1718361feb93068a853cead3c95b76a', [1, '25150']),
 ('c5006d9f41f68ccccbf5ee29212b6af494110c5e', [1, '25150']),
 ('e4332e11f4df6dd26673bb6b085e9a2bbdc9b8a5', [2, '25150']),
 ('baf2fe5885ab93fbbdb7fecc6691788e70afb6c8', [4, '25150'])]

In [None]:
# (user, play_count) pairs, one per (user, song) record.
song_count_rdd = hacked_triplet_rdd2.values().map(lambda user_and_count: (user_and_count[0], user_and_count[1]))

In [None]:
# Total play count per user: add up all counts that share a user key.
# These totals are the denominators for the per-user song ratings.
def _add_counts(running_total, count):
    """Add two play counts, coercing both to int."""
    return int(running_total) + int(count)


sum_of_song_counts = song_count_rdd.reduceByKey(_add_counts)
sum_of_song_counts.collect()

[('e4332e11f4df6dd26673bb6b085e9a2bbdc9b8a5', 54),
 ('f6e34f0a68d5ea1344511e33486f956de361db78', 219),
 ('bcb1e6d620cf522390d5c92bae26936928e0b588', 56),
 ('ed199f27a41066e37414c3fe9eefb2ae372b8819', 24),
 ('c1d24ce8cd80e40aa8d803d5ddfceb91a6b5d75d', 15),
 ('c48985d93d590dff33d20094eebc863b0cb455e8', 30),
 ('5f3cd0eac9cdeb89958e3927f0f7b230ada8a24f', 158),
 ('604d906f3fd40383e39c2b75fb6e90f8ee742ca7', 60),
 ('55a653641aa684a958e01a49442c6d4da2badd8d', 94),
 ('07046c62a9dcf4eebd2c979a9847b12fc624d23a', 17),
 ('cd2cfefe1d96ad6685cd5dbf433469a0e541cd02', 29),
 ('2795e5d515d817a5308c2e16ce0cdd2480474f34', 24),
 ('7f1a9d6e8335db6f7f5aa15996c194623a5ab795', 27),
 ('7af67014f661a3aa87072cbbd2dddb7a72e496e0', 21),
 ('ed9d7edf01c1281610eeff3044e7041eb83ae25a', 90),
 ('4df0562280677a09e36838e350c23c3b4cc63844', 79),
 ('efc780aa9c660a83f7c05bc6d75b7c47707d0992', 30),
 ('84b9555110e8de9d2da568afe1fbe56b8684a407', 34),
 ('4a16fd8943913c0268b360bd12f37a4736c2b897', 27),
 ('d1be083ce8898091d4ec8666fd

In [None]:
# Grand total of plays across all users.
sum_of_song_counts.map(lambda user_total: user_total[1]).sum()

4624340

In [None]:
# Per-user normalised rating for every listened song.
# The join yields (user, ([play_count, song_id], user_total));
# keep (song_id, play_count / user_total).
with_user_totals = hacked_triplet_rdd3.join(sum_of_song_counts)
joined_rdd = with_user_totals.values().map(lambda v: (v[0][1], v[0][0] / v[1]))
joined_rdd.take(5)

[('25150', 0.4642857142857143),
 ('177172', 0.017857142857142856),
 ('212753', 0.14285714285714285),
 ('25890', 0.017857142857142856),
 ('259912', 0.017857142857142856)]

In [None]:
# Mean rating per song: accumulate (rating sum, rating count) per song id,
# then divide the sum by the count.
def _merge_sum_and_count(left, right):
    """Add two (sum, count) pairs element-wise."""
    return (left[0] + right[0], left[1] + right[1])


rating_sums = joined_rdd.mapValues(lambda rating: (rating, 1)).reduceByKey(_merge_sum_and_count)
reduced_rdd = rating_sums.map(lambda rec: (rec[0], rec[1][0] / rec[1][1]))

reduced_rdd.collect()


[('98924', 0.09733013669292474),
 ('302369', 0.07242217163496913),
 ('170536', 0.18490677612062836),
 ('183796', 0.06976218180987746),
 ('257058', 0.05231647594438291),
 ('252661', 0.06290667740212129),
 ('56963', 0.04191495875118941),
 ('223276', 0.11),
 ('380790', 0.08493987676630632),
 ('199603', 0.12462671509281678),
 ('142221', 0.04938101409072361),
 ('49469', 0.07027205114890966),
 ('235657', 0.06964471275265194),
 ('9307', 0.04731672280461688),
 ('245010', 0.08975312726622793),
 ('14129', 0.08265325804627761),
 ('11631', 0.09246412666257851),
 ('78354', 0.08765729511457539),
 ('21767', 0.06885704577040082),
 ('71024', 0.0771899669756379),
 ('348714', 0.04678376990241397),
 ('327545', 0.07875991260665752),
 ('383583', 0.09860927721885361),
 ('321930', 0.1141739379572337),
 ('193240', 0.11086733076561259),
 ('352870', 0.15013200298725482),
 ('318288', 0.07641684964820467),
 ('154347', 0.08472744988273875),
 ('9817', 0.11591103619899638),
 ('237680', 0.051953748112158055),
 ('26901

## Step 3: 
For a given user_id (choose one yourself) and a rating threshold, recommend 5 other songs from the list. One way to do this is to find other users who also listened to a song that this user rated at or above the given threshold, and then recommend 5 songs taken from those matched users' ratings. 

In [None]:
# Per-user normalised ratings as flat triples:
# (user, song_id, play_count / user_total).
def _to_user_song_rating(record):
    """Flatten (user, ([count, song_id], total)) into (user, song_id, count / total)."""
    user, (count_and_song, user_total) = record
    return (user, count_and_song[1], count_and_song[0] / user_total)


user_pref_rdd = hacked_triplet_rdd3.join(sum_of_song_counts).map(_to_user_song_rating)
user_pref_rdd.take(20)


[('bcb1e6d620cf522390d5c92bae26936928e0b588', '25150', 0.4642857142857143),
 ('bcb1e6d620cf522390d5c92bae26936928e0b588', '177172', 0.017857142857142856),
 ('bcb1e6d620cf522390d5c92bae26936928e0b588', '212753', 0.14285714285714285),
 ('bcb1e6d620cf522390d5c92bae26936928e0b588', '25890', 0.017857142857142856),
 ('bcb1e6d620cf522390d5c92bae26936928e0b588', '259912', 0.017857142857142856),
 ('bcb1e6d620cf522390d5c92bae26936928e0b588', '314086', 0.21428571428571427),
 ('bcb1e6d620cf522390d5c92bae26936928e0b588', '334240', 0.017857142857142856),
 ('bcb1e6d620cf522390d5c92bae26936928e0b588', '105694', 0.017857142857142856),
 ('bcb1e6d620cf522390d5c92bae26936928e0b588', '225548', 0.08928571428571429),
 ('c1d24ce8cd80e40aa8d803d5ddfceb91a6b5d75d', '25150', 0.06666666666666667),
 ('c1d24ce8cd80e40aa8d803d5ddfceb91a6b5d75d', '12985', 0.3333333333333333),
 ('c1d24ce8cd80e40aa8d803d5ddfceb91a6b5d75d', '288653', 0.3333333333333333),
 ('c1d24ce8cd80e40aa8d803d5ddfceb91a6b5d75d', '169415', 0.2),
 ('c

In [None]:
# Target user, and the minimum normalised rating a song needs to count as
# one of that user's interests.
user_id = '0f40e074aab2c5f47b7ddc2277fb0295b5b3a058'
user_rating = 0.05


def get_user_interest(row):
    """Tell whether a (user, song_id, rating) row is an interest of the target user.

    True only when row[0] equals the global user_id and row[2] is at
    least the global user_rating; the rating is not read for other users.
    """
    is_target_user = row[0] == user_id
    return bool(is_target_user and row[2] >= user_rating)
# (song_id, user) pairs for the songs the target user rates at or above the threshold.
target_user_rows = user_pref_rdd.filter(get_user_interest)
user_interest = target_user_rows.map(lambda row: (row[1], row[0]))
user_interest.collect() # Get the top songs that this user listens to


[('266479', '0f40e074aab2c5f47b7ddc2277fb0295b5b3a058')]

In [None]:
def get_like_user(row):
    """Keep joined (song_id, (user, ...)) rows whose user is not the target user.

    Compares row[1][0] with the global user_id.
    """
    return bool(row[1][0] != user_id)

# Users other than the target who also listened to one of the target's interest songs.
# Key the preferences by song with an explicit (song_id, user) pair: join()
# operates on key-value pairs, and the rating that used to ride along as a
# third tuple element never appeared in the joined result anyway.
song_and_user = user_pref_rdd.map(lambda x: (x[1], x[0]))
# The join yields (song_id, (user, target_user)); drop the target's own rows
# and keep (user, song_id).
song_user_freq = song_and_user.join(user_interest).filter(get_like_user).map(lambda x: (x[1][0], x[0]))
song_user_freq.take(5) # different users interested in the same song


[('aeda0f35230e7cb795a98759c06bd976b566834a', '266479'),
 ('103606dcfbc8a2f40e611909a11b4d6ab2cb5b63', '266479'),
 ('9eef7612496886991a5d151d6b477df1f3b111c5', '266479'),
 ('6c4d09b187aa10f6117795412c154dfe0b81ba54', '266479'),
 ('e0ed8c4e81251375e90ddb43da96ebdda475768a', '266479')]

In [None]:
# Candidate songs: everything the like-minded users listened to, minus the
# target user's own interest songs.
# Key the preferences by user with an explicit (user, song_id) pair instead of
# handing 3-tuples to join(), which works on key-value pairs.
user_and_song = user_pref_rdd.map(lambda x: (x[0], x[1]))
# The join yields (user, (song_id, shared_song_id)); re-key by the candidate song.
liked_by_similar = user_and_song.join(song_user_freq).map(lambda x: (x[1][0], x[1][1]))
# NOTE(review): the result is neither ranked by rating nor de-duplicated, so
# take(5) returns whichever five candidates come first.
recommended_songs = liked_by_similar.subtractByKey(user_interest).map(lambda x: x[0])
recommended_songs.take(5)


['321647', '214742', '352646', '131414', '34177']

## Step 4: 
1. Compute cosine similarity between all pairs of users. 
2. Sort the similarity score and print the top-5 similar users. 
3. If the top-5 user set has a user appearing more than once, ignore that pair and take the next best pair from the sorted list. 
4. For a given user_id, identify the top-5 similar users and hence song recommendations from other user's list. 

In [None]:
# Take only 100 unique users for quicker computing
# take(100) brings the first 100 records of `users` back as a plain Python
# list; each record is itself a list because `users` holds tab-split lines.
target_users = users.take(100) # get 100 users in list user of list [['u1'], ['u2'], ...]


In [None]:
# Keep only the preferences of the sampled users, keyed by user:
# (user, [song_id, rating]).
# target_users is a list of one-item lists; flatten it into a set once so each
# record costs a hash lookup instead of a scan over the whole list.
target_user_ids = {entry[0] for entry in target_users if len(entry) == 1}
interested_user_pref_rdd = user_pref_rdd.filter(lambda x: x[0] in target_user_ids).map(lambda x: (x[0], [x[1], x[2]]))
#interested_user_pref_rdd.collect() # user as key, [song, their freq] as value for only 100 users


In [None]:
# Gather each user's [song_id, rating] entries into a single list:
# (user, [[song_id, rating], ...]).
interested_user_grouped = interested_user_pref_rdd.groupByKey().mapValues(list)
interested_user_grouped.take(5)
#interested_user_grouped.count() # make sure 100 users


[('248378ac27e1745d6a9d59392b7dc5b02a6186a6',
  [['68212', 0.02857142857142857],
   ['89197', 0.02857142857142857],
   ['87433', 0.05714285714285714],
   ['5165', 0.02857142857142857],
   ['248603', 0.02857142857142857],
   ['307140', 0.02857142857142857],
   ['123630', 0.02857142857142857],
   ['288653', 0.05714285714285714],
   ['166753', 0.02857142857142857],
   ['357396', 0.02857142857142857],
   ['267404', 0.02857142857142857],
   ['221730', 0.02857142857142857],
   ['332526', 0.05714285714285714],
   ['148519', 0.34285714285714286],
   ['313717', 0.02857142857142857],
   ['249703', 0.02857142857142857],
   ['165401', 0.02857142857142857],
   ['81708', 0.05714285714285714],
   ['244143', 0.02857142857142857],
   ['359546', 0.02857142857142857]]),
 ('7dd760d455a99396add9102008f64619f466847f',
  [['14397', 0.03225806451612903],
   ['294908', 0.03225806451612903],
   ['302247', 0.03225806451612903],
   ['126471', 0.3225806451612903],
   ['213822', 0.12903225806451613],
   ['195955', 

In [None]:
#
# user-based filtering
#
import math
def dot(v,w):
    """Return the dot product v_1*w_1 + ... + v_n*w_n.

    Elements are paired with zip, so any extra trailing elements of the
    longer sequence are ignored.
    """
    return sum(v_i * w_i for v_i, w_i in zip(v, w))


def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors v and w.

    Returns 0.0 when either vector has zero magnitude (which includes
    empty input) instead of raising ZeroDivisionError.
    """
    norm_product = dot(v, v) * dot(w, w)
    if norm_product == 0:
        # A zero-length vector has no direction, so treat it as dissimilar.
        return 0.0
    return dot(v, w) / math.sqrt(norm_product)

# Pairwise cosine similarity between all grouped users.
# all_cosine_sim[i][j] is the similarity between the i-th and j-th user in the
# order returned by interested_user_grouped.collect().

# Run the Spark job once and reuse the result; calling collect() inside the
# loops would re-execute it on every outer iteration.
grouped_users = interested_user_grouped.collect()

# One {song_id: rating} dict per user, built once instead of once per pair.
song_freq_by_user = [
    {song: freq for song, freq in song_entries}
    for _user, song_entries in grouped_users
]

all_cosine_sim = []
for idx_u1, song_freq_u1 in enumerate(song_freq_by_user):
    cos_to_user1 = []

    for idx_u2, song_freq_u2 in enumerate(song_freq_by_user):
        if idx_u1 == idx_u2:
            # A user is identical to itself.
            cos_to_user1.append(1.0)
            continue

        # Align both users on the union of their songs: user 1's songs first,
        # then the songs only user 2 has, in a deterministic order.
        songs_shared = list(song_freq_u1)
        songs_shared.extend(song for song in song_freq_u2 if song not in song_freq_u1)

        # A song the user never played contributes 0 to that user's vector.
        user1_freq_vector = [song_freq_u1.get(song, 0) for song in songs_shared]
        user2_freq_vector = [song_freq_u2.get(song, 0) for song in songs_shared]

        cos_to_user1.append(cosine_similarity(user1_freq_vector, user2_freq_vector))

    all_cosine_sim.append(cos_to_user1)
print(all_cosine_sim)

##Answer to 4.1

[[1.0, 0.0, 0.0, 0.0, 0.01074430618700507, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005671904765548172, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.046111233416338784, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.035525354785040215, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030170187309256136, 0.0024381633528165504, 0.010216188884775934, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01139605764596379, 0.0, 0.0, 0.0, 0.02519763153394847, 0.0, 0.0, 0.0, 0.03466578627692475, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02279211529192758, 0.0, 0.011902794012872307, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12344267996967347, 0.0, 0.0966987556830456, 0.0, 0.01551133468658962, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02530047031200279, 0.0, 0.0, 0.007220953837871386, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04244763599780089, 0.0, 0.0, 0.0, 0.0, 

In [None]:
## Sort the similarity score and print the top-5 similar users.

def most_similar_users_to(user_id):
    """Rank the other users by their cosine similarity to the given user.

    user_id is a row index into the global all_cosine_sim matrix. Returns
    (other_index, similarity) pairs, highest similarity first, leaving out
    the user itself and every entry whose similarity is not above zero.
    """
    ranked = []
    for other_user_id, similarity in enumerate(all_cosine_sim[user_id]):
        if other_user_id == user_id:
            continue  # never pair a user with itself
        if similarity > 0:
            ranked.append((other_user_id, similarity))
    # The sort is stable, so equal scores keep their index order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# Example: ranked (other_index, similarity) pairs for the user in row 0 of all_cosine_sim.
most_similar_users_to(0)

[(92, 0.12344267996967347),
 (94, 0.0966987556830456),
 (21, 0.046111233416338784),
 (29, 0.035525354785040215),
 (74, 0.03466578627692475),
 (50, 0.030170187309256136),
 (70, 0.02519763153394847),
 (80, 0.02279211529192758),
 (96, 0.01551133468658962),
 (82, 0.011902794012872307),
 (66, 0.01139605764596379),
 (4, 0.01074430618700507),
 (52, 0.010216188884775934),
 (13, 0.005671904765548172),
 (51, 0.0024381633528165504)]

In [None]:
# Flatten the per-user rankings into {(user_index, other_index): similarity}.
# Walk the rows actually present in all_cosine_sim rather than a hard-coded
# 100, which raises IndexError whenever fewer users ended up in the matrix.
dict_cos_sim = {}
for user in range(len(all_cosine_sim)):
    # Each entry is an (other_index, similarity) pair, best match first.
    for other_user, score in most_similar_users_to(user):
        dict_cos_sim[(user, other_user)] = score

print(dict_cos_sim)

{(0, 92): 0.12344267996967347, (0, 94): 0.0966987556830456, (0, 21): 0.046111233416338784, (0, 29): 0.035525354785040215, (0, 74): 0.03466578627692475, (0, 50): 0.030170187309256136, (0, 70): 0.02519763153394847, (0, 80): 0.02279211529192758, (0, 96): 0.01551133468658962, (0, 82): 0.011902794012872307, (0, 66): 0.01139605764596379, (0, 4): 0.01074430618700507, (0, 52): 0.010216188884775934, (0, 13): 0.005671904765548172, (0, 51): 0.0024381633528165504, (1, 35): 0.04244763599780089, (1, 62): 0.030015011259383207, (1, 14): 0.02530047031200279, (1, 70): 0.012253577034896794, (1, 17): 0.007220953837871386, (2, 6): 0.049405378562281374, (2, 50): 0.030083171591179698, (2, 43): 0.01767400220949067, (2, 80): 0.016042149975047403, (2, 13): 0.007984300336119845, (2, 16): 0.00793145122498055, (2, 81): 0.003743546482619114, (4, 70): 0.17766726362967536, (4, 80): 0.1499923475244192, (4, 27): 0.08416546361568651, (4, 69): 0.05415303610738823, (4, 0): 0.010744306187005066, (6, 2): 0.04940537856228138

In [None]:
# Print the five highest-scoring (user, other_user) pairs, removing each
# winner from a working copy so the next pass finds the runner-up.
get_top_five_dict = dict(dict_cos_sim)

remaining = 5
while remaining > 0:
    top = max(get_top_five_dict, key=lambda pair: get_top_five_dict[pair])
    print(top, "similarity_score ", get_top_five_dict.pop(top))
    remaining -= 1

(35, 62) similarity_score  0.4714045207910316
(62, 35) similarity_score  0.4714045207910316
(14, 35) similarity_score  0.46358632497276525
(35, 14) similarity_score  0.4635863249727652
(79, 94) similarity_score  0.34891253432906383


In [None]:
#  3. If the top-5 user set has a user appearing more than once, 
# ignore that pair and take the next best pair from the sorted list. 

# Work on a copy so dict_cos_sim itself stays intact.
get_utop_five_dict = dict_cos_sim.copy()
top_user = []  # every user index already used by a printed pair
outcome = 0    # number of pairs printed so far

# Stop early when the candidates run out: max() on an empty dict raises ValueError.
while outcome < 5 and get_utop_five_dict:
    top = max(get_utop_five_dict, key=get_utop_five_dict.get)
    # Every candidate is consumed exactly once, whether it is accepted or not.
    score = get_utop_five_dict.pop(top)
    # Skip the pair if either of its users already appears in a printed pair
    # (boolean `or` instead of the bitwise `&` on two membership tests).
    if top[0] in top_user or top[1] in top_user:
        continue
    top_user.extend(top)
    print(top, "similarity_score ", score)
    outcome += 1

(35, 62) similarity_score  0.4714045207910316
(79, 94) similarity_score  0.34891253432906383
(73, 52) similarity_score  0.2802828078896166
(13, 14) similarity_score  0.2689621576467759
(69, 75) similarity_score  0.24082334178648618


In [None]:
# 4. For a given user_id, identify the top-5 similar users and hence song recommendations from other user's list.
# Maps a similarity-matrix index back to a user id; assumes collect() returns
# the users in the same order as when all_cosine_sim was built.
hundred_user_list = interested_user_grouped.keys().collect()
# print(hundred_user_list)

for other_index, _score in most_similar_users_to(2)[:5]:
    user_name = hundred_user_list[other_index]
    songs_of_user = interested_user_grouped.filter(lambda rec: rec[0] == user_name).values()
    # Only the song id of the similar user's first [song_id, rating] entry is kept.
    song_recs = songs_of_user.map(lambda entries: entries[0][0]).collect()
    print(song_recs)

['20811']
['25150']
['25150']
['333259']
['25150']


In [None]:
# 4. For a given user_id, identify the top-5 similar users and hence song recommendations from other user's list.
# Maps a similarity-matrix index back to a user id; assumes collect() returns
# the users in the same order as when all_cosine_sim was built.
hundred_user_list = interested_user_grouped.keys().collect()
# print(hundred_user_list)

for other_index, _score in most_similar_users_to(0)[:5]:
    user_name = hundred_user_list[other_index]
    print(user_name)
    # Flatten the similar user's list into individual [song_id, rating] entries.
    entries_of_user = interested_user_grouped.filter(lambda rec: rec[0] == user_name)
    song_recs_from_other = entries_of_user.flatMap(lambda rec: rec[1]).collect()
    # Print only the song ids of that user's entries.
    print([entry[0] for entry in song_recs_from_other])


fd50c4007b68a3737fe052d5a4f78ce8aa117f3d
['25150', '68212', '87433', '123630', '58821', '351764']
18ce1da0e1017e31baaa5f80afa64ee3c7fab379
['236518', '307202', '161257', '171752', '301621', '50196', '288653', '283134']
2baf0a17a2f805360da628f538bd3b451e73f048
['202995', '368737', '184517', '295513', '87433', '302975', '28629', '281075', '227902', '320114']
b6148aeab635574708f4d13d7788b1b34ff98638
['16220', '108132', '272814', '348033', '354193', '111713', '332572', '87433', '307268', '363167', '323952', '21512', '111830', '334505', '239277', '240551', '345109', '105714', '160403', '176240', '180903', '150084', '19682', '330158', '113067', '321786', '82718', '131893', '308012', '341674', '177419', '131776', '69034', '65502', '172126', '88391', '178499', '316723', '270100']
bdbf8ddd82fa83ef4538a15298dfca19bfc4a3ca
['144605', '292298', '205995', '238809', '314350', '328474', '123630', '91177', '288653', '339795', '259912', '167661', '115196', '277613', '309748', '350973', '191533', '36223