In [44]:
import pandas as pd
import csv
import math

Создание словаря с рейтингами каждого юзера

In [45]:
def get_rating_dict(filename = "lastfm_user_scrobbles.csv"):
    with open(filename, 'r') as f:
        r = csv.reader(f)
        output = dict()
        for i, line in enumerate(r):
            if i == 0:
                continue
            user    = line[0]
            artist = line[1]
            rate    = int(line[2])
            if not user in output:
                output[user] = dict()
            output[user][artist] = rate
    return output

Реализация через pandas (оказалась медленнее)

In [46]:
def get_rating_dict_pd(df):
    output = dict()
    for i, line in df.iterrows():
        user = str(line['user_id'])
        artist = line['artist_id']
        rate    = int(line['scrobbles'])
        if not user in output:
            output[user] = dict()
        output[user][artist] = rate
    return output

Косинусная мера расстояния

In [47]:
def distCosine (vecA, vecB):
    def dotProduct (vecA, vecB):
        d = 0.0
        for dim in vecA:
            if dim in vecB:
                d += vecA[dim]*vecB[dim]
        return d
    return dotProduct (vecA,vecB) / math.sqrt(dotProduct(vecA,vecA)) / math.sqrt(dotProduct(vecB,vecB))

Поиск схожих артистов

In [48]:
def get_similar_artists(user_id, user_ratings):
    # Find like-minded users
    user_matches = [(u, distCosine(user_ratings[user_id], user_ratings[u])) for u in user_ratings if u != user_id]
    best_user_matches = sorted(user_matches, key=lambda x: x[1], reverse=True)[:1000]

    # normalization coef
    total_similarity = sum([x[1] for x in best_user_matches])
    
    # filtering closer users
    best_user_matches = dict([x for x in best_user_matches if x[1] > 0.25])

    # find top artists
    artist_similarity = dict()
    for related_user in best_user_matches:
        for artist in user_ratings[related_user]:
            if not artist in user_ratings[user_id]:
                if not artist in artist_similarity:
                    artist_similarity[artist] = 0.0
                artist_similarity[artist] += user_ratings[related_user][artist] * best_user_matches[related_user]
    
    # normalization
    for artist in artist_similarity:
        artist_similarity[artist] /= total_similarity
    
    # sort to get most relevant
    top_artists = sorted(artist_similarity.items(), key=lambda x: x[1], reverse=True)[:5]
    return [x[0] for x in top_artists]

In [49]:
df = pd.read_csv('lastfm_user_scrobbles.csv')
names_df = pd.read_csv('lastfm_artist_list.csv')

In [50]:
user_rating = get_rating_dict()
user_rating

{'1': {'4562': 13883,
  '10191': 11690,
  '494': 11351,
  '6673': 10300,
  '8402': 8983,
  '3560': 6152,
  '15398': 5955,
  '6036': 4616,
  '10624': 4337,
  '9618': 4147,
  '14062': 3923,
  '11731': 3782,
  '4950': 3735,
  '12486': 3644,
  '3272': 3579,
  '5206': 3312,
  '9226': 3301,
  '6854': 2927,
  '12591': 2720,
  '10075': 2686,
  '4127': 2654,
  '3992': 2619,
  '2562': 2584,
  '1622': 2547,
  '2606': 2397,
  '4785': 2382,
  '5855': 2120,
  '14270': 2119,
  '5337': 1990,
  '6188': 1972,
  '11707': 1948,
  '9392': 1868,
  '3238': 1792,
  '3513': 1740,
  '13548': 1638,
  '8007': 1594,
  '3828': 1559,
  '6068': 1553,
  '8460': 1519,
  '8309': 1471,
  '10701': 1438,
  '16240': 1411,
  '7335': 1407,
  '10000': 1373,
  '13191': 1363,
  '5436': 1342,
  '4547': 1337,
  '7263': 1332,
  '7044': 1330,
  '263': 1315},
 '2': {'11634': 13176,
  '12897': 662,
  '9660': 493,
  '2799': 431,
  '11600': 403,
  '11637': 354,
  '2503': 269,
  '9261': 236,
  '942': 215,
  '3818': 215,
  '11635': 212,
 

In [51]:
input_artist = 'Britney Spears'
artist_id = (names_df[names_df['artist_name']==input_artist].index + 1).tolist()
fans = df[df['artist_id']== artist_id[0]]
fans.sort_values('scrobbles', ascending=False)
top_fan = fans.iloc[0]

In [52]:
top_artists_id = get_similar_artists(str(top_fan.user_id), user_rating)

In [53]:
artists = []
for index in top_artists_id:
    temp_df = names_df[names_df['artist_id'] == int(index)]
    artists.append(temp_df['artist_name'].iloc[0])
print(artists)

['Ke$Ha', 'Shakira', 'Miley Cyrus', 'Taylor Swift', 'Paramore']
