In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

import scipy.sparse as sparse

from implicit.als import AlternatingLeastSquares

## Рекомендации музыки пользователю на основе предпочтений ##

Датасет: `lastfm-dataset-360K`

В качестве входных данных используется набор любимых исполнителей пользователя:

In [2]:
# Favourite artists of the user we want recommendations for.
# Names are lowercase strings; they are later looked up in the dataset by exact match.
user_fav_artists = [
    'metallica',
    'def leppard',
    'cinderella',
    'tesla',
    'axel rudi pell',
    'red hot chili peppers',
    'crazy town',
]

---
Загружаем и смотрим данные

---

In [3]:
# Tab-separated dump with no header row, so pandas labels the columns 0..N-1.
DATASET_PATH = 'usersha1-artmbid-artname-plays.tsv'
raw_data = pd.read_csv(DATASET_PATH, sep='\t', header=None)

In [4]:
# Preview the first rows of the freshly loaded table.
raw_data.head()

Unnamed: 0,0,1,2,3
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


In [5]:
# Remove the second column of the raw file (integer label 1, an id string that
# is not used later). Dropping by label instead of by position keeps the cell
# idempotent: once the column is gone, re-running this cell is a no-op instead
# of removing whichever column has moved into position 1.
raw_data = raw_data.drop(columns=[1], errors='ignore')

In [6]:
# Preview again to confirm that only three columns remain.
raw_data.head()

Unnamed: 0,0,2,3
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [7]:
# Replace the integer column labels with descriptive names.
column_names = ['user', 'artist', 'plays']
raw_data.columns = column_names
raw_data.head()

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [8]:
# (rows, columns) of the loaded table.
raw_data.shape

(17535655, 3)

In [9]:
# Per column: does it contain at least one missing value?
raw_data.isna().any()

user      False
artist     True
plays     False
dtype: bool

In [10]:
# Per column: number of missing values.
raw_data.isna().sum()

user        0
artist    204
plays       0
dtype: int64

In [11]:
# Discard every row that has a missing value in any column.
raw_data = raw_data.dropna()
raw_data.shape

(17535451, 3)

### Преобразуем данные ###

Присвоим каждому пользователю численный идентификатор

Проиндексируем исполнителей (также присвоим каждому численный идентификатор)

In [12]:
# Work on a copy so raw_data keeps only the original string columns.
data = raw_data.copy()
# Encode each distinct user / artist string as an integer category code.
for source_col, id_col in (('user', 'user_id'), ('artist', 'artist_id')):
    data[id_col] = data[source_col].astype('category').cat.codes

In [13]:
# Show the first four rows, now including the numeric user_id / artist_id columns.
data.head(4)

Unnamed: 0,user,artist,plays,user_id,artist_id
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,0,45561
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,0,90933
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,0,185367
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,0,106704


---
Создадим таблицу соответствия для исполнителей и их идентификаторов, а также функции для поиска id по названию и наоборот

---

In [14]:
# Lookup table with one row per distinct (artist_id, artist) pair.
artist_name_id = data.loc[:, ['artist_id', 'artist']].drop_duplicates()
artist_name_id.head(6)

Unnamed: 0,artist_id,artist
0,45561,betty blowtorch
1,90933,die Ärzte
2,185367,melissa etheridge
3,106704,elvenking
4,155241,juliette & the licks
5,220128,red hot chili peppers


In [15]:
def get_artist_by_index(index: int) -> str:
    """Return the artist name stored for a numeric artist id.

    Looks the id up in the module-level ``artist_name_id`` table and returns
    the first matching name.

    Raises:
        IndexError: if no row of ``artist_name_id`` has this ``artist_id``.
    """
    id_matches = artist_name_id.artist_id == index
    return artist_name_id.loc[id_matches, 'artist'].values[0]

def get_index_by_artist(artist: str) -> int:
    """Return the numeric artist id stored for an artist name.

    Looks the name up (exact string match) in the module-level
    ``artist_name_id`` table and returns the first matching id.

    Raises:
        IndexError: if no row of ``artist_name_id`` has this ``artist`` name.
    """
    name_matches = artist_name_id.artist == artist
    return artist_name_id.loc[name_matches, 'artist_id'].values[0]

In [16]:
# Sanity check: for a known artist the two lookups should be inverses of each other.
get_artist_by_index(106704), get_index_by_artist('elvenking')

('elvenking', 106704)

---
Убираем строковые колонки из таблицы data, удаляем строки с нулевым значением plays

---

In [17]:
# Preview `data` before the string columns are removed.
data.head()

Unnamed: 0,user,artist,plays,user_id,artist_id
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,0,45561
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,0,90933
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,0,185367
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,0,106704
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,0,155241


In [18]:
# Keep only the numeric columns. The name -> id mapping lives in artist_name_id.
# errors='ignore' makes the cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because the columns were already gone.
data = data.drop(columns=['user', 'artist'], errors='ignore')
data.head()

Unnamed: 0,plays,user_id,artist_id
0,2137,0,45561
1,1099,0,90933
2,897,0,185367
3,717,0,106704
4,706,0,155241


In [19]:
# Row count before the zero-play rows are filtered out.
data.shape

(17535451, 3)

In [20]:
# Keep only the rows whose play count is non-zero.
has_plays = data['plays'] != 0
data = data[has_plays]

In [21]:
# Row count after the zero-play rows have been removed.
data.shape

(17535450, 3)

### Добавляем информацию о новом пользователе ###

In [22]:
# The new user gets the first id after the largest existing one.
# Series.max() is vectorised, whereas the builtin max() iterates over every
# row in Python. int() turns the numpy scalar into a plain Python int.
new_user_id = int(data.user_id.max()) + 1
new_user_id

358868

In [23]:
# Average play count over all rows, truncated to an integer.
# It is used as the play count for each of the new user's favourite artists.
mean_plays = int(data['plays'].mean())
mean_plays

215

In [24]:
# Build all rows for the new user first and concatenate once.
# DataFrame.append inside a loop copied the whole multi-million-row frame on
# every iteration and no longer exists in pandas >= 2.0.
new_user_rows = []
unknown_artists = []
for artist in user_fav_artists:
    try:
        fav_artist_id = get_index_by_artist(artist)
    except IndexError:
        # get_index_by_artist indexes element 0 of an empty result when the
        # name is not in the dataset; remember the name instead of hiding it.
        unknown_artists.append(artist)
        continue
    new_user_rows.append({
        'plays': mean_plays,
        'user_id': new_user_id,
        'artist_id': fav_artist_id,
    })

if unknown_artists:
    print(f'Artists not found in the dataset (skipped): {unknown_artists}')

# Remove rows left by a previous run of this cell so that re-running it does
# not duplicate the new user's interactions.
data = data.loc[data.user_id != new_user_id]

# Skip the concat when nothing matched: an empty frame would only risk
# changing the column dtypes.
if new_user_rows:
    data = pd.concat(
        [data, pd.DataFrame(new_user_rows, columns=data.columns)],
        ignore_index=True,
    )

In [25]:
# Inspect the last rows to confirm that the new user's entries are at the end of the table.
data.tail()

Unnamed: 0,plays,user_id,artist_id
17535452,215,358868,71063
17535453,215,358868,251132
17535454,215,358868,36208
17535455,215,358868,220128
17535456,215,358868,77337


### Обучаем модель ###

In [26]:
# Integer coordinates of every (artist, user) interaction.
rows = data.artist_id.astype(int)
cols = data.user_id.astype(int)
# Pass the values as an array instead of materialising a Python list with one
# object per row.
plays = data.plays.values

# Size each axis by "largest id + 1" rather than by the number of distinct ids.
# The ids were assigned before the zero-play rows were filtered out, so an id
# may be absent from `data` while a larger one is still present. csr_matrix
# rejects any index that is >= the declared shape.
artists_count = int(rows.max()) + 1
users_count = int(cols.max()) + 1

# Artists x users matrix of play counts.
data_sparse = sparse.csr_matrix((plays, (rows, cols)), shape=(artists_count, users_count))

In [27]:
# Number of latent factors learned per user and per artist.
N_FACTORS = 50

# Fit ALS on the artists x users play-count matrix built above.
model = AlternatingLeastSquares(factors=N_FACTORS)
model.fit(data_sparse)

100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [02:24<00:00, 10.27s/it]


In [28]:
# recommend() is given the transposed (users x artists) matrix in CSR form.
user_items = data_sparse.T.tocsr()
recommendations = model.recommend(new_user_id, user_items, N=50)
recommendations

[(230495, 0.8993146),
 (161714, 0.8860271),
 (129857, 0.8754324),
 (206214, 0.8691298),
 (142586, 0.85427266),
 (9197, 0.8535024),
 (49208, 0.84912527),
 (12031, 0.84273326),
 (129849, 0.84192705),
 (52979, 0.83993864),
 (154684, 0.8379755),
 (18693, 0.8174845),
 (192830, 0.8151537),
 (195274, 0.8042606),
 (185031, 0.80384886),
 (91718, 0.7912171),
 (199842, 0.7860447),
 (274837, 0.7854606),
 (217043, 0.78249913),
 (87333, 0.7701795),
 (168178, 0.7649892),
 (212496, 0.76241773),
 (236762, 0.7611675),
 (252494, 0.7548213),
 (179170, 0.7497447),
 (134089, 0.7490611),
 (110708, 0.7464452),
 (77435, 0.7403783),
 (261945, 0.73673373),
 (247942, 0.73575294),
 (198787, 0.7333194),
 (272564, 0.7311884),
 (218576, 0.7308495),
 (279580, 0.7270806),
 (187011, 0.7245776),
 (199270, 0.72210884),
 (207161, 0.719954),
 (255131, 0.7150874),
 (213715, 0.71320015),
 (180709, 0.7129373),
 (135668, 0.7126741),
 (131339, 0.7101989),
 (237274, 0.70886654),
 (106764, 0.7080313),
 (126493, 0.7072339),
 (16345

---
Выводим результат в читаемом формате

---

In [29]:
# Print one line per recommendation: artist name left-aligned in a
# 30-character column, followed by the score with two decimals.
print('Рекомендации 50 исполнителей для пользователя:\n')
for rec_artist_id, rec_score in recommendations:
    rec_artist_name = get_artist_by_index(rec_artist_id)
    print(f'{rec_artist_name:<30}{rec_score:.2f}')

Рекомендации 50 исполнителей для пользователя:

scorpions                     0.90
kiss                          0.89
guns n' roses                 0.88
ozzy osbourne                 0.87
iron maiden                   0.85
ac/dc                         0.85
black sabbath                 0.85
aerosmith                     0.84
guns n roses                  0.84
bon jovi                      0.84
judas priest                  0.84
alice cooper                  0.82
motörhead                     0.82
mötley crüe                   0.80
megadeth                      0.80
dio                           0.79
nirvana                       0.79
van halen                     0.79
queen                         0.78
deep purple                   0.77
led zeppelin                  0.76
pink floyd                    0.76
skid row                      0.76
the beatles                   0.75
manowar                       0.75
helloween                     0.75
europe                        0.75
creed  