In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.spatial import distance_matrix
import gensim.downloader
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
import pickle
from  tqdm import tqdm

In [None]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

## знакомство с данными 
#### The format is the following (user_id, song_id, play_count) triplets,

In [None]:
triplets = pd.read_csv('data/P02. MySpotify/train_triplets.txt', sep='\t',\
header = None, names = ['user_id', 'song_id', 'play_count'])
# triplets = reduce_mem_usage(triplets)
triplets.head()

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


#### This file stored in a sparse format. It contains track_id, mxm_track_id, then word count for each of the top words, comma-separated. 

``` TRAINING SET
 MusiXmatch dataset, the official lyrics dataset
 of the Million Song Dataset
    file created on Tue Mar 29 04:28:44 2011
    contact: T. Bertin-Mahieux (Columbia University)
             tb2332@columbia.edu
    also: http://labrosa.ee.columbia.edu/millionsong/musixmatch
          http://www.musixmatch.com
 FORMAT:
     #   - comment, to ignore
     %   - list of top words, comma-separated
         - normal line, contains track_id, mxm track id,
           then word count for each of the top words, comma-separated
           word count is in sparse format -> ...,<word idx>:<cnt>,...
           <word idx> starts at 1 (not zero!)
 All our work is done using UTF-8 encoding.
 enjoy!
 ```

In [None]:
mxm_dataset = pd.read_csv('data/P02. MySpotify/mxm_dataset_train.txt', sep=',')
# mxm_dataset = reduce_mem_usage(mxm_dataset)
mxm_dataset

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,TRVNYYP128F149293F,2260161,1:303,2:170,3:192,4:117,5:148,6:184,7:64,8:147,...,4526:2,4582:1,4746:1,4756:2,4769:2,4773:1,4836:1,4849:2,4878:2,4952:2
0,TRAAAHZ128E0799171,1619153,1:39,2:30,3:10,4:10,5:28,6:21,7:1,8:20,...,,,,,,,,,,
1,TRAAAED128E0783FAB,2516445,1:28,2:15,3:2,4:12,5:22,6:2,7:2,8:4,...,,,,,,,,,,
2,TRAAAAV128F421A322,4623710,1:6,2:4,3:2,4:2,5:5,6:3,7:1,8:1,...,,,,,,,,,,
3,TRAAABD128F429CF47,6477168,1:10,3:17,4:8,5:2,6:2,7:1,8:3,9:2,...,,,,,,,,,,
4,TRAAAEF128F4273421,3759847,1:5,2:4,3:3,4:2,5:1,6:11,9:4,12:9,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210513,TRZZZWS128F429CF87,3080645,6:1,24:9,38:7,42:1,77:4,90:1,112:5,114:4,...,,,,,,,,,,
210514,TRZZZXA128F428ED56,2344272,1:1,2:13,3:6,4:5,5:4,6:6,9:4,10:4,...,,,,,,,,,,
210515,TRZZZXV128F4289747,1417347,1:13,2:3,3:17,4:5,5:9,7:8,8:2,9:1,...,,,,,,,,,,
210516,TRZZZYV128F92E996D,6849828,1:10,2:6,3:20,5:2,7:30,8:1,9:6,10:3,...,,,,,,,,,,


#### Tagtraum genre annotations for the Million Song Dataset.


```
############################################################################################
#    Song-level genre labels for MSD songs based on majority vote (or consensus for n=2).
#    Format: '#'  = comment
#            '\t' = field separator
#            '\n' = line separator
#    Fields: trackId, majority-genre, minority-genre?
############################################################################################ 
```

In [None]:
tagtraum = pd.read_csv('data/P02. MySpotify/p02_msd_tagtraum_cd2.cls',\
sep='\t', names=['track_id', 'majority_genre', 'minority_genre'], header=None)
tagtraum

Unnamed: 0,track_id,majority_genre,minority_genre
0,TRAAAAK128F9318786,Rock,
1,TRAAAAW128F429D538,Rap,
2,TRAAABD128F429CF47,Rock,RnB
3,TRAAADJ128F4287B47,Rock,
4,TRAAADZ128F9348C2E,Latin,
...,...,...,...
280826,TRZZZRJ128F42819AF,Rock,
280827,TRZZZUK128F92E3C60,Folk,
280828,TRZZZYV128F92E996D,New Age,RnB
280829,TRZZZZD128F4236844,Rock,


#### The mapping between track ids and song ids.

In [None]:
unique_tracks = pd.read_csv('data/P02. MySpotify/p02_unique_tracks.txt', sep='<SEP>',\
header=None, names = ['track_id', 'song_id', 'artist', 'title'])
unique_tracks

  return func(*args, **kwargs)


Unnamed: 0,track_id,song_id,artist,title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens
...,...,...,...,...
999995,TRYYYUS12903CD2DF0,SOTXAME12AB018F136,Kiko Navarro,O Samba Da Vida
999996,TRYYYJO128F426DA37,SOXQYIQ12A8C137FBB,Kuldeep Manak,Jago Chhadeo
999997,TRYYYMG128F4260ECA,SOHODZI12A8C137BB3,Gabriel Le Mar,Novemba
999998,TRYYYDJ128F9310A21,SOLXGOR12A81C21EB7,Elude,Faraday


## Implementation

### Top-250 tracks
It should return a dataframe with the following fields: index number, artist name, track
title, play count. The table should be sorted by the play count descendingly.

одному song_id может принадлежать несколько track_id,но по сутии  разные track_id  ссылаются на одого и того же исполнителя

In [None]:
pop_song = triplets[['song_id', 'play_count']].groupby('song_id')['play_count'].sum()
pop_song = pop_song.reset_index()
sor_pop_song = pop_song.sort_values(by = 'play_count', ascending = False)
top_250 = sor_pop_song.iloc[:250]
top_250.head()

Unnamed: 0,song_id,play_count
25043,SOBONKR12A58A7A7E0,726885
12936,SOAUWYT12A81C206F1,648239
287415,SOSXLTC12AF72A7F54,527893
90798,SOFRQTD12A81C233C0,425463
67917,SOEGIYH12A6D4FC0E3,389880


In [None]:
pop_song_unique_tracks = pd.merge(pop_song, unique_tracks, how='left', on = 'song_id')
pop_song_unique_tracks = pop_song_unique_tracks.sort_values(by = 'play_count', ascending = False)
play_count_track_id = pop_song_unique_tracks[~pop_song_unique_tracks['song_id'].duplicated()]
play_count_track_id = play_count_track_id.rename(columns={'artist':'artist name', 'title':'track title'})

In [None]:
top_250_unique_tracks = play_count_track_id.iloc[:250]
top_250_tracks = top_250_unique_tracks[['artist name', 'track title', 'play_count']]\
.sort_values('play_count', ascending = False).reset_index().drop('index', axis=1)
top_250_tracks.head()

Unnamed: 0,artist name,track title,play_count
0,Dwight Yoakam,You're The One,726885.0
1,Björk,Undo,648239.0
2,Kings Of Leon,Revelry,527893.0
3,Harmonia,Sehr kosmisch,425463.0
4,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,389880.0


In [None]:
top_250_tracks.tail()

Unnamed: 0,artist name,track title,play_count
245,Triple Six Mafia,Now I'm High_ Really High,35253.0
246,The Red Jumpsuit Apparatus,Face Down (Album Version),35245.0
247,Linkin Park,New Divide (Album Version),35191.0
248,Selena Gomez & The Scene,Naturally,35074.0
249,Creedence Clearwater Revival,Have You Ever Seen The Rain,34831.0


### Top-100 tracks by genre
It should return on a given genre a dataframe with the following fields: index number,
artist name, track title, play count. The table should be sorted by the play count descendingly.
You should only use the major genre to perform the subtask.

In [None]:
play_count_track_id_genre = pd.merge(play_count_track_id, tagtraum, on='track_id', how = 'left')
play_count_track_id_genre.head()

Unnamed: 0,song_id,play_count,track_id,artist name,track title,majority_genre,minority_genre
0,SOBONKR12A58A7A7E0,726885,TRAEHHJ12903CF492F,Dwight Yoakam,You're The One,Country,
1,SOAUWYT12A81C206F1,648239,TRGXQES128F42BA5EB,Björk,Undo,Rock,Electronic
2,SOSXLTC12AF72A7F54,527893,TRONYHY128F92C9D11,Kings Of Leon,Revelry,Rock,
3,SOFRQTD12A81C233C0,425463,TRDMBIJ128F4290431,Harmonia,Sehr kosmisch,Rock,Metal
4,SOEGIYH12A6D4FC0E3,389880,TRLGMFJ128F4217DBE,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,,


In [None]:
# play_count_track_id_genre = play_count_track_id_genre[~play_count_track_id_genre['majority_genre'].isna()].drop_duplicates()

In [None]:
# play_count_track_id_genre = play_count_track_id_genre.drop_duplicates()

In [None]:
# res = play_count_track_id_genre.groupby(['artist name', 'track title']).agg({'play_count':'sum', 'majority_genre': 'last'}).reset_index()

In [None]:
genre = 'Rock'
datas = play_count_track_id_genre
if genre not in datas['majority_genre'].unique():
    raise ValueError('нет такого названия')
one_genre = datas[datas['majority_genre'] == genre]\
[['artist name', 'track title', 'play_count']]
top_100_genre = one_genre.sort_values('play_count', ascending = False).iloc[:100]
top_100_genre = top_100_genre.reset_index().drop('index', axis=1)
top_100_genre.head()

Unnamed: 0,artist name,track title,play_count
0,Björk,Undo,648239
1,Kings Of Leon,Revelry,527893
2,Harmonia,Sehr kosmisch,425463
3,OneRepublic,Secrets,292642
4,Tub Ring,Invalid,268353


In [None]:
top_100_genre.tail()

Unnamed: 0,artist name,track title,play_count
95,Train,Drops Of Jupiter,26547
96,Skillet,Hero (Album Version),26542
97,Queen,Love Of My Life (1993 Digital Remaster),26393
98,Blind Melon,No Rain,26306
99,Soda Stereo,Observándonos (Satélites),26291


### Collections

 Content-based approach. 50 songs about love, 50 songs about war, 50 songs about happiness, 50
songs about loneliness, 50 songs about money.


It should return on a given keyword (love, war, happiness) a dataframe (50 tracks) with
the following fields: index number, artist name, track title, play count. The table should be
sorted by the play count descendingly. Try different approaches to these recommendations:


• baseline - when you look for the keyword and the number of its occurrences in a
song, filter using some threshold and then sorting it by the play count,

In [1]:
keyword = 'war'
keyword = {'happiness': 'happi', 'loneliness': 'lone'}.get(keyword, keyword)
keywords = ['love', 'war', 'happi', 'lone', 'money']
if keyword not in keywords:
    raise ValueError('нет такого ключевого слова')

In [None]:
# with open('data/P02. MySpotify/mxm_dataset_train_header.txt') as file:
#     keyword_number = {}
#     file = list(file)[0].split(',')
#     for en, word in enumerate(file):
#         if word in keywords:
#             keyword_number[word] = en + 1
keyword_number = {'love': 27, 'happi': 355, 'war': 361, 'lone': 367, 'money': 409}

In [None]:
data = mxm_dataset.values

In [None]:
num = str(keyword_number[keyword])
ans = []
for line in data:
    for i in line[2:]:
        if not isinstance(i, str):
            break
        mas = i.split(':')
        if mas[0] == num:
            ans.append([line[0], int(mas[1])])
            break

In [None]:
def get_top_50_keyword(ans):
    ans_filt = sorted(ans, key = lambda x: x[1] ,reverse=True)[:100]
    track_keyword = pd.DataFrame(ans_filt, columns=['track_id', '-'])['track_id']
    all_tabell_keyword = pd.merge(track_keyword, play_count_track_id_genre, how = 'inner', on = 'track_id')
    
    top_50_keyword = all_tabell_keyword.sort_values('play_count', ascending = False).iloc[:50]
    top_50_keyword = top_50_keyword.reset_index().drop('index', axis=1)[['artist name', 'track title', 'play_count']]
    return top_50_keyword

In [None]:
get_top_50_keyword(ans)

Unnamed: 0,artist name,track title,play_count
0,Guns N' Roses,Civil War,22893.0
1,Kanye West,The Glory,3934.0
2,Cobra Starship,The City Is At War (Album Version),3480.0
3,Iration,Love/Hate,2370.0
4,Rage Against The Machine,Township Rebellion,1644.0
5,Cat Power,He War,1369.0
6,Stars,In Our Bedroom After The War,1172.0
7,Story Of The Year,Welcome To Our New War,664.0
8,Iron Maiden,The Aftermath,578.0
9,Burzum,War,564.0


• word2vec - when you look not only for the keyword but for several similar tokens
as well using word2vec,

In [None]:
with open('data/P02. MySpotify/mxm_dataset_train_header.txt') as file:
    file = list(file)[0].split(',')
    word_number = {}
    for en, word in enumerate(file):
        word_number[str(en + 1)] = word

In [None]:
corpus = gensim.downloader.load('text8')
model = Word2Vec(corpus)
keyword_vec = model.wv[keyword]

In [None]:
all_mas = []
for line in data:
    mas = [line[0]]
    for i in line[2:]:
        if not isinstance(i, str):
            break
        ints = i.split(':')[0]
        try:
            res = model.wv[word_number[ints]]
        except:
            res = 0
        mas.append(res)
    all_mas.append(mas)

In [None]:
sum_cos = [[line[0], sum(map(lambda x: 1 - cosine(x, keyword_vec), line[1:]))] for line in all_mas]

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [None]:
# with open('./data/P02. MySpotify/w2v_collect.pickle', 'wb') as f:
#     pickle.dump(all_mas, f)

In [None]:
get_top_50_keyword(sum_cos).head()

Unnamed: 0,artist name,track title,play_count
0,Black Eyed Peas,Let's Get It Started,20511
1,Tracy Chapman,Fast Car (LP Version),16475
2,Big Drill Car,The Shake,5184
3,Man Man,The Ballad Of Butter Beans,3276
4,The Kooks,One Last Time,2848


### People similar to you listen 
Collaborative filtering approach.

For these recommendations, you need to use the train/test split approach. In this case,
the best practice is to cut a sub-matrix from the user-item matrix for the test dataset
and the other parts to use for the train.

To assess your recommendations use the metric p@k (precision at k). It shows the
percentage of the correct recommendations from your list. It means, that if you gave
a user 10 tracks to listen and if they liked 3 of them (they really listen to them in the
test dataset), then the p@k will be equal to 30%. Calculate the average p@k for your
recommendations. It should be at least greater than 10%.

The script should return 10 recommendations for a given user in a dataframe: index
number, artist name, track title. The table should be sorted descendingly by the “likelihood”
that any given user will “like” the track.


In [None]:
triplets

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
48373581,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2
48373582,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1
48373583,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1
48373584,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3


In [None]:
count_song_us= triplets.groupby('song_id')['user_id'].count()

In [None]:
# plt.figure(figsize=(15,10))
topsong = count_song_us[(count_song_us > 10) & (count_song_us < 200)]

In [None]:
toptriplets  = triplets[triplets['song_id'].isin(topsong.index)]
song_id = toptriplets['song_id'].astype('category').cat
toptriplets['song_id_num'] = song_id.codes
toptriplets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toptriplets['song_id_num'] = toptriplets['song_id'].astype('category').cat.codes


Unnamed: 0,user_id,song_id,play_count,song_id_num
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBNZDC12A6D4FC103,1,10822
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBVFZR12A6D4F8AE3,1,12869
16,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEGVZY12A58A7857E,1,30099
17,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEKWEA12A6D4F5DC3,1,31202
18,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEOBYG12A6D4F8AE2,1,32120


In [None]:
len(toptriplets) / len(triplets)

0.17532719199275407

In [None]:
uniq_user_all = toptriplets['user_id'].unique()
train_part = int(uniq_user_all.shape[0] * 0.7)
train_part_us = uniq_user_all[train_part]
index_train = toptriplets[toptriplets['user_id'] == train_part_us].index[0]
triplets_train, triplets_test = toptriplets.loc[:index_train], toptriplets.loc[index_train:]

In [None]:
uniq_song = song_id.categories
uniq_user = triplets_train['user_id'].unique()
max_size = uniq_song.shape[0]
triplets_val = triplets_train.values

In [None]:
max(triplets_val[:, 3])

171386

In [None]:
len(uniq_song)

171387

In [None]:
mas_index = []
start_in = 0
for trip_ind in np.arange(len(triplets_val)):
    if triplets_val[start_in][0] != triplets_val[trip_ind][0]:
        mas_index.append([start_in, trip_ind])
        start_in = trip_ind

In [None]:
%%time 
mas = []
for st_in, fn_in in tqdm(mas_index):
    zer = np.zeros(max_size, dtype = np.int8)
    user_song_id = triplets_val[st_in:fn_in, 3]
    zer[user_song_id.tolist()] = np.int8(1)
    mas.append(zer)
# mas = np.array(mas)

100%|██████████| 589061/589061 [00:15<00:00, 39115.15it/s]

CPU times: user 8.51 s, sys: 6.22 s, total: 14.7 s
Wall time: 15.1 s





In [None]:
triplets_train.loc[triplets_train['user_id'] == 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'].iloc[0]

user_id        b80344d063b5ccb3212f76538f3d9e43d87dca9e
song_id                              SOBNZDC12A6D4FC103
play_count                                            1
song_id_num                                       10822
Name: 5, dtype: object

In [None]:
uniq_song[10822]

'SOXSAPY12AB0183E9B'

In [None]:
triplets_train.loc[triplets_train['user_id'] == 'b80344d063b5ccb3212f76538f3d9e43d87dca9e', 'song_id_num'].values

array([ 10822,  12869,  30099,  31202,  32120,  40600,  42100,  53583,
        53760,  56325,  60556,  74489,  96098, 100981, 103114, 110320,
       116242, 117777, 119850, 128711, 129867, 138476, 140741, 157431,
       166709, 171234], dtype=int32)

In [None]:
val = toptriplets['song_id'].astype('category').cat.categories

In [None]:
val.cat.categories

Index(['SOAAADD12AB018A9DD', 'SOAAAFI12A6D4F9C66', 'SOAAAGK12AB0189572',
       'SOAAAMT12AB018C9C4', 'SOAAANN12A8C14425E', 'SOAAAQN12AB01856D3',
       'SOAABCT12AB0185A57', 'SOAABLG12A6D4F73D2', 'SOAABMP12A6D4F7633',
       'SOAABNE12A8C141154',
       ...
       'SOZZYLW12A8C13C168', 'SOZZYMH12AB0180A51', 'SOZZYRT12A8AE45DDA',
       'SOZZYUB12AB0184FA5', 'SOZZZBY12A8C139058', 'SOZZZFB12A8AE45CDC',
       'SOZZZFW12AB0187AD1', 'SOZZZON12A8C139ED5', 'SOZZZRV12A8C1361F1',
       'SOZZZWN12AF72A1E29'],
      dtype='object', length=171387)

In [None]:
[en for en, i in enumerate(mas[0])  if i == 1 ]

[10822,
 12869,
 30099,
 31202,
 32120,
 40600,
 42100,
 53583,
 53760,
 56325,
 60556,
 74489,
 96098,
 100981,
 103114,
 110320,
 116242,
 117777,
 119850,
 128711,
 129867,
 138476,
 140741,
 157431,
 166709,
 171234]

In [None]:
sum(mas[0]), len(mas), len(mas[0])

(26, 589061, 171387)

In [None]:
unique_users_test = triplets_test['user_id'].unique()

def create_vec()

for user in unique_users_test:
    data_user = triplets_test.loc[triplets_test['user_id'] == 'user']
    count_song_user = len(data_user) // 2
    train_song = data_user.loc[:count_song_user]
    test_song = data_user.loc[count_song_user:]
    
    

NameError: name 'triplets_test' is not defined

In [None]:
# with open("peop_sim.txt", "w") as file:
#     print(mas, file=file)
    
### ядро кибнет на шаге dump
# with open('peop_sim.pickle', 'wb') as f:
#     pickle.dump(mas, f)


# with open('peop_sim.pickle', 'rb') as f:
#     mas = pickle.load(f)




## многопоточность 

In [None]:
# import multiprocessing
# from find_sim import find_sim

https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3

In [None]:
# %%time

# count_proc = 14
# client_number = 4
# linspace = np.linspace(0, len(mas), count_proc + 1).astype('int')
# slices_mas = [(mas[client_number], mas[linspace[i - 1]: linspace[i]]) for i in range(1, len(linspace))]

# pool = multiprocessing.Pool(processes=count_proc)
# res = pool.map(find_sim, slices_mas)

In [None]:
# slices_mas[1]

In [None]:
%%time
client_number = 4
sim_user1 = np.array([cosine(mas[client_number], vector) for vector in mas[:196353]])
sim_user2 = np.array([cosine(mas[client_number], vector) for vector in mas[196353:392706]])
sim_user3 = np.array([cosine(mas[client_number], vector) for vector in mas[392706:]])

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [None]:
sort_ind_1 = np.argsort(sim_user1)[:30]
sort_ind_2 = np.argsort(sim_user2)[:30]
sort_ind_3 = np.argsort(sim_user3)[:30] 

In [None]:
all_index = np.concatenate([sort_ind_1, sort_ind_2 + 196353, sort_ind_3 + 392706])
all_value = np.concatenate([sim_user1[sort_ind_1], sim_user2[sort_ind_2], sim_user3[sort_ind_3]])

In [None]:
sort_ind_all = np.argsort(all_value)[:50]
index_to_mas = all_index[sort_ind_all]
index_to_mas

array([     4,  11631, 468011, 323889, 359807, 506692, 478905, 420514,
       148981, 316007, 471650, 334104, 219695,  37954, 191367, 366335,
       475638, 439452, 400203, 249334, 315136, 348557, 242192, 193527,
       478556,  92048, 263807, 354657, 321269,  51700, 105388, 588347,
       555874, 472175, 397593, 392240, 363743, 260575, 584189, 565134,
       199672, 199117, 191703,  36082,  25391, 151054, 436076, 139367,
       298148, 314480])

In [None]:
# cosine(mas[4], mas[500]) 

In [None]:
ans = np.array([])
for i in index_to_mas:
    ans = np.append(ans, uniq_song[mas[i] - mas[4] == 1])
pred_user = ans[:10]

In [None]:
len(mas)

589061

In [None]:
len(uniq_song[mas[0]]), len(v1)

(171387, 26)

In [None]:
len(mas[0]), len(uniq_song)

(171387, 171387)

In [None]:
v1 = triplets_train.loc[triplets_train['user_id'] == 'b80344d063b5ccb3212f76538f3d9e43d87dca9e', 'song_id'].values
v1

array(['SOBNZDC12A6D4FC103', 'SOBVFZR12A6D4F8AE3', 'SOEGVZY12A58A7857E',
       'SOEKWEA12A6D4F5DC3', 'SOEOBYG12A6D4F8AE2', 'SOFTKSZ12A6D4F5DC5',
       'SOFZFQU12A8C13CAB8', 'SOHQIAG12A8C136F64', 'SOHQZCA12A6D4FB317',
       'SOIAOBY12A8C13BF75', 'SOIQOQT12A8C136F96', 'SOKSIKA12A6D4F5DC7',
       'SONYTAN12A8C13BF88', 'SOOSIVQ12A6D4F8AE0', 'SOPBCSY12A6D4F5DC4',
       'SOQEMEN12A8C13BF8B', 'SORDDVI12A8C136F53', 'SORJNVW12A8C13BF90',
       'SORSAJY12A6D4F7457', 'SOTCPHF12A8C13BF9B', 'SOTHMIK12A8C136FA1',
       'SOUQUBU12AF72A47B3', 'SOVAJXX12A8AE47D5C', 'SOXSPON12A6D4F5DC2',
       'SOZGCUB12A8C133997', 'SOZZHXI12A8C13BF7D'], dtype=object)

In [None]:
v2 = uniq_song[mas[0] == 1]
v2

Index(['SOBNZDC12A6D4FC103', 'SOBVFZR12A6D4F8AE3', 'SOEGVZY12A58A7857E',
       'SOEKWEA12A6D4F5DC3', 'SOEOBYG12A6D4F8AE2', 'SOFTKSZ12A6D4F5DC5',
       'SOFZFQU12A8C13CAB8', 'SOHQIAG12A8C136F64', 'SOHQZCA12A6D4FB317',
       'SOIAOBY12A8C13BF75', 'SOIQOQT12A8C136F96', 'SOKSIKA12A6D4F5DC7',
       'SONYTAN12A8C13BF88', 'SOOSIVQ12A6D4F8AE0', 'SOPBCSY12A6D4F5DC4',
       'SOQEMEN12A8C13BF8B', 'SORDDVI12A8C136F53', 'SORJNVW12A8C13BF90',
       'SORSAJY12A6D4F7457', 'SOTCPHF12A8C13BF9B', 'SOTHMIK12A8C136FA1',
       'SOUQUBU12AF72A47B3', 'SOVAJXX12A8AE47D5C', 'SOXSPON12A6D4F5DC2',
       'SOZGCUB12A8C133997', 'SOZZHXI12A8C13BF7D'],
      dtype='object')

In [None]:
len(v1), len(v2)

(26, 26)

In [None]:
all(v2 == v1)

True

In [None]:
# for i in pred_user:
#     if i in v1:
#         print('ok')

### People who listen to this track usually listen
The same things applied to these recommendations: use train/test split, use p@k. If you
gave a user 10 tracks to listen and if they liked 3 of them (they really listen to them in
the test dataset), then the p@k will be equal to 30%. Calculate the average p@k for your
recommendations. It should be at least greater than 10%.

The script should return 10 recommendations for a given track in a dataframe: index
number, artist name, track title. The table should be sorted descendingly by the “likelihood”
that any given user will “like” the track.

In [None]:
# with open('data/P02. MySpotify/mxm_dataset_train_header.txt', 'r') as file:
#     for i in file:
#         print(len(i.split(',')))

In [None]:
data = mxm_dataset.values[:, 2:]
max_count_word = 5000

In [None]:
%%time 
mas = []
for track in tqdm(data):
    zer = np.zeros(max_count_word)
    for word in track:
        if not isinstance(word, str):
            break
        word_index, word_count = word.split(':')
        zer[int(word_index) - 1] = int(word_count)
    mas.append(zer)
mas = np.array(mas)

100%|██████████| 210518/210518 [00:19<00:00, 10627.90it/s]


CPU times: user 20.7 s, sys: 10.9 s, total: 31.6 s
Wall time: 35.3 s


In [None]:
%%time 
track_number =4
sim_track = np.array([cosine(mas[track_number], vector) for vector in mas])

CPU times: user 9.92 s, sys: 280 ms, total: 10.2 s
Wall time: 10.2 s


In [None]:
sort_ind = np.argsort(sim_track)[:20]
sim_track[sort_ind]

array([0.        , 0.38534386, 0.38647683, 0.38985974, 0.39396279,
       0.39456162, 0.39456162, 0.39633599, 0.40354909, 0.40363339,
       0.40378542, 0.40493178, 0.40937037, 0.41006704, 0.41010269,
       0.41593658, 0.41894095, 0.42228776, 0.42401001, 0.42642596])

In [None]:
mxm_dataset.iloc[sort_ind]['TRVNYYP128F149293F']

4         TRAAAEF128F4273421
205327    TRZIUDQ128F1464C15
165488    TRUJTJR128F425F86C
186626    TRXANPN128F92E51FC
8750      TRBBCOH128F429C165
187904    TRXERRS128F42969E4
202302    TRYZAOO128F428C0A6
80728     TRJYOBL12903CB4A95
152606    TRSTQKV128F42665CA
126517    TRPOLHB128F1480A03
28161     TRDLARA128F9312AAD
182248    TRWLZMH128F92C22A0
103230    TRMSDAZ128F42B9B1C
13538     TRBQNEQ128F92CF2BE
75048     TRJGUAN128F4293F4A
197461    TRYJIBX128F4288E73
75847     TRJJKJO128F423DC9C
192564    TRXTVKN128F92DF5DA
36427     TRELVYH128F4278D29
195030    TRYBSTW128F9337E37
Name: TRVNYYP128F149293F, dtype: object