# Import libraries

In [138]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
import nltk
from nltk.corpus import stopwords
import gensim
import gensim.downloader as api

import os

tqdm.pandas()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Global variables

In [74]:
DATA_DIR = 'data'

UNIQUE_TRACKS_DATA_FILE_NAME = 'p02_unique_tracks.txt'
UNIQUE_TRACKS_DATA_FILE_PATH = os.path.join(DATA_DIR,
                                            UNIQUE_TRACKS_DATA_FILE_NAME)


TRIPLETS_DATA_FILE_NAME = 'train_triplets.txt'
TRIPLETS_DATA_FILE_PATH = os.path.join(DATA_DIR,
                                       TRIPLETS_DATA_FILE_NAME)


MSD_TAGTRAUM_FILE_NAME = 'p02_msd_tagtraum_cd2.cls'
MSD_TAGTRAUM_FILE_PATH = os.path.join(DATA_DIR,
                                      MSD_TAGTRAUM_FILE_NAME)


MXM_FILE_NAME = 'mxm_dataset_train.txt'
MXM_FILE_PATH = os.path.join(DATA_DIR,
                             MXM_FILE_NAME)

# Top-250 tracks

It should return a dataframe with the following fields: index number, artist name, track title, play count. The table should be sorted by
the play count descendingly.

In [3]:
unique_tracks = pd.read_csv(UNIQUE_TRACKS_DATA_FILE_PATH,
                            sep='<SEP>',
                            names=('track_id', 'song_id', 'artist', 'title'),
                            engine='python')
unique_tracks.head()

Unnamed: 0,track_id,song_id,artist,title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


In [4]:
unique_tracks.shape

(1000000, 4)

In [5]:
triplets = pd.read_csv(TRIPLETS_DATA_FILE_PATH,
                       sep='\t',
                       names=('user_id', 'song_id', 'play_count'),
                       engine='python')
triplets.head()

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [6]:
triplets.shape

(48373586, 3)

In [15]:
songs_play_count = triplets.groupby(by=['song_id'])['play_count'].sum()

In [17]:
songs_play_count = songs_play_count.reset_index()
songs_play_count.head()

Unnamed: 0,song_id,play_count
0,SOAAADD12AB018A9DD,24
1,SOAAADE12A6D4F80CC,12
2,SOAAADF12A8C13DF62,9
3,SOAAADZ12A8C1334FB,12
4,SOAAAFI12A6D4F9C66,188


In [18]:
top_250_tracks = songs_play_count.sort_values(by=['play_count'], ascending=False).head(250)

In [19]:
top_250_tracks = top_250_tracks.reset_index(drop=True)
top_250_tracks.head()

Unnamed: 0,song_id,play_count
0,SOBONKR12A58A7A7E0,726885
1,SOAUWYT12A81C206F1,648239
2,SOSXLTC12AF72A7F54,527893
3,SOFRQTD12A81C233C0,425463
4,SOEGIYH12A6D4FC0E3,389880


In [20]:
artists = [unique_tracks[unique_tracks['song_id'] == track[0]]['artist'].values[0]
           for track in tqdm(top_250_tracks.values)]

  0%|          | 0/250 [00:00<?, ?it/s]

In [21]:
artists[:5]

['Dwight Yoakam',
 'Björk',
 'Kings Of Leon',
 'Harmonia',
 'Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner']

In [22]:
titles = [unique_tracks[unique_tracks['song_id'] == track[0]]['title'].values[0]
           for track in tqdm(top_250_tracks.values)]

  0%|          | 0/250 [00:00<?, ?it/s]

In [23]:
titles[:5]

["You're The One",
 'Undo',
 'Revelry',
 'Sehr kosmisch',
 'Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile)']

In [24]:
top_250_tracks['artist'] = artists
top_250_tracks['title'] = titles
top_250_tracks['id'] = [i for i in range(top_250_tracks.shape[0])]

In [25]:
top_250_tracks.head()

Unnamed: 0,song_id,play_count,artist,title,id
0,SOBONKR12A58A7A7E0,726885,Dwight Yoakam,You're The One,0
1,SOAUWYT12A81C206F1,648239,Björk,Undo,1
2,SOSXLTC12AF72A7F54,527893,Kings Of Leon,Revelry,2
3,SOFRQTD12A81C233C0,425463,Harmonia,Sehr kosmisch,3
4,SOEGIYH12A6D4FC0E3,389880,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,4


In [27]:
cols_order = ['id', 'artist', 'title', 'play_count']
top_250_tracks = top_250_tracks[cols_order]

In [28]:
top_250_tracks.head()

Unnamed: 0,id,artist,title,play_count
0,0,Dwight Yoakam,You're The One,726885
1,1,Björk,Undo,648239
2,2,Kings Of Leon,Revelry,527893
3,3,Harmonia,Sehr kosmisch,425463
4,4,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,389880


In [29]:
top_250_tracks.to_csv('top_250_tracks.csv', index=False)

# Top-100 tracks by genre

It should return on a given genre a dataframe with the following fields: index number, artist name, track title, play count. The table should be sorted by the play count descendingly. You should only use the major genre to perform the subtask.

In [61]:
genres = pd.read_csv(MSD_TAGTRAUM_FILE_PATH,
                     sep='\t',
                     names=('track_id', 'majority-genre', 'minority-genre'),
                     comment='#')

In [62]:
genres.head()

Unnamed: 0,track_id,majority-genre,minority-genre
0,TRAAAAK128F9318786,Rock,
1,TRAAAAW128F429D538,Rap,
2,TRAAABD128F429CF47,Rock,RnB
3,TRAAADJ128F4287B47,Rock,
4,TRAAADZ128F9348C2E,Latin,


In [63]:
genres_tracks = pd.merge(genres, unique_tracks)

In [64]:
genres_tracks.head()

Unnamed: 0,track_id,majority-genre,minority-genre,song_id,artist,title
0,TRAAAAK128F9318786,Rock,,SOBLFFE12AF72AA5BA,Adelitas Way,Scream
1,TRAAAAW128F429D538,Rap,,SOMZWCG12A8C13C480,Casual,I Didn't Mean To
2,TRAAABD128F429CF47,Rock,RnB,SOCIWDW12A8C13D406,The Box Tops,Soul Deep
3,TRAAADJ128F4287B47,Rock,,SOCSNVI12A8C13ECC2,Big Brother & The Holding Company,Heartache People
4,TRAAADZ128F9348C2E,Latin,,SOXVLOJ12AB0189215,Sonora Santanera,Amor De Cabaret


In [65]:
genres_tracks_play_count = pd.merge(genres_tracks, songs_play_count)

In [66]:
genres_tracks_play_count.head()

Unnamed: 0,track_id,majority-genre,minority-genre,song_id,artist,title,play_count
0,TRAAAAK128F9318786,Rock,,SOBLFFE12AF72AA5BA,Adelitas Way,Scream,515
1,TRAAABD128F429CF47,Rock,RnB,SOCIWDW12A8C13D406,The Box Tops,Soul Deep,72
2,TRAAAED128E0783FAB,Jazz,,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,315
3,TRAAAEM128F93347B9,Electronic,,SOIGICF12A8C141BC5,Son Kite,Game & Watch,14
4,TRAAAFD128F92F423A,Punk,Rock,SOFSOCN12A8C143F5D,Gob,Face the Ashes,70


In [67]:
genres_tracks_play_count = genres_tracks_play_count.drop(['minority-genre', 'track_id',
                                                          'song_id'], axis=1)

In [68]:
genres_tracks_play_count.head()

Unnamed: 0,majority-genre,artist,title,play_count
0,Rock,Adelitas Way,Scream,515
1,Rock,The Box Tops,Soul Deep,72
2,Jazz,Jamie Cullum,It's About Time,315
3,Electronic,Son Kite,Game & Watch,14
4,Punk,Gob,Face the Ashes,70


In [69]:
genres_tracks_play_count.head()

Unnamed: 0,majority-genre,artist,title,play_count
0,Rock,Adelitas Way,Scream,515
1,Rock,The Box Tops,Soul Deep,72
2,Jazz,Jamie Cullum,It's About Time,315
3,Electronic,Son Kite,Game & Watch,14
4,Punk,Gob,Face the Ashes,70


In [70]:
cols_order = ['artist', 'title', 'play_count', 'majority-genre']
genres_tracks_play_count = genres_tracks_play_count[cols_order]

In [71]:
genres_tracks_play_count.head()

Unnamed: 0,artist,title,play_count,majority-genre
0,Adelitas Way,Scream,515,Rock
1,The Box Tops,Soul Deep,72,Rock
2,Jamie Cullum,It's About Time,315,Jazz
3,Son Kite,Game & Watch,14,Electronic
4,Gob,Face the Ashes,70,Punk


In [73]:
genre = input()
top_100_genres = genres_tracks_play_count[genres_tracks_play_count['majority-genre'] == genre]
top_100_genres = top_100_genres.sort_values(by=['play_count'], ascending=False)[:100].reset_index(drop=True)
top_100_genres['id'] = [i for i in range(100)]
cols = ['id', 'artist', 'title', 'play_count']
top_100_genres[cols]

Rock


Unnamed: 0,id,artist,title,play_count
0,0,Björk,Undo,648239
1,1,Kings Of Leon,Revelry,527893
2,2,Harmonia,Sehr kosmisch,425463
3,3,OneRepublic,Secrets,292642
4,4,Tub Ring,Invalid,268353
...,...,...,...,...
95,95,Metric,Gold Guns Girls,28148
96,96,Pearl Jam,Encore Break,27579
97,97,Daughtry,No Surprise,27187
98,98,Eric Clapton,Tears In Heaven,26999


# Collections

It should return on a given keyword (love, war, happiness) a dataframe (50 tracks) with the following fields: index number, artist name, track title, play count. The table should be sorted by the play count descendingly. Try different approaches to these recommendations:

* baseline - when you look for the keyword and the number of its occurrences in a song, filter using some threshold and then sorting it by the play count,

* word2vec - when you look not only for the keyword but for several similar tokens as well using word2vec,

* classification task - you may label your data and try classification algorithms that will predict for the other part of the dataset if a track belongs to a specific class.

Maybe you find some other interesting ideas on how to make those recommendations
better.

In [96]:
lines = []
with open(MXM_FILE_PATH, 'r') as f:
    for line in f:
        if line[0] != '#':
            lines.append(line.rstrip())

In [97]:
bag_of_words = lines[0][1:]
lines.pop(0)
bag_of_words = bag_of_words.split(',')

In [98]:
keywords = ('love', 'war', 'happiness')

In [99]:
'love' in bag_of_words, 'war' in bag_of_words, 'happiness' in bag_of_words

(True, True, False)

In [100]:
bag_of_words.append('happiness')

In [101]:
bag_of_words[-1]

'happiness'

In [118]:
stop_words = set(stopwords.words('english'))

In [189]:
glove_vectors = api.load('glove-twitter-50')



In [267]:
def get_song_text(sparse_song):
    song_text = []
    for freq in sparse_song:
        word_idx, cnt = freq.split(':')
        word_idx, cnt = int(word_idx), int(cnt)
        if bag_of_words[word_idx] not in stop_words:
            for _ in range(cnt):
                song_text.append(bag_of_words[word_idx])
    return ' '.join(song_text)

In [268]:
def get_song_vector(text):
    song_vec = None
    song_text = text.split()
    for word in song_text:
        if glove_vectors.has_index_for(word):
            if song_vec is None:
                song_vec = glove_vectors[word]
            else:
                song_vec = song_vec + glove_vectors[word]
    return song_vec

In [291]:
song_texts = []
songs_vecs = []
tracks_ids = []
cos_simialarities = []
for line in tqdm(lines):
    mxm_data = line.split(',')
    track_id = mxm_data[0]
    tracks_ids.append(track_id)
    sparse_song = mxm_data[2:]
    song_text = get_song_text(sparse_song)
    song_texts.append(song_text)
    song_vec = get_song_vector(song_text)
    songs_vecs.append(song_vec)
    if song_vec is not None:
        cos_simialarities.append(np.dot(keywords_vec, song_vec) / (np.linalg.norm(keywords_vec) * np.linalg.norm(song_vec)))
    else:
        cos_simialarities.append(0)

  0%|          | 0/210519 [00:00<?, ?it/s]

In [292]:
df_vecs = pd.DataFrame(data={
    'track_id': tracks_ids,
    'text': song_texts,
    'vec': songs_vecs,
    'cos_simialarity': cos_simialarities
})

In [293]:
df_vecs.head()

Unnamed: 0,track_id,text,vec,cos_simialarity
0,TRAAAAV128F421A322,love love know oh see see got never tell bette...,"[-2.0894647, 6.0178843, 0.25885606, -2.67679, ...",0.858466
1,TRAAABD128F429CF47,love love love love love love know know like l...,"[-5.171518, 13.092293, -12.755945, 6.236166, -...",0.886198
2,TRAAAED128E0783FAB,love love love know know know know know know k...,"[-7.3665357, 40.399227, -20.012774, 14.979369,...",0.840179
3,TRAAAEF128F4273421,like come got never never never want make hear...,"[-8.592678, 29.81589, -17.426216, 9.31735, -21...",0.771854
4,TRAAAEW128F42930C0,oh would whi whi whi wo still si place sun sun...,"[-11.162049, -2.3622503, 1.5965956, -1.3068831...",0.842946


In [281]:
df_vecs['vec'].iloc[5300]

In [296]:
top_50_collections = df_vecs.sort_values(by=['cos_simialarity'], ascending=False)[:50]
top_50_collections = top_50_collections.reset_index(drop=True)
top_50_collections.head()

Unnamed: 0,track_id,text,vec,cos_simialarity
0,TRWNCVL128F427EDD9,love love love love love love love love love l...,"[-16.550362, 16.045406, -2.1825294, 16.317165,...",0.92927
1,TREKZMW128E0792E42,que come go see got got let let life life life...,"[-13.222571, -2.488165, -5.7816067, 38.276566,...",0.927962
2,TROKOGO128F93145EB,love love never life du watch der der rise sur...,"[-0.65263754, 0.6674489, -1.321255, -0.7814939...",0.924847
3,TRJCENK128F4231FD8,love love love love love love love love yeah w...,"[-2.6857, 8.27684, 0.97549415, 12.149147, -4.7...",0.923518
4,TREPUYD128F145AC92,de life life world live still kill kill morn s...,"[-0.20297599, 2.9049802, -1.5434803, -1.553833...",0.921292


In [313]:
artists = []
titles = []
play_counts = []
for track_id in tqdm(top_50_collections['track_id']):
    track_info = unique_tracks[unique_tracks['track_id'] == track_id]
    song_id = track_info['song_id'].iloc[0]
    artist = track_info['artist'].iloc[0]
    title = track_info['title'].iloc[0]
    play_count = songs_play_count[songs_play_count['song_id'] == song_id]['play_count']
    if play_count.empty:
        play_count= None
    else:
        play_count = play_count.iloc[0]
    artists.append(artist)
    titles.append(title)
    play_counts.append(play_count)

  0%|          | 0/50 [00:00<?, ?it/s]

In [314]:
top_50_collections['artist'] = artists
top_50_collections['title'] = titles
top_50_collections['play_count'] = play_counts

In [315]:
top_50_collections.head()

Unnamed: 0,track_id,text,vec,cos_simialarity,artist,title,play_count
0,TRWNCVL128F427EDD9,love love love love love love love love love l...,"[-16.550362, 16.045406, -2.1825294, 16.317165,...",0.92927,Jack Ingram,Still Got Scars,
1,TREKZMW128E0792E42,que come go see got got let let life life life...,"[-13.222571, -2.488165, -5.7816067, 38.276566,...",0.927962,Eagle-Eye Cherry,Permanent Tears,78.0
2,TROKOGO128F93145EB,love love never life du watch der der rise sur...,"[-0.65263754, 0.6674489, -1.321255, -0.7814939...",0.924847,Her Space Holiday,Manic Expressive (Exit),25.0
3,TRJCENK128F4231FD8,love love love love love love love love yeah w...,"[-2.6857, 8.27684, 0.97549415, 12.149147, -4.7...",0.923518,Seven Mary Three,Southwestern State (LP Version),
4,TREPUYD128F145AC92,de life life world live still kill kill morn s...,"[-0.20297599, 2.9049802, -1.5434803, -1.553833...",0.921292,Beastie Boys,5-Piece Chicken Dinner,22.0


In [317]:
top_50_collections = top_50_collections.drop(['track_id', 'text', 'vec', 'cos_simialarity'],
                                             axis=1)

In [319]:
top_50_collections.head()

Unnamed: 0,artist,title,play_count
0,Jack Ingram,Still Got Scars,
1,Eagle-Eye Cherry,Permanent Tears,78.0
2,Her Space Holiday,Manic Expressive (Exit),25.0
3,Seven Mary Three,Southwestern State (LP Version),
4,Beastie Boys,5-Piece Chicken Dinner,22.0


In [320]:
idxs = [i for i in tqdm(range(50))]
top_50_collections['id'] = idxs

  0%|          | 0/50 [00:00<?, ?it/s]

In [321]:
top_50_collections.head()

Unnamed: 0,artist,title,play_count,id
0,Jack Ingram,Still Got Scars,,0
1,Eagle-Eye Cherry,Permanent Tears,78.0,1
2,Her Space Holiday,Manic Expressive (Exit),25.0,2
3,Seven Mary Three,Southwestern State (LP Version),,3
4,Beastie Boys,5-Piece Chicken Dinner,22.0,4


In [322]:
cols_order = ['id', 'artist', 'title', 'play_count']
top_50_collections = top_50_collections[cols_order]

In [323]:
top_50_collections.head()

Unnamed: 0,id,artist,title,play_count
0,0,Jack Ingram,Still Got Scars,
1,1,Eagle-Eye Cherry,Permanent Tears,78.0
2,2,Her Space Holiday,Manic Expressive (Exit),25.0
3,3,Seven Mary Three,Southwestern State (LP Version),
4,4,Beastie Boys,5-Piece Chicken Dinner,22.0


In [325]:
top_50_collections.to_csv('collections.csv', index=False)

# People similar to you listen