![](https://ws1.sinaimg.cn/large/0073xHwmgy1g0npxol8xfj312m0h00uo.jpg)

## load data

In [1]:
import arrow
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, diags


# Last.fm 1K-user listening log: one tab-separated row per play event.
file_path = '~/music-recommend/dataset/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv'

# The file has no header row, so column names are supplied here; artist_id is
# declared (to keep positions aligned) but not loaded.
# NOTE(review): default quote handling may merge rows when a track name
# contains a double quote -- consider quoting=csv.QUOTE_NONE; confirm against
# the raw line count.
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['user_id', 'timestamp', 'artist_id', 'artist_name', 'track_id', 'track_name'],
    usecols=['user_id', 'timestamp', 'track_id', 'artist_name', 'track_name'],
)
# Drop every play that is missing any of the loaded fields.
df = df.dropna()
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that only adds a stray "None" line to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16936136 entries, 10 to 19098861
Data columns (total 5 columns):
user_id        object
timestamp      object
artist_name    object
track_id       object
track_name     object
dtypes: object(5)
memory usage: 775.3+ MB
None


## util data

In [2]:
# Store ids as pandas categoricals so each distinct id gets a stable integer
# position in `.cat.categories`.
df['user_id'] = df['user_id'].astype('category')
df['track_id'] = df['track_id'].astype('category')

# index -> id: the categories Index is subscriptable by position, so it is
# used directly as the lookup table.
user_index_to_user_id_dict = df['user_id'].cat.categories
# id -> index: the inverse mapping, as a plain dict.
user_id_to_user_index_dict = {
    user_id: position
    for position, user_id in enumerate(user_index_to_user_id_dict)
}

# Same pair of lookups for tracks.
track_index_to_track_id_dict = df['track_id'].cat.categories
track_id_to_track_index_dict = {
    track_id: position
    for position, track_id in enumerate(track_index_to_track_id_dict)
}

# One row per distinct (artist_name, track_name, track_id) combination.
song_info_df = df.loc[:, ['artist_name', 'track_name', 'track_id']].drop_duplicates()

## util func

In [3]:
def get_hot_track_id_by_artist_name_and_track_name(artist_name, track_name):
    """Return the most-played track_id for an (artist, title) pair.

    Several track ids can share the same artist and track name. This looks up
    all of them in ``song_info_df`` and returns the one that occurs most often
    in the play log ``df``. On a tie the candidate listed first in
    ``song_info_df`` wins.

    Args:
        artist_name: exact artist name to match.
        track_name: exact track name to match.

    Returns:
        The track_id with the highest play count among the matches.

    Raises:
        IndexError: if no track matches the given artist and track name.
    """
    is_match = (
        (song_info_df['artist_name'] == artist_name)
        & (song_info_df['track_name'] == track_name)
    )
    candidate_ids = song_info_df.loc[is_match, 'track_id'].tolist()
    if not candidate_ids:
        # Same exception type as the positional lookup used to raise, but
        # with a message that says what was actually missing.
        raise IndexError(
            'no track found for artist %r and track name %r'
            % (artist_name, track_name)
        )

    played_track_ids = df['track_id']

    def play_count(track_id):
        # Summing the boolean mask counts plays without materialising a
        # filtered copy of the whole play log for every candidate.
        return int((played_track_ids == track_id).sum())

    # max() returns the first maximal element, so ties resolve to the
    # earliest candidate.
    return max(candidate_ids, key=play_count)

# Show every track_id registered for the same artist/title, then the one the
# helper picks as most played.
print('wish you were here tracks:')
print(song_info_df.loc[
    (song_info_df['artist_name'] == 'Pink Floyd')
    & (song_info_df['track_name'] == 'Wish You Were Here'),
    ['track_id'],
])
print('--------')
print('hotest one:')
print(get_hot_track_id_by_artist_name_and_track_name('Pink Floyd', 'Wish You Were Here'))

wish you were here tracks:
                                      track_id
60969     feecff58-8ee2-4a7f-ac23-dc8ce7925286
4401932   f479e316-56b4-4221-acd9-eed1a0711861
17332322  2210ba38-79af-4881-97ae-4ce8f32322c3
--------
hotest one:
feecff58-8ee2-4a7f-ac23-dc8ce7925286


## generate sentence file

In [None]:
def generate_sentence_file(df):
    """Write one "sentence" of track indexes per user per calendar day.

    For every user (in the order of ``user_index_to_user_id_dict``) the plays
    are sorted by timestamp and split into sessions whenever the calendar
    date of the timestamp changes. Each session is written to
    ``sentences.txt`` as a line of space-separated track indexes.

    Args:
        df: play log with ``user_id``, ``timestamp`` and ``track_id`` columns.
    """
    with open('sentences.txt', 'w') as sentences:

        def flush(session):
            # Write a finished session as one line; skip empty ones.
            if session:
                sentences.write(' '.join(str(_id) for _id in session) + '\n')

        for user_index in tqdm(range(len(user_index_to_user_id_dict))):
            user_id = user_index_to_user_id_dict[user_index]
            user_df = df[df['user_id'] == user_id].sort_values('timestamp')
            session = []
            last_date = None
            for this_time, track_id in zip(user_df['timestamp'], user_df['track_id']):
                # Parse each timestamp once and keep the date, so arrow.get()
                # is never called with None on the user's first play.
                this_date = arrow.get(this_time).date()
                if last_date is not None and this_date != last_date:
                    flush(session)
                    session = []
                session.append(track_id_to_track_index_dict[track_id])
                last_date = this_date
            # The last day of each user has no following date change to
            # trigger a write, so flush it explicitly.
            flush(session)

## generate sentence file using spark

In [None]:
import pyspark
import arrow

sc = pyspark.SparkContext(appName="generate-song-sentences")

# Build the sentences as a single chained RDD pipeline.
lines = (
    sc.textFile(file_path)
    # Split each raw line into its tab-separated fields.
    .map(lambda raw: raw.split("\t"))
    # Keep only rows in which no field is empty.
    .filter(lambda fields: all(fields))
    # Key: (user_id, calendar date of the timestamp).
    # Value: the track index as a string ('' if the track_id is unknown).
    .map(lambda fields: (
        (fields[0], arrow.get(fields[1]).date()),
        str(track_id_to_track_index_dict.get(fields[4], '')),
    ))
    # Concatenate all track indexes that share a key into one sentence.
    .reduceByKey(lambda left, right: left + ' ' + right)
    # The key is no longer needed; keep only the sentence text.
    .map(lambda keyed: keyed[1])
)

# Collapse to a single partition so the output directory holds one part file.
lines.repartition(1).saveAsTextFile("./spark-generated-song-sentences")

![](https://ws1.sinaimg.cn/large/0073xHwmgy1g0o99odtizj311w0ee79v.jpg)

## gensim train model

In [None]:
from smart_open import smart_open
from gensim.models import Word2Vec
import logging

# Attach a default handler to the root logger and lower its threshold to
# INFO so library progress messages show up in the notebook.
logging.basicConfig()
logging.root.setLevel(logging.INFO)

class LastfmSentences(object):
    """Re-iterable stream of tokenised sentences read from a text file.

    Each line of the file is one sentence; iterating yields the line split on
    whitespace. The file is reopened on every iteration, so the object can be
    consumed more than once.
    """

    def __init__(self, file_location):
        """Remember the path (or URI) of the sentence file."""
        self.file_location = file_location

    def __iter__(self):
        """Yield each line of the file as a list of whitespace-separated tokens."""
        # The context manager closes the handle as soon as the pass over the
        # file finishes (or the generator is closed early), instead of
        # leaving it to garbage collection.
        with smart_open(self.file_location, 'r') as sentence_file:
            for line in sentence_file:
                yield line.split()
            

# Stream the sentence file from disk rather than loading it into memory.
lastfm_sentences = LastfmSentences('~/onedrive/sentences.txt')

# Train track embeddings: every track index is a "word", every listening
# session a "sentence".
model = Word2Vec(
    lastfm_sentences,
    size=50,
    min_count=20,
    window=10,
    hs=0,
    negative=20,
    workers=4,
    sg=1,
    sample=1e-5,
)

## song similarity

In [5]:
# Resolve each (artist, title) pair to the string form of its track index,
# which is the token the model was trained on.
shine_on_part_1, shine_on_part_2, good_times = [
    str(track_id_to_track_index_dict[
        get_hot_track_id_by_artist_name_and_track_name(artist, title)])
    for artist, title in [
        ('Pink Floyd', 'Shine On You Crazy Diamond (Parts I-V)'),
        ('Pink Floyd', 'Shine On You Crazy Diamond (Parts Vi-Ix)'),
        ('Chic', 'Good Times'),
    ]
]

# Compare two parts of the same piece against an unrelated track.
print('similarity between shine on part 1, 2:',
      model.wv.similarity(shine_on_part_1, shine_on_part_2))
print('similarity between shine on part 1, good times:',
      model.wv.similarity(shine_on_part_1, good_times))

similarity between shine on part 1, 2: 0.942483
similarity between shine on part 1, good times: 0.381922


## recommend songs

In [6]:
def recommend_with_playlist(playlist, topn=25):
    """Recommend tracks similar to the given seed track(s).

    Args:
        playlist: a single track_id, or any iterable of track_ids (list,
            tuple, pandas Series, generator, ...).
        topn: number of recommendations to return.

    Returns:
        A list of track_ids, in the order returned by the model's
        ``most_similar`` query.

    Raises:
        KeyError: if a track_id is not in ``track_id_to_track_index_dict``.
    """
    # A lone id is wrapped; any other iterable is taken as the playlist
    # itself, so tuples and other sequences are no longer mistaken for one id.
    if isinstance(playlist, (str, bytes)) or not hasattr(playlist, '__iter__'):
        playlist = [playlist]
    # The model's vocabulary is the string form of each track index.
    playlist_indexes = [str(track_id_to_track_index_dict[track_id]) for track_id in playlist]
    similar_song_indexes = model.wv.most_similar(positive=playlist_indexes, topn=topn)
    # Each hit is indexed as (token, ...); map the token back to a track_id.
    return [track_index_to_track_id_dict[int(track[0])] for track in similar_song_indexes]

def display_track_info(track_ids):
    """Print a two-column table (track_name, artist_name) for the given track ids.

    For each id, the first matching row of ``song_info_df`` supplies the
    names; rows appear in the same order as ``track_ids``.
    """
    first_matches = [
        song_info_df[song_info_df['track_id'] == track_id].iloc[0]
        for track_id in track_ids
    ]
    print(pd.DataFrame({
        'track_name': [match['track_name'] for match in first_matches],
        'artist_name': [match['artist_name'] for match in first_matches],
    }))

## post punk

In [7]:
# Seed playlist as (artist_name, track_name) pairs.
guerbai_playlist = [
    ('Joy Division', 'Disorder'),
    ('Echo & The Bunnymen', 'The Killing Moon'),
    ('The Names', 'Discovery'),
    ('The Cure', 'Lullaby'),
]

# Resolve each seed to its most-played track_id, ask for 20 neighbours and
# print their names.
display_track_info(
    recommend_with_playlist(
        [get_hot_track_id_by_artist_name_and_track_name(artist, title)
         for artist, title in guerbai_playlist],
        20,
    )
)

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


                                           track_name          artist_name
0   Singing Rule Britannia (While The Walls Close In)       The Chameleons
1                                       Miss The Girl        The Creatures
2                            Return Of The Roughnecks       The Chameleons
3                                    Looking Inwardly       The Chameleons
4                                        P.S. Goodbye       The Chameleons
5                          Home Is Where The Heart Is       The Chameleons
6                             Love Will Tear Us Apart         Boy Division
7                                Intrigue In Tangiers       The Chameleons
8                                   Pleasure And Pain       The Chameleons
9                                        Paper Tigers       The Chameleons
10                                           Faceless      A Certain Ratio
11                                  True Faith (Live)            New Order
12                       

## progressive rock

In [8]:
# Seed playlist as (artist_name, track_name) pairs.
guerbai_playlist = [
    ('Rush', '2112: Ii. The Temples Of Syrinx'),
    ('Yes', 'Roundabout'),
    ('Emerson, Lake & Palmer', 'Take A Pebble'),
    ('Jethro Tull', 'Aqualung'),
]

# Resolve each seed to its most-played track_id and print the default number
# of recommendations.
display_track_info(
    recommend_with_playlist(
        [get_hot_track_id_by_artist_name_and_track_name(artist, title)
         for artist, title in guerbai_playlist]
    )
)

                   track_name             artist_name
0                   Musicatto                  Kansas
1          Living In The Past             Jethro Tull
2         Brain Salad Surgery  Emerson, Lake & Palmer
3           Farewell To Kings                    Rush
4             Working All Day            Gentle Giant
5                        Tank  Emerson, Lake & Palmer
6           My Sunday Feeling             Jethro Tull
7           The Thousand Days                      Iq
8         Just Changing Hands                      Iq
9                     Corners                      Iq
10                  Came Down                      Iq
11           Starship Trooper                     Yes
12                Breathtaker                      Iq
13                 Knife Edge  Emerson, Lake & Palmer
14                     Bourée             Jethro Tull
15              Leap Of Faith                      Iq
16  With You There To Help Me             Jethro Tull
17             Unsolid Groun