In [None]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix
import pickle

## Import data

In [None]:
# Directory that holds the Last.fm 360K dataset files. It can be overridden
# with the LASTFM_DATA_DIR environment variable so the notebook is not tied
# to one machine; the default is the path the notebook originally used.
LASTFM_DATA_DIR = os.environ.get(
    "LASTFM_DATA_DIR",
    "/home/dennis/Documents/lastfm-prediction/lastfm-dataset-360K",
)

# The file is read with header=None, so the column names are supplied here.
user_profiles = pd.read_table(
    os.path.join(LASTFM_DATA_DIR, "profiles.tsv"),
    header=None,
    names=['user_id', 'gender', 'age', 'country', 'signup'],
)
print(user_profiles.head(5))
# Number of profiles whose country is exactly 'Germany'.
print(len(user_profiles[user_profiles['country'] == 'Germany']))

In [None]:
# Directory that holds the Last.fm 360K dataset files. It can be overridden
# with the LASTFM_DATA_DIR environment variable so the notebook is not tied
# to one machine; the default is the path the notebook originally used.
LASTFM_DATA_DIR = os.environ.get(
    "LASTFM_DATA_DIR",
    "/home/dennis/Documents/lastfm-prediction/lastfm-dataset-360K",
)

# Build the frame in one chain instead of mutating it in place, so re-running
# the cell always yields the same result:
#   * 'artist_id' is dropped because only the artist name is used afterwards;
#   * rows without an artist name are removed (dropna is a no-op when there
#     are none, so no explicit null check is needed).
user_data = (
    pd.read_table(
        os.path.join(LASTFM_DATA_DIR, "plays.tsv"),
        header=None,
        names=['user_id', 'artist_id', 'artist_name', 'plays'],
    )
    .drop(columns='artist_id')
    .dropna(axis=0, subset=['artist_name'])
)

## Only German Data
To reduce the size of the data, only German users are considered.

In [None]:
# Select the profiles whose country is exactly 'Germany'.
is_german = user_profiles['country'] == 'Germany'
german_profiles = user_profiles[is_german]

# Attach the play records to every German profile. This is a left join, so a
# profile without any matching play record is still kept (with missing
# artist_name / plays values).
user_data_ger_profiles = pd.merge(german_profiles, user_data, how='left', on="user_id")
print(user_data_ger_profiles.head())
print(len(user_data_ger_profiles))

In [None]:
# Total number of plays per artist across all German users, as a two-column
# frame: 'artist_name' and 'artist_total_plays'.
artist_plays = (
    user_data_ger_profiles
    .groupby('artist_name')['plays']
    .sum()
    .rename('artist_total_plays')
    .reset_index()
)
print(artist_plays.head())


## Reduce number of artists
Only the most popular artists are considered for the recommender, both to avoid distorting the data with rarely played artists and to reduce the total size of the set.

In [None]:
# Show the upper quantiles (90% and above, in 1% steps) of total plays per
# artist to help judge where the popularity cut-off falls.
upper_quantiles = artist_plays['artist_total_plays'].quantile(np.arange(.9, 1, .01))
print(upper_quantiles)

# Artists need strictly more than this many total plays to be kept.
threshold = 90000
is_popular = artist_plays['artist_total_plays'] > threshold
popular_artist_plays = artist_plays[is_popular]

# Artist counts before and after applying the threshold.
print(len(artist_plays))
print(len(popular_artist_plays))

In [None]:
# Inner join keeps only play records of artists that passed the popularity
# threshold and adds each artist's total play count; the result is ordered
# with the most played artists first.
user_with_artist_plays = (
    pd.merge(user_data_ger_profiles, popular_artist_plays, how='inner', on='artist_name')
    .sort_values(by='artist_total_plays', ascending=False)
)
print(user_with_artist_plays.head())

In [None]:
# pivot() requires each (artist_name, user_id) pair to be unique, so keep only
# the first record of every pair.
user_with_artist_plays = user_with_artist_plays.drop_duplicates(['user_id', 'artist_name'])
print(len(user_with_artist_plays))

# Artist x user matrix: one row per artist, one column per user. Each cell
# holds how often that user played that artist ('plays'); pairs that never
# occur are filled with 0.
# NOTE: this previously used 'artist_total_plays', which is the same constant
# for every user of an artist and therefore discarded the per-user play counts.
wide_artist_data = user_with_artist_plays.pivot(index = 'artist_name', columns = 'user_id', values = 'plays').fillna(0)

# Sparse copy of the same matrix, used to fit the nearest-neighbour model.
wide_artist_data_sparse = csr_matrix(wide_artist_data.values)


## Train model

In [None]:
# Nearest-neighbour model over the artist rows, compared by cosine distance.
knn_params = {'metric': 'cosine', 'algorithm': 'auto'}
model_knn = NearestNeighbors(**knn_params)
model_knn.fit(wide_artist_data_sparse)


## Make recommendations

In [None]:
# Number of recommendations to print. One extra neighbour is requested from
# the model because the query artist is normally returned as its own nearest
# neighbour (distance 0).
n_recommendations = 5

# Pick a random artist (a row position in the artist x user matrix).
query_index = np.random.choice(wide_artist_data.shape[0])
print(query_index)

# kneighbors expects a 2-D array, hence the reshape to a single-row matrix.
query_vector = wide_artist_data.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors = n_recommendations + 1)

print(distances.flatten())
print("Artist: %s" % wide_artist_data.index[query_index])

# Drop the query artist by its index rather than assuming it is always the
# first neighbour: when another artist has distance 0 as well, the order of
# the tied entries is not guaranteed.
neighbours = [
    (artist_idx, distance)
    for artist_idx, distance in zip(indices[0], distances[0])
    if artist_idx != query_index
][:n_recommendations]

for rank, (artist_idx, distance) in enumerate(neighbours, start=1):
    print("Recommendation %s: %s - %s " % (rank, wide_artist_data.index[artist_idx], distance))

## Save model

In [None]:
# Serialise the fitted model. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('nn_recommender.sav', 'wb') as model_file:
    pickle.dump(model_knn, model_file)

In [None]:
# List the current working directory to confirm the model file was written.
os.listdir(os.curdir)

In [None]:
# Reload the saved model and repeat the query as a round-trip check.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('nn_recommender.sav', 'rb') as model_file:
    nn_loaded = pickle.load(model_file)

# kneighbors expects a 2-D array, hence the reshape to a single-row matrix.
distances, indices = nn_loaded.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
print(distances.flatten())

## Save available artist-names

In [None]:
# Write the distinct artist names that remain after filtering to artists.csv
# (single unnamed column, default index and header).
artist_names = user_with_artist_plays['artist_name'].unique()
artists = pd.DataFrame(artist_names)
artists.to_csv("artists.csv")

In [None]:
# List the current working directory to confirm artists.csv was written.
os.listdir(os.curdir)