In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the six tab-separated input files; read_table uses '\t' as its default separator.
artists = pd.read_table('../data/artists.dat', usecols=['id', 'name'])
plays = pd.read_table('../data/user_artists.dat')
tags = pd.read_table('../data/tags.dat', encoding='latin-1')
# The social network is loaded but not used in our recommender.
friends = pd.read_table('../data/user_friends.dat')
# The two tagging files are read separately here and combined in a later cell.
utat = pd.read_table('../data/user_taggedartists-timestamps.dat')
uta = pd.read_table('../data/user_taggedartists.dat')

In [3]:
# Summarise each loaded frame: name, shape, column index and distinct-value
# counts per column. Only artists and plays are used in the basic system.
csv_list = [artists, plays, tags, friends, utat, uta]
csv_names = ['artists', 'plays', 'tags', 'friends', 'user_taggedartists-timestamps', 'user_taggedartists']
# Walk the two parallel lists together instead of indexing both by position.
for name, frame in zip(csv_names, csv_list):
    # One item per line, followed by a blank line between frames.
    print(name, frame.shape, frame.columns, frame.nunique(), sep="\n", end="\n\n")

artists
(17632, 2)
Index(['id', 'name'], dtype='object')
id      17632
name    17632
dtype: int64

plays
(92834, 3)
Index(['userID', 'artistID', 'weight'], dtype='object')
userID       1892
artistID    17632
weight       5436
dtype: int64

tags
(11946, 2)
Index(['tagID', 'tagValue'], dtype='object')
tagID       11946
tagValue    11946
dtype: int64

friends
(25434, 2)
Index(['userID', 'friendID'], dtype='object')
userID      1892
friendID    1892
dtype: int64

user_taggedartists-timestamps
(186479, 4)
Index(['userID', 'artistID', 'tagID', 'timestamp'], dtype='object')
userID        1892
artistID     12523
tagID         9749
timestamp     3549
dtype: int64

user_taggedartists
(186479, 6)
Index(['userID', 'artistID', 'tagID', 'day', 'month', 'year'], dtype='object')
userID       1892
artistID    12523
tagID        9749
day             4
month          12
year           10
dtype: int64



In [31]:
# Attach per-user play counts to each artist. Only `artists` and `plays` are
# combined here; the tag files are merged in later cells and the friend
# network is not merged at all.
# The left join keeps every artist row; an artist with no row in `plays`
# would get NaN for userID / artistID / weight.
artist_plays = (
    artists
    .merge(plays, how='left', left_on='id', right_on='artistID')
    .drop(columns=['id'])  # `id` repeats `artistID` on every matched row
)
# 1 when the play count is positive, else 0. NaN > 0 is False, so unmatched
# artists get 0, exactly as with the previous row-wise lambda.
artist_plays['ever_played'] = (artist_plays['weight'] > 0).astype('int64')

In [35]:
# Preview the first rows only; rendering the whole merged frame is unnecessary.
artist_plays.head(10)

Unnamed: 0,name,userID,artistID,weight,ever_played
0,MALICE MIZER,34,1,212,1
1,MALICE MIZER,274,1,483,1
2,MALICE MIZER,785,1,76,1
3,Diary of Dreams,135,2,1021,1
4,Diary of Dreams,257,2,152,1
5,Diary of Dreams,325,2,3466,1
6,Diary of Dreams,397,2,56,1
7,Diary of Dreams,560,2,134,1
8,Diary of Dreams,580,2,803,1
9,Diary of Dreams,935,2,428,1


In [10]:
# Combine the timestamp and day/month/year views of the tagging events on
# their shared (userID, tagID, artistID) key.
tag_event_key = ['userID', 'tagID', 'artistID']
user_tagged = utat.merge(uta, how='outer', on=tag_event_key)
# Sanity check: the outer join must neither add nor drop rows relative to
# either input file.
assert len(user_tagged) == len(utat) and len(utat) == len(uta)

In [25]:
# Left join: keep every tagging event and look up its tag label. We assume
# tags that no user has applied are not needed, so they are left out.
user_tags = user_tagged.merge(tags, on='tagID', how='left')

In [26]:
# Add the artist name to every tagging event. With a left join, events whose
# artistID has no match in `artists` are kept with a missing name.
user_tags2 = user_tags.merge(artists, left_on='artistID', right_on='id', how='left')
# `id` was only needed as the join key on the artists side.
user_tags3 = user_tags2.drop(columns=['id'])

In [13]:
# Distinct-value count for each column of the merged artist/plays frame.
# NOTE(review): the recorded output has no `ever_played` row, so it appears to
# predate the cell that adds that column -- re-run from a fresh kernel to refresh.
artist_plays.nunique()

name        17632
userID       1892
artistID    17632
weight       5436
dtype: int64

In [27]:
# Distinct-value count for each column of the tag table after joining tag
# labels and artist names.
# NOTE(review): the recorded output shows fewer distinct `name` values than
# distinct `artistID`s; presumably some tagged artistIDs have no entry in
# `artists` and therefore a missing name -- confirm before relying on `name`.
user_tags3.nunique()

userID        1892
artistID     12523
tagID         9749
timestamp     3549
day              4
month           12
year            10
tagValue      9749
name         12133
dtype: int64

In [28]:
# Distinct-value count per column of the tag table before artist names are joined in.
user_tags.nunique()

userID        1892
artistID     12523
tagID         9749
timestamp     3549
day              4
month           12
year            10
tagValue      9749
dtype: int64

In [15]:
# Distinct-value count for the two columns of the friend table.
friends.nunique()

userID      1892
friendID    1892
dtype: int64

In [29]:
# Preview the first rows of the fully merged tag table.
user_tags3.head()

Unnamed: 0,userID,artistID,tagID,timestamp,day,month,year,tagValue,name
0,2,52,13,1238536800000,1,4,2009,chillout,Morcheeba
1,2,52,15,1238536800000,1,4,2009,downtempo,Morcheeba
2,2,52,18,1238536800000,1,4,2009,electronic,Morcheeba
3,2,52,21,1238536800000,1,4,2009,trip-hop,Morcheeba
4,2,52,41,1238536800000,1,4,2009,female vovalists,Morcheeba


In [17]:
# Preview the first rows of the friend table (one row per userID/friendID pair).
friends.head()

Unnamed: 0,userID,friendID
0,2,275
1,2,428
2,2,515
3,2,761
4,2,831


In [34]:
# Persist the three tables as separate pickles. They are not combined into a
# single frame because each userID/artistID pair features multiple tagID
# and each userID is associated with multiple friendID.
for frame, target in (
    (artist_plays, '../data/recsys_plays.pkl'),
    (user_tags3, '../data/recsys_tags.pkl'),
    (friends, '../data/recsys_friends.pkl'),
):
    frame.to_pickle(target)