In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the six tab-separated data files.
artists = pd.read_csv(
    '../data/artists.dat',
    sep='\t',
    usecols=['id', 'name'],  # only these two columns are read from the artists file
)
plays = pd.read_csv('../data/user_artists.dat', sep='\t')
tags = pd.read_csv(
    '../data/tags.dat',
    sep='\t',
    encoding='latin-1',
)
# The social network is loaded but not used in the recommender.
friends = pd.read_csv('../data/user_friends.dat', sep='\t')
utat = pd.read_csv('../data/user_taggedartists-timestamps.dat', sep="\t")
uta = pd.read_csv('../data/user_taggedartists.dat', sep='\t')

In [3]:
# Print a short profile of every loaded frame: label, shape, column index and
# per-column distinct-value counts. Only artists and plays feed the basic system.
csv_list = [artists, plays, tags, friends, utat, uta]
csv_names = ['artists', 'plays', 'tags', 'friends', 'user_taggedartists-timestamps', 'user_taggedartists']
for name, frame in zip(csv_names, csv_list):
    print("{}\n{}\n{}\n{}\n".format(name, frame.shape, frame.columns, frame.nunique()))

artists
(17632, 2)
Index(['id', 'name'], dtype='object')
id      17632
name    17632
dtype: int64

plays
(92834, 3)
Index(['userID', 'artistID', 'weight'], dtype='object')
userID       1892
artistID    17632
weight       5436
dtype: int64

tags
(11946, 2)
Index(['tagID', 'tagValue'], dtype='object')
tagID       11946
tagValue    11946
dtype: int64

friends
(25434, 2)
Index(['userID', 'friendID'], dtype='object')
userID      1892
friendID    1892
dtype: int64

user_taggedartists-timestamps
(186479, 4)
Index(['userID', 'artistID', 'tagID', 'timestamp'], dtype='object')
userID        1892
artistID     12523
tagID         9749
timestamp     3549
dtype: int64

user_taggedartists
(186479, 6)
Index(['userID', 'artistID', 'tagID', 'day', 'month', 'year'], dtype='object')
userID       1892
artistID    12523
tagID        9749
day             4
month          12
year           10
dtype: int64



In [4]:
# Join the artists frame to the plays frame on artist id. (The tag files are merged in
# later cells; the friend network is not merged at this time.)
# how='left' keeps every row of `artists`, even one with no matching row in `plays`.
artist_plays = (
    pd.merge(artists, plays, how='left', left_on='id', right_on='artistID')
    .drop(columns=['id'])  # drop the join key that duplicates 'artistID'
)
# Binary flag: 1 where weight > 0, otherwise 0. A missing weight compares False and so
# maps to 0, exactly as the previous per-row lambda did; the comparison is vectorised.
artist_plays['ever_played'] = (artist_plays['weight'] > 0).astype('int64')

In [5]:
# Preview the first rows of the merged artists/plays frame.
artist_plays.head()

Unnamed: 0,name,userID,artistID,weight,ever_played
0,MALICE MIZER,34,1,212,1
1,MALICE MIZER,274,1,483,1
2,MALICE MIZER,785,1,76,1
3,Diary of Dreams,135,2,1021,1
4,Diary of Dreams,257,2,152,1


In [6]:
# Show every row of the merged frame that belongs to one example artist.
is_malice_mizer = artist_plays['name'] == 'MALICE MIZER'
artist_plays[is_malice_mizer]

Unnamed: 0,name,userID,artistID,weight,ever_played
0,MALICE MIZER,34,1,212,1
1,MALICE MIZER,274,1,483,1
2,MALICE MIZER,785,1,76,1


In [7]:
# Combine the two tagging frames on their shared (userID, tagID, artistID) columns.
user_tagged = pd.merge(
    utat,
    uta,
    how='outer',
    on=['userID', 'tagID', 'artistID'],
)
# Sanity check: the outer join must have exactly as many rows as each input.
assert len(utat) == len(user_tagged) and len(user_tagged) == len(uta)

In [8]:
# Attach the tag text to each tagging row. A left join is used on the assumption that
# tags never applied by any user are not needed.
user_tags = pd.merge(
    user_tagged,
    tags,
    how='left',
    on='tagID',
)

In [9]:
# Attach artist names to each tagging row (left join: tagging rows are never dropped),
# then remove the join key 'id' that came from the artists frame.
user_tags2 = pd.merge(
    user_tags,
    artists,
    how='left',
    left_on='artistID',
    right_on='id',
)
user_tags3 = user_tags2.drop(columns=['id'])

In [10]:
# Distinct-value count for each column of the merged artists/plays frame.
artist_plays.nunique()

name           17632
userID          1892
artistID       17632
weight          5436
ever_played        1
dtype: int64

In [11]:
# Distinct-value count for each column of the tag frame after artist names were joined in.
user_tags3.nunique()

userID        1892
artistID     12523
tagID         9749
timestamp     3549
day              4
month           12
year            10
tagValue      9749
name         12133
dtype: int64

In [12]:
# Distinct-value count for each column of the tag frame before artist names were joined in.
user_tags.nunique()

userID        1892
artistID     12523
tagID         9749
timestamp     3549
day              4
month           12
year            10
tagValue      9749
dtype: int64

In [13]:
# Distinct-value count for each column of the friends frame.
friends.nunique()

userID      1892
friendID    1892
dtype: int64

In [14]:
# Preview the first rows of the fully merged tag frame.
user_tags3.head()

Unnamed: 0,userID,artistID,tagID,timestamp,day,month,year,tagValue,name
0,2,52,13,1238536800000,1,4,2009,chillout,Morcheeba
1,2,52,15,1238536800000,1,4,2009,downtempo,Morcheeba
2,2,52,18,1238536800000,1,4,2009,electronic,Morcheeba
3,2,52,21,1238536800000,1,4,2009,trip-hop,Morcheeba
4,2,52,41,1238536800000,1,4,2009,female vovalists,Morcheeba


In [15]:
# Preview the first rows of the friends frame.
friends.head()

Unnamed: 0,userID,friendID
0,2,275
1,2,428
2,2,515
3,2,761
4,2,831


In [16]:
# Report the number of rows (len) of each frame that is about to be saved.
row_count_lines = [
    ("Artist Plays: {}", artist_plays),
    ("User Tags: {}", user_tags3),
    ("Friends: {}", friends),
]
print("Dataframe Shapes")
for template, frame in row_count_lines:
    print(template.format(len(frame)))

Dataframe Shapes
Artist Plays: 92834
User Tags: 186479
Friends: 25434


In [17]:
# The three frames are pickled separately: each userID/artistID pair has multiple tagIDs,
# and each userID is associated with multiple friendIDs.
pickle_targets = [
    (artist_plays, '../data/recsys_plays.pkl'),
    (user_tags3, '../data/recsys_tags.pkl'),
    (friends, '../data/recsys_friends.pkl'),
]
for frame, path in pickle_targets:
    frame.to_pickle(path)

In [18]:
# Export the aggregated frames as CSV for the FusionAI@Lucidworks exercise.
# The row index is not written (index=False).
csv_targets = [
    (artist_plays, '../data/recsys_plays.csv'),
    (user_tags3, '../data/recsys_tags.csv'),
]
for frame, path in csv_targets:
    frame.to_csv(path, index=False)

In [60]:
# Build an artist x tag indicator matrix: one row per artist name, one column per tag value.
# (name, tagValue) pairs are de-duplicated first so every cell is 0 or 1 after summing.
ut4 = user_tags3[['name', 'tagValue']].drop_duplicates()
# One-hot encode the tags of ut4 itself. This cell previously read `user_tags4`, which is
# only assigned further down the notebook (and to a different frame), so it raised
# NameError on a fresh "Restart & Run All".
# dtype='uint8' keeps the wide intermediate frame compact and numeric.
tag_dummies = pd.get_dummies(ut4['tagValue'], dtype='uint8')
# `columns=` replaces the positional axis argument of drop('tagValue', 1), which
# pandas 2.0 no longer accepts.
ut5 = pd.concat([ut4.drop(columns='tagValue'), tag_dummies], axis=1)
# Collapse to one row per artist name.
ut6 = ut5.groupby('name').sum()

In [67]:
# Number of distinct values in the artist-name column of the tags data. dropna=False counts
# a missing name as one extra value, which is what the previous len(set(...)) did implicitly;
# use the default nunique() to count real artist names only.
user_tags3['name'].nunique(dropna=False)

12134

In [61]:
# Preview the first rows of the artist x tag matrix.
ut6.head()

Unnamed: 0_level_0,'80s,-pearl fashion music,0 play yet,00,00's,007,00s,00s rock,1,1008,...,zikirli,zmiel pierogi,zmierzch,zombie,zombie rave,zombieland,zoocore,zornish,ztt,zu
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#####,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
$lick,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(hed) Planet Earth,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
*NSYNC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# (rows, columns) of the artist x tag matrix.
ut6.shape

(12133, 9749)

In [70]:
# Britney Spears has 329 tags
# (row sum of her 0/1 indicator row, i.e. the number of tag columns set for this artist)
ut6.loc['Britney Spears'].sum()

329

In [76]:
# Show the matrix row for one example artist as a one-row frame.
is_britney = ut6.index == 'Britney Spears'
ut6[is_britney]

Unnamed: 0_level_0,'80s,-pearl fashion music,0 play yet,00,00's,007,00s,00s rock,1,1008,...,zikirli,zmiel pierogi,zmierzch,zombie,zombie rave,zombieland,zoocore,zornish,ztt,zu
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Britney Spears,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
# Save the artist x tag matrix. ut6 is indexed by artist name (it comes from
# groupby('name')), so the index must be written: with index=False the CSV rows carried
# no artist label and could not be mapped back to an artist.
ut6.to_csv('../data/recsys_artist_tag_sparse.csv', index=True)

In [95]:
# Reshape the long artists/plays frame into a wide artist-name x userID matrix of weights;
# combinations with no row in artist_plays are filled with 0.
ap2 = (
    artist_plays
    .pivot(index='name', columns='userID', values='weight')
    .fillna(0)
)

In [96]:
# (rows, columns) of the artist x user weight matrix.
ap2.shape

(17632, 1892)

In [97]:
# Preview the first rows of the artist x user weight matrix.
ap2.head()

userID,2,3,4,5,6,7,8,9,10,11,...,2090,2091,2092,2093,2094,2095,2096,2097,2099,2100
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
!DISTAIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
!deladap,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#####,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#2 Orchestra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [98]:
# 2.4 million artist plays for Britney Spears
# (sum of the 'weight' values across all userID columns of her row)
ap2.loc['Britney Spears'].sum()

2393140.0

In [99]:
# Save the artist x user weight matrix. ap2 is indexed by artist name (pivot(index='name')),
# so the index must be written: with index=False the CSV rows carried no artist label
# and could not be mapped back to an artist.
ap2.to_csv('../data/recsys_artist_play_sparse.csv', index=True)

Below: Experiment with different data structures, such as json/dictionary, for Fusion ingestion

In [None]:
# Number of leading rows of user_tags3 to fold into the dictionary.
n = 20 #len(user_tags3)

# Map each artist name to the list of tag values found in the first n rows, in row order.
ad = {}
first_rows = user_tags3.iloc[:n]
for name, tagValue in zip(first_rows['name'], first_rows['tagValue']):
    ad.setdefault(name, []).append(tagValue)

In [None]:
# Display the artist-name -> list-of-tag-values dictionary built above.
ad

In [None]:
# One column per artist in `ad`; wrapping each tag list in a pd.Series lets lists of
# different lengths share one frame.
user_tags4 = pd.DataFrame({artist: pd.Series(tag_list) for artist, tag_list in ad.items()})

In [None]:
# Display the per-artist tag columns built from `ad`.
user_tags4

In [None]:
# Disabled alternative to the pd.Series-based construction of user_tags4.
# NOTE(review): presumably abandoned because the tag lists in `ad` differ in length - confirm.
# user_tags4 = pd.DataFrame.from_dict(ad)

In [None]:
# Inspect a single record: the second row (position 1) of the merged tag frame.
user_tags3.iloc[1]