In [14]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import pdb

%matplotlib inline

In [2]:
# Cluster labels and display names for each artist, read from the local data/ directory.
artistclust = pd.read_csv("data/artist_clusters.csv")
artistnames = pd.read_csv("data/artists.csv")

# Per-(user, artist) play counts; this frame is split into train/validation subsets below.
fulldf = pd.read_csv("data/train.csv")

# Join the two artist tables on whichever column names they share
# (merge defaults to an inner join on the common columns).
artists = artistclust.merge(artistnames)

In [20]:
# Show a bounded preview instead of rendering the whole artist table.
artists.head(10)

Unnamed: 0,artist,cluster,name
0,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,1,Liars
1,69c4cc43-8163-41c5-ac81-30946d27bb69,4,CunninLynguists
2,7a2e6b55-f149-4e74-be6a-30a1b1a387bb,0,The Desert Sessions
3,7002bf88-1269-4965-a772-4ba1e7a91eaa,4,Glenn Gould
4,dbf7c761-e332-467b-b4d9-aafe06bbcf8f,1,G. Love & Special Sauce
5,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,4,U2
6,8b0f05ce-354e-4121-9e0b-8b4732ea844f,4,Juanes
7,8363f94f-fd86-41b8-a56b-26eacb34f499,0,Summoning
8,2e41ae9c-afd2-4f20-8f1e-17281ce9b472,4,Gwen Stefani
9,c17f08f4-2542-46fb-97f3-3202d60c225a,0,Fear Factory


In [23]:
# Lookup table from artist id to its cluster label.
artistclusters = artists.set_index("artist")["cluster"].to_dict()

In [33]:
# Fixed seed so the train/validation split (and every number derived from it) is
# the same on each Restart & Run All; an unseeded draw gives a new split per run.
SPLIT_SEED = 0
# Probability that any given row is assigned to the training set.
TRAIN_FRACTION = .6

# One independent Bernoulli draw per row: True -> training, False -> validation.
split_rng = np.random.RandomState(SPLIT_SEED)
mask = split_rng.binomial(1, TRAIN_FRACTION, size=len(fulldf)).astype(bool)
traindf = fulldf[mask]
valdf = fulldf[~mask]

In [196]:
# Persist both halves of the split to the working directory, without the pandas row index.
for split_frame, csv_path in ((traindf, "traindf.csv"), (valdf, "valdf.csv")):
    split_frame.to_csv(csv_path, index=False)

In [195]:
# Peek at the first five validation rows (user id, artist id, play count).
valdf.head(n=5)

Unnamed: 0,user,artist,plays
2,da9cf3f557161d54b76f24db64be9cc76db008e3,eeb1195b-f213-4ce1-b28c-8565211f8e43,708
4,b85fcaef67d2669cd99b334b5e8c8705263db2cf,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,220
5,feed7a0dc74c5251283a1505adf453a2061d08f7,1cc5adcd-1422-4b5c-a3cd-3ecd4f43f506,2113
7,5641e1e6f04868a61dc29f7227e34f4640163e9b,832a43c7-aa7d-439b-a6b4-4f1afa671c24,305
8,9f748976d303db79f61bf570d9549d6335b11b0e,2fddb92d-24b2-46a5-bf28-3aed46f4684c,705


In [34]:
# One play-count bucket per cluster label, sized from the labels actually present
# instead of a hard-coded five lists, so a re-clustered artist file still works.
# Assumes cluster labels are the integers 0..k-1, since they are used as list indices.
n_clusters = int(max(artistclusters.values())) + 1

# train_dict[user] = {"plays":    {artist id: play count},
#                     "clusters": [play counts for cluster 0, cluster 1, ...]}
train_dict = {}
# Columns are read by name so the loop does not depend on their position in the frame.
for user, artist, plays in zip(traindf["user"], traindf["artist"], traindf["plays"]):
    cluster = artistclusters[artist]

    if user not in train_dict:
        train_dict[user] = {"plays": {},
                            "clusters": [[] for cluster_id in range(n_clusters)]}
    train_dict[user]["plays"][artist] = plays
    train_dict[user]["clusters"][cluster].append(plays)
    

## Testing Accuracy

In [35]:
# First five rows of the held-out validation frame, for reference while evaluating.
valdf.head(n=5)

Unnamed: 0,user,artist,plays
2,da9cf3f557161d54b76f24db64be9cc76db008e3,eeb1195b-f213-4ce1-b28c-8565211f8e43,708
4,b85fcaef67d2669cd99b334b5e8c8705263db2cf,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,220
5,feed7a0dc74c5251283a1505adf453a2061d08f7,1cc5adcd-1422-4b5c-a3cd-3ecd4f43f506,2113
7,5641e1e6f04868a61dc29f7227e34f4640163e9b,832a43c7-aa7d-439b-a6b4-4f1afa671c24,305
8,9f748976d303db79f61bf570d9549d6335b11b0e,2fddb92d-24b2-46a5-bf28-3aed46f4684c,705


In [39]:
# Number of distinct users that have at least one row in the training split.
len(train_dict)

233282

In [43]:
# Spot-check a single (user, artist) pair: dump the user's training record,
# then display the cluster label of the artist.
u = "feed7a0dc74c5251283a1505adf453a2061d08f7"
a = "1cc5adcd-1422-4b5c-a3cd-3ecd4f43f506"

# Single-argument parenthesised form prints the same text as the bare print statement.
print(train_dict[u])
artistclusters[a]

{'clusters': [[264, 406], [318, 2079, 190], [], [], [442, 625, 955, 240, 317, 977, 327, 218, 157, 682]], 'plays': {'c842d29f-a297-48cd-bb71-4f77fd672b16': 682, '648615ca-ca74-460d-928a-2bae67ae6d14': 264, 'bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8': 406, 'ba99a190-6065-4930-be3d-55ecc48e365d': 317, '14387b0f-765c-4852-852f-135335790466': 977, '4449ccf6-c948-4d33-aa97-b6ad98ce4b5b': 327, '8475297d-fb78-4630-8d74-9b87b6bb7cc8': 318, 'eaf6a7ca-105d-4a94-ba02-8c3e4040319a': 190, '84eac621-1c5a-49a1-9500-555099c6e184': 2079, 'af37c51c-0790-4a29-b995-456f98a6b8c9': 442, '766a2b45-441f-4096-af05-dbbca9518c9d': 157, '3c0eb318-d2ba-45aa-9077-b83746cc56da': 240, 'eddc0911-21fc-4327-ab90-ccc459ce1ef7': 955, 'c485632c-b784-4ee9-8ea1-c5fb365681fc': 218, '5441c29d-3602-4898-b1a1-b77fa23b8e50': 625}}


1

In [46]:
# Median play count over every artist the spot-check user has in the training split.
np.median(list(train_dict[u]["plays"].values()))

327.0

In [187]:
def test_accuracy(user, artist, threshold):
    cluster = artistclusters[artist]
    try:
        user_median = np.median(train_dict[user]["plays"].values())
        cluster_median = user_median
        cluster_vals = train_dict[user]["clusters"][cluster]
        if len(cluster_vals) >= threshold:
            cluster_median = np.median(cluster_vals)

        return user_median, cluster_median
    except:
        return 118, 118

In [58]:
# Row iterator over the validation frame; it is kept in `j` so the following cell
# can keep advancing the same iterator.
j = valdf.iterrows()
# iterrows() yields (index, row) pairs; [1] selects the row itself for display.
next(j)[1]

user      da9cf3f557161d54b76f24db64be9cc76db008e3
artist        eeb1195b-f213-4ce1-b28c-8565211f8e43
plays                                          708
Name: 2, dtype: object

In [182]:
# Pull the next validation row from the shared iterator `j` and compare both
# predictors against its true play count.
row = next(j)[1]
actual = row["plays"]
# The threshold is passed explicitly (10, the value used by the full validation
# loop) so the call matches test_accuracy(user, artist, threshold).
user_median, cluster_median = test_accuracy(row["user"], row["artist"], 10)
# Positive => the cluster median was closer to the true play count than the user median.
improvement = abs(actual - user_median) - abs(actual - cluster_median)
print("%s %s %s %s" % (actual, user_median, cluster_median, improvement))

6
53 100.5 71.5 29.0


In [192]:
preds = []
print valdf.shape[0]
for i, row in enumerate(valdf.iterrows()):
    if i % 10000 == 0:
        print i,
    a = row[1]["artist"]
    u = row[1]["user"]
    actual = row[1]["plays"]
    
    user, cluster = test_accuracy(u, a, 10)
    preds.append(abs(actual - user) - abs(actual - cluster))

1663904
0 10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 110000 120000 130000 140000 150000 160000 170000 180000 190000 200000 210000 220000 230000 240000 250000 260000 270000 280000 290000 300000 310000 320000 330000 340000 350000 360000 370000 380000 390000 400000 410000 420000 430000 440000 450000 460000 470000 480000 490000 500000 510000 520000 530000 540000 550000 560000 570000 580000 590000 600000 610000 620000 630000 640000 650000 660000 670000 680000 690000 700000 710000 720000 730000 740000 750000 760000 770000 780000 790000 800000 810000 820000 830000 840000 850000 860000 870000 880000 890000 900000 910000 920000 930000 940000 950000 960000 970000 980000 990000 1000000 1010000 1020000 1030000 1040000 1050000 1060000 1070000 1080000 1090000 1100000 1110000 1120000 1130000 1140000 1150000 1160000 1170000 1180000 1190000 1200000 1210000 1220000 1230000 1240000 1250000 1260000 1270000 1280000 1290000 1300000 1310000 1320000 1330000 1340000 1350000 1360000 1370000 13

In [193]:
# Each entry of preds is |actual - user median| - |actual - cluster median|, so a
# negative total means the user-median predictions had the smaller total absolute error.
sum(preds)

-239750.5