In [1]:
import numpy as np
import pandas as pd

In [3]:
# Read in JSON into dataframe
df = pd.read_json('StreamingHistory.json')

In [4]:
# get number of rows
half_rows = df['artistName'].count() / 2

In [5]:
# drop all columns besides artistNames
df = df.drop(columns=['trackName', 'endTime', 'msPlayed'])

In [6]:
# split into 2 dfs
df1, df2 = np.split(df, [int(half_rows)], axis=0)

In [7]:
# generate 2 dataframes with unique artists and their frequencies
df1_unique_freq = df1['artistName'].value_counts(normalize=True).rename_axis('artistName').reset_index(name='freq')
df2_unique_freq = df2['artistName'].value_counts(normalize=True).rename_axis('artistName').reset_index(name='freq')

print(df1_unique_freq)
print('----------------------')
print(df2_unique_freq)

                            artistName      freq
0                                Flume  0.082245
1                         Janis Joplin  0.077891
2                             Jai Wolf  0.067731
3                                 GRiZ  0.066280
4    Big Brother & The Holding Company  0.045477
..                                 ...       ...
381                               Cozz  0.000484
382                         The Police  0.000484
383                        The Beatles  0.000484
384                              Cults  0.000484
385                          The-Dream  0.000484

[386 rows x 2 columns]
----------------------
          artistName      freq
0         Kanye West  0.061925
1    Johannes Brahms  0.050798
2              Flume  0.049347
3    Vampire Weekend  0.048379
4           Big Wild  0.044509
..               ...       ...
307      The Strokes  0.000484
308    Booty&theKidd  0.000484
309            Arlie  0.000484
310     Good Morning  0.000484
311  The Mary Nixons  0.

In [8]:
# create ndarray of all unique artists from both dfs
full_unique = pd.Series.append(df1_unique_freq['artistName'], df2_unique_freq['artistName']).unique()

In [60]:
# # create dataframe with combined unique artists and frequencies with NaN values filled with 0
test = pd.merge(df1_unique_freq, df2_unique_freq, on="artistName", how="outer").fillna(0)
print(test)

                            artistName    freq_x  \
0                                Flume  0.082245   
1                         Janis Joplin  0.077891   
2                             Jai Wolf  0.067731   
3                                 GRiZ  0.066280   
4    Big Brother & The Holding Company  0.045477   
..                                 ...       ...   
531                         Miike Snow  0.000000   
532               Ludwig van Beethoven  0.000000   
533                      Booty&theKidd  0.000000   
534                              Arlie  0.000000   
535                    The Mary Nixons  0.000000   

                             artisName                    artistName_full  \
0                                Flume                              Flume   
1                         Janis Joplin                       Janis Joplin   
2                             Jai Wolf                           Jai Wolf   
3                                 GRiZ                             

In [76]:
# Construct final dataset
df_x = pd.DataFrame.from_dict({ 'userId' : [0 for i in range(test['artistName'].size)], 'artistId' : [i for i in range(test['artistName'].size)], 'freq' : test['freq_x']})
df_y = pd.DataFrame.from_dict({ 'userId' : [1 for i in range(test['artistName'].size)], 'artistId' : [i for i in range(test['artistName'].size)], 'freq' : test['freq_y']})
dataset = pd.DataFrame.append(df_x, df_y)
print(dataset)

     userId  artistId      freq
0         0         0  0.082245
1         0         1  0.077891
2         0         2  0.067731
3         0         3  0.066280
4         0         4  0.045477
..      ...       ...       ...
531       1       531  0.000484
532       1       532  0.000484
533       1       533  0.000484
534       1       534  0.000484
535       1       535  0.000484

[1072 rows x 3 columns]


In [88]:
dataset_no_zeros = dataset[dataset['freq'] != 0]
dataset_zeros = dataset[dataset['freq'] != 0]
print(dataset_no_zeros)

     userId  artistId      freq
0         0         0  0.082245
1         0         1  0.077891
2         0         2  0.067731
3         0         3  0.066280
4         0         4  0.045477
..      ...       ...       ...
531       1       531  0.000484
532       1       532  0.000484
533       1       533  0.000484
534       1       534  0.000484
535       1       535  0.000484

[698 rows x 3 columns]


In [66]:
from surprise import Reader, Dataset

In [97]:
# import data into suprise dataset
reader = Reader()

data_no_zeros = Dataset.load_from_df(dataset_no_zeros[['userId', 'artistId', 'freq']], reader)

In [102]:
# split dataset into train and test
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data_no_zeros, test_size=0.25)
print(testset)

[(1, 68, 0.0009675858732462506), (0, 81, 0.001451378809869376), (1, 420, 0.001451378809869376), (0, 145, 0.0009675858732462506), (0, 119, 0.0009675858732462506), (0, 159, 0.0009675858732462506), (1, 196, 0.0004837929366231253), (1, 422, 0.001451378809869376), (1, 509, 0.0004837929366231253), (0, 334, 0.0004837929366231253), (0, 235, 0.0004837929366231253), (0, 222, 0.0004837929366231253), (0, 237, 0.0004837929366231253), (0, 306, 0.0004837929366231253), (1, 500, 0.0004837929366231253), (0, 15, 0.012094823415578132), (0, 228, 0.0004837929366231253), (0, 259, 0.0004837929366231253), (0, 308, 0.0004837929366231253), (0, 57, 0.0019351717464925011), (1, 92, 0.0019351717464925011), (0, 111, 0.0009675858732462506), (0, 8, 0.02467343976777939), (0, 330, 0.0004837929366231253), (1, 36, 0.004837929366231253), (1, 239, 0.0004837929366231253), (1, 459, 0.0009675858732462506), (1, 431, 0.0009675858732462506), (1, 302, 0.0019351717464925011), (0, 20, 0.006289308176100629), (0, 223, 0.000483792936623

In [84]:
# fit SVD model
from surprise import SVD, accuracy

algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x119eb7150>

In [85]:
# Make prediction
predictions = algo.test(testset)

In [86]:
# Test model accuracy use root mean squared error
from surprise import accuracy
accuracy.rmse(predictions)

RMSE: 0.9971


0.9970894202423917

In [89]:
data_zeros = Dataset.load_from_df(dataset_zeros[['userId', 'artistId', 'freq']], reader)

In [103]:
print(type(data_zeros))

trainset, testset = train_test_split(data_zeros, test_size=1)



<class 'surprise.dataset.DatasetAutoFolds'>


In [104]:
predictions = algo.test(testset)

[Prediction(uid=1, iid=195, r_ui=0.0004837929366231253, est=1, details={'was_impossible': False})]
