In [1]:
import numpy as np
import pandas as pd

In [3]:
# Read in JSON into dataframe
df = pd.read_json('StreamingHistory.json')

In [4]:
# get number of rows
half_rows = df['artistName'].count() / 2

In [5]:
# drop all columns besides artistNames
df = df.drop(columns=['trackName', 'endTime', 'msPlayed'])

In [6]:
# split into 2 dfs
df1, df2 = np.split(df, [int(half_rows)], axis=0)

In [158]:
# generate 2 dataframes with unique artists and their frequencies
df1_unique_freq = df1['artistName'].value_counts(normalize=True).rename_axis('artistName').reset_index(name='freq')
df2_unique_freq = df2['artistName'].value_counts(normalize=True).rename_axis('artistName').reset_index(name='freq')


df1_unique_freq['freq'] = pd.Series.round(df1_unique_freq['freq'] * 10000)
df2_unique_freq['freq'] = pd.Series.round(df2_unique_freq['freq'] * 10000)


print(df1_unique_freq)
print('----------------------')
print(df2_unique_freq)

                            artistName   freq
0                                Flume  822.0
1                         Janis Joplin  779.0
2                             Jai Wolf  677.0
3                                 GRiZ  663.0
4    Big Brother & The Holding Company  455.0
..                                 ...    ...
381             A Boogie Wit da Hoodie    5.0
382                               Cozz    5.0
383                               Tyga    5.0
384                            6ix9ine    5.0
385                           Galantis    5.0

[386 rows x 2 columns]
----------------------
          artistName   freq
0         Kanye West  619.0
1    Johannes Brahms  508.0
2              Flume  493.0
3    Vampire Weekend  484.0
4           Big Wild  445.0
..               ...    ...
307    Chrome Sparks    5.0
308       Miike Snow    5.0
309       Boz Scaggs    5.0
310   Electric Guest    5.0
311        Sure Sure    5.0

[312 rows x 2 columns]


In [8]:
# create ndarray of all unique artists from both dfs
full_unique = pd.Series.append(df1_unique_freq['artistName'], df2_unique_freq['artistName']).unique()

In [60]:
# # create dataframe with combined unique artists and frequencies with NaN values filled with 0
test = pd.merge(df1_unique_freq, df2_unique_freq, on="artistName", how="outer").fillna(0)
print(test)

                            artistName    freq_x  \
0                                Flume  0.082245   
1                         Janis Joplin  0.077891   
2                             Jai Wolf  0.067731   
3                                 GRiZ  0.066280   
4    Big Brother & The Holding Company  0.045477   
..                                 ...       ...   
531                         Miike Snow  0.000000   
532               Ludwig van Beethoven  0.000000   
533                      Booty&theKidd  0.000000   
534                              Arlie  0.000000   
535                    The Mary Nixons  0.000000   

                             artisName                    artistName_full  \
0                                Flume                              Flume   
1                         Janis Joplin                       Janis Joplin   
2                             Jai Wolf                           Jai Wolf   
3                                 GRiZ                             

In [162]:
# Construct final dataset
df_x = pd.DataFrame.from_dict({ 'userId' : [0 for i in range(test['artistName'].size)], 'artistId' : [i for i in range(test['artistName'].size)], 'freq' : test['freq_x']})
df_y = pd.DataFrame.from_dict({ 'userId' : [1 for i in range(test['artistName'].size)], 'artistId' : [i for i in range(test['artistName'].size)], 'freq' : test['freq_y']})
dataset = pd.DataFrame.append(df_x, df_y)

print(df_x)

     userId  artistId   freq
0         0         0  822.0
1         0         1  779.0
2         0         2  677.0
3         0         3  663.0
4         0         4  455.0
..      ...       ...    ...
531       0       531    NaN
532       0       532    NaN
533       0       533    NaN
534       0       534    NaN
535       0       535    NaN

[536 rows x 3 columns]


In [198]:
dataset_no_zeros = dataset[dataset['freq'].notna()]
dataset_zeros = dataset[dataset['freq'].isna()]

print(dataset_no_zeros)

     userId  artistId   freq
0         0         0  822.0
1         0         1  779.0
2         0         2  677.0
3         0         3  663.0
4         0         4  455.0
..      ...       ...    ...
531       1       531    5.0
532       1       532    5.0
533       1       533    5.0
534       1       534    5.0
535       1       535    5.0

[698 rows x 3 columns]


In [213]:
# to string

dataset_no_zeros['userId'] = dataset_no_zeros['userId'].astype(str)
dataset_no_zeros['artistId'] = dataset_no_zeros['artistId'].astype(str)

print(dataset_no_zeros.dtypes)

userId       object
artistId     object
freq        float64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [214]:
from surprise import Reader, Dataset

In [215]:
# import data into suprise dataset
reader = Reader()

data_no_zeros = Dataset.load_from_df(dataset_no_zeros[['userId', 'artistId', 'freq']], reader)

In [216]:
# split dataset into train and test
from surprise.model_selection import train_test_split

# trainset, testset = train_test_split(data_no_zeros, test_size=0.01)

trainset = data_no_zeros.build_full_trainset()
print(trainset)

<surprise.trainset.Trainset object at 0x11d8445d0>


In [217]:
# fit SVD model
from surprise import SVD, accuracy

algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11d85a790>

In [218]:
# Make prediction
predictions = algo.test(testset)

In [219]:
# Test model accuracy use root mean squared error
from surprise import accuracy
accuracy.rmse(predictions)

RMSE: nan


nan

In [220]:
data_zeros = Dataset.load_from_df(dataset_zeros[['userId', 'artistId', 'freq']], reader)

In [225]:
print(type(data_zeros))

trainset, testset = train_test_split(data_zeros, test_size=.999)

<class 'surprise.dataset.DatasetAutoFolds'>


In [238]:

predictions = algo.test(testset)

uids = testset[:]
print(uids)

[(1, 385, nan), (1, 202, nan), (1, 306, nan), (1, 368, nan), (0, 485, nan), (0, 418, nan), (1, 292, nan), (0, 405, nan), (0, 466, nan), (1, 350, nan), (1, 263, nan), (1, 148, nan), (1, 163, nan), (1, 375, nan), (0, 491, nan), (0, 416, nan), (1, 114, nan), (1, 358, nan), (1, 199, nan), (1, 60, nan), (1, 92, nan), (0, 394, nan), (0, 528, nan), (0, 497, nan), (1, 204, nan), (1, 152, nan), (0, 500, nan), (0, 435, nan), (1, 195, nan), (0, 413, nan), (1, 223, nan), (0, 434, nan), (1, 371, nan), (0, 511, nan), (1, 133, nan), (0, 457, nan), (1, 331, nan), (0, 401, nan), (1, 365, nan), (0, 467, nan), (0, 403, nan), (0, 468, nan), (1, 327, nan), (0, 483, nan), (1, 75, nan), (1, 94, nan), (0, 425, nan), (1, 344, nan), (0, 471, nan), (1, 211, nan), (0, 447, nan), (0, 431, nan), (1, 46, nan), (0, 488, nan), (0, 438, nan), (0, 470, nan), (0, 525, nan), (1, 151, nan), (1, 128, nan), (0, 480, nan), (1, 180, nan), (1, 194, nan), (1, 138, nan), (0, 392, nan), (1, 91, nan), (1, 239, nan), (0, 486, nan), 

[Prediction(uid=1, iid=195, r_ui=0.0004837929366231253, est=1, details={'was_impossible': False})]
