In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read in JSON into dataframe
df = pd.read_json('StreamingHistory.json')

In [3]:
# get number of rows
half_rows = df['artistName'].count() / 2

In [4]:
# drop all columns besides artistNames
df = df.drop(columns=['trackName', 'endTime', 'msPlayed'])

In [5]:
# split into 2 dfs
df1, df2 = np.split(df, [int(half_rows)], axis=0)

In [6]:
# generate 2 dataframes with unique artists and their frequencies
df1_unique_freq = df1['artistName'].value_counts(normalize=True).rename_axis('artistName').reset_index(name='freq')
df2_unique_freq = df2['artistName'].value_counts(normalize=True).rename_axis('artistName').reset_index(name='freq')

# stretch rating scale from 0-1000
df1_unique_freq['freq'] = df1_unique_freq['freq'] * 100
df2_unique_freq['freq'] = df2_unique_freq['freq'] * 100


print(df1_unique_freq)
print('----------------------')
print(df2_unique_freq)

                            artistName      freq
0                                Flume  8.224480
1                         Janis Joplin  7.789066
2                             Jai Wolf  6.773101
3                                 GRiZ  6.627963
4    Big Brother & The Holding Company  4.547654
..                                 ...       ...
381                           Magroove  0.048379
382                             FIDLAR  0.048379
383                        Cheat Codes  0.048379
384                         Pivot Gang  0.048379
385                       Unlike Pluto  0.048379

[386 rows x 2 columns]
----------------------
                            artistName      freq
0                           Kanye West  6.192550
1                      Johannes Brahms  5.079826
2                                Flume  4.934688
3                      Vampire Weekend  4.837929
4                             Big Wild  4.450895
..                                 ...       ...
307                   

In [7]:
# create ndarray of all unique artists from both dfs
full_unique = pd.Series.append(df1_unique_freq['artistName'], df2_unique_freq['artistName']).unique()

print(full_unique.size)

536


In [8]:
# # create dataframe with combined unique artists and frequencies with NaN values filled with 0
test = pd.merge(df1_unique_freq, df2_unique_freq, on="artistName", how="outer")
print(test)

                            artistName    freq_x    freq_y
0                                Flume  8.224480  4.934688
1                         Janis Joplin  7.789066  0.290276
2                             Jai Wolf  6.773101  1.161103
3                                 GRiZ  6.627963  1.838413
4    Big Brother & The Holding Company  4.547654       NaN
..                                 ...       ...       ...
531                               Pond       NaN  0.048379
532                    CharlesTheFirst       NaN  0.048379
533                       Current Joys       NaN  0.048379
534           Nick Murphy / Chet Faker       NaN  0.048379
535  Joe Hertler & The Rainbow Seekers       NaN  0.048379

[536 rows x 3 columns]


In [9]:
# Construct full dataset and convert catagorical artistName and artistId to numerical ids
df_x = pd.DataFrame.from_dict({ 'userId' : [0 for i in range(test['artistName'].size)], 'artistId' : [i for i in range(test['artistName'].size)], 'freq' : test['freq_x']})
df_y = pd.DataFrame.from_dict({ 'userId' : [1 for i in range(test['artistName'].size)], 'artistId' : [i for i in range(test['artistName'].size)], 'freq' : test['freq_y']})
dataset = pd.DataFrame.append(df_x, df_y)

print(dataset)

     userId  artistId      freq
0         0         0  8.224480
1         0         1  7.789066
2         0         2  6.773101
3         0         3  6.627963
4         0         4  4.547654
..      ...       ...       ...
531       1       531  0.048379
532       1       532  0.048379
533       1       533  0.048379
534       1       534  0.048379
535       1       535  0.048379

[1072 rows x 3 columns]


In [29]:
# seperate dataset into dataframe with only NaN and no NaN
dataset_no_zeros = dataset[dataset['freq'].notna()]
dataset_zeros = dataset[dataset['freq'].isna()]

print(dataset_no_zeros)


     userId  artistId      freq
0         0         0  8.224480
1         0         1  7.789066
2         0         2  6.773101
3         0         3  6.627963
4         0         4  4.547654
..      ...       ...       ...
531       1       531  0.048379
532       1       532  0.048379
533       1       533  0.048379
534       1       534  0.048379
535       1       535  0.048379

[698 rows x 3 columns]


In [11]:
# # userID and artistId to string

# dataset_no_zeros['userId'] = dataset_no_zeros['userId'].astype(str)
# dataset_no_zeros['artistId'] = dataset_no_zeros['artistId'].astype(str)

# print(dataset_no_zeros.dtypes)

In [12]:
from surprise import Reader, Dataset

In [13]:
# import data into suprise dataset
reader = Reader(rating_scale=(0, 100))

data_no_zeros = Dataset.load_from_df(dataset_no_zeros[['userId', 'artistId', 'freq']], reader)

In [14]:
# split dataset into train and test
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data_no_zeros, test_size=0.2)

trainset = data_no_zeros.build_full_trainset()
print(trainset)

<surprise.trainset.Trainset object at 0x116e94e90>


In [15]:
# fit SVD model
from surprise import SVD, accuracy

algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x116cebc50>

In [16]:
# Make prediction
predictions = algo.test(testset)

In [17]:
# Test model accuracy use root mean squared error
from surprise import accuracy
accuracy.rmse(predictions)

RMSE: 0.6968


0.6967793134981018

In [18]:
# Load in full data df into suprise dataset object
data = Dataset.load_from_df(dataset[['userId', 'artistId', 'freq']], reader)

In [19]:
# cast suprise dataset object into suprise trainset object

data = data.build_full_trainset()


print(data.rating_scale)

(0, 100)


In [20]:
# Get predictions for NaN values

uids = [data.to_raw_uid(uid) for uid in dataset_zeros['userId']]
iids = [data.to_raw_iid(iid) for iid in dataset_zeros['artistId']]
r_ui =[r_ui for r_ui in dataset_zeros['freq']]

predictions = [(uids[x], iids[x], algo.predict(uids[x], iids[x], r_ui[x], verbose=True)[3]) for x in range(len(dataset_zeros['userId']))]

user: 0          item: 386        r_ui = nan   est = 0.64   {'was_impossible': False}
user: 0          item: 387        r_ui = nan   est = 0.54   {'was_impossible': False}
user: 0          item: 388        r_ui = nan   est = 0.33   {'was_impossible': False}
user: 0          item: 389        r_ui = nan   est = 0.31   {'was_impossible': False}
user: 0          item: 390        r_ui = nan   est = 0.23   {'was_impossible': False}
user: 0          item: 391        r_ui = nan   est = 0.00   {'was_impossible': False}
user: 0          item: 392        r_ui = nan   est = 0.40   {'was_impossible': False}
user: 0          item: 393        r_ui = nan   est = 0.00   {'was_impossible': False}
user: 0          item: 394        r_ui = nan   est = 0.27   {'was_impossible': False}
user: 0          item: 395        r_ui = nan   est = 0.10   {'was_impossible': False}
user: 0          item: 396        r_ui = nan   est = 0.10   {'was_impossible': False}
user: 0          item: 397        r_ui = nan   est = 0

In [21]:
print(predictions)

[(0, 386, 0.6356989944131264), (0, 387, 0.5412493162101129), (0, 388, 0.3276706037688282), (0, 389, 0.3135833358946023), (0, 390, 0.22597024431264434), (0, 391, 0), (0, 392, 0.4028286073358618), (0, 393, 0), (0, 394, 0.26769101084384217), (0, 395, 0.09628307017009896), (0, 396, 0.10103575266344622), (0, 397, 0.012405806493363869), (0, 398, 0.021683554297411867), (0, 399, 0.19389248228299416), (0, 400, 0.18598129374756645), (0, 401, 0.1741761563772919), (0, 402, 0.21195159330685054), (0, 403, 0.06999748665349848), (0, 404, 0), (0, 405, 0.33120307752502876), (0, 406, 0.20438967011098796), (0, 407, 0.15960930002980156), (0, 408, 0.06676795051813444), (0, 409, 0.23182482215107103), (0, 410, 0.17454832763540518), (0, 411, 0.5186598533850857), (0, 412, 0), (0, 413, 0), (0, 414, 0.22859130747027923), (0, 415, 0.0013987396715977596), (0, 416, 0.15354958122329676), (0, 417, 0.24658127120634088), (0, 418, 0.03682226716176862), (0, 419, 0.12555107105756136), (0, 420, 0.4074334668382382), (0, 421,

In [40]:
# merge NaN values in original dataset with predicted values

predicted_df = pd.DataFrame(predictions, columns=['userId', 'artistId', 'freq'])

# dataset['freq'] = dataset[(dataset['userId'] == predicted_df['userId'] & dataset['artistId'] == predicted_df['artistId'])['freq']]



# dataset_no_zeros = dataset_no_zeros.drop(columns='userId')
# predicted_df = predicted_df.drop(columns='userId')

print(dataset_no_zeros)
print(predicted_df)

#final_vectors = pd.merge(dataset_no_zeros, predicted_df, on=['artistId'], how='inner')
#final_vectors = dataset_no_zeros.join(predicted_df, on='artistId')
final_vectors = pd.concat([dataset_no_zeros, predicted_df], ignore_index=True, sort=False)
print(final_vectors)

     userId  artistId      freq
0         0         0  8.224480
1         0         1  7.789066
2         0         2  6.773101
3         0         3  6.627963
4         0         4  4.547654
..      ...       ...       ...
531       1       531  0.048379
532       1       532  0.048379
533       1       533  0.048379
534       1       534  0.048379
535       1       535  0.048379

[698 rows x 3 columns]
     userId  artistId      freq
0         0       386  0.635699
1         0       387  0.541249
2         0       388  0.327671
3         0       389  0.313583
4         0       390  0.225970
..      ...       ...       ...
369       1       378  0.021105
370       1       381  0.135982
371       1       382  0.273068
372       1       383  0.431878
373       1       385  0.438708

[374 rows x 3 columns]
      userId  artistId      freq
0          0         0  8.224480
1          0         1  7.789066
2          0         2  6.773101
3          0         3  6.627963
4          0       

In [47]:
user_1_vector = final_vectors[final_vectors['userId'] == 0]['freq']
user_2_vector = final_vectors[final_vectors['userId'] == 1]['freq']

print(user_1_vector)
print(user_2_vector)

0      8.224480
1      7.789066
2      6.773101
3      6.627963
4      4.547654
         ...   
843    0.223495
844    0.273185
845    0.000000
846    0.416023
847    0.103519
Name: freq, Length: 536, dtype: float64
386     4.934688
387     0.290276
388     1.161103
389     1.838413
390     2.757620
          ...   
1067    0.021105
1068    0.135982
1069    0.273068
1070    0.431878
1071    0.438708
Name: freq, Length: 536, dtype: float64


In [49]:
from scipy.spatial.distance import cosine

In [50]:
user_similarity = cosine(user_1_vector, user_2_vector)
print(user_similarity)

0.5319256042324694
