In [1]:
import pandas as pd
from surprise import SVDpp
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict
import pickle

In [2]:
triplets =  pd.read_csv('../data/raw/TasteProfile/train_triplets.txt', sep='\t', names=['userID', 'songID', 'playCount'], nrows=50000)
triplets['rating'] = triplets['playCount'].apply(lambda x: x/(max(triplets['playCount'])/5))

#Usamos Reader() del paquete Surprise para poner los datos en el formato que nos piden los algoritmos
reader = Reader(rating_scale=(min(triplets['playCount']), max(triplets['playCount'])/5))

In [3]:
triplets.drop(columns=['playCount'], inplace=True)

In [5]:
triplets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userID  50000 non-null  object 
 1   songID  50000 non-null  object 
 2   rating  50000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.1+ MB


In [4]:
data = Dataset.load_from_df(triplets, reader)

#Separo en train y test
train, test = train_test_split(data, test_size=0.25, random_state=42)
#trainset = data.build_full_trainset()

In [6]:
file = open('model_small.pickle', 'rb')
model = pickle.load(file)


#preds = model.test(test)

#Métricas de evaluacin
#accuracy.mae(preds)
#accuracy.rmse(preds)
#accuracy.mse(preds)
#model.bi

In [10]:
from surprise.similarities import pearson

pearson()

TypeError: pearson() takes exactly 3 positional arguments (0 given)

In [8]:
recs = pd.DataFrame(columns=['songID', 'prediction'])
userID = triplets['userID'].unique()[16]
songs = triplets['songID'].unique()
len(songs)

model.test(test)



[Prediction(uid='2475b003df9c084c1488b2e63666e8cc87112180', iid='SONYYEV12A6701E109', r_ui=0.0026455026455026454, est=0.0026455026455026454, details={'was_impossible': False}),
 Prediction(uid='9b51a95afc171f1c44245ace88fa930c0e63beaa', iid='SODLNTE12AF72AD26A', r_ui=0.007936507936507936, est=0.0026455026455026454, details={'was_impossible': False}),
 Prediction(uid='76235885b32c4e8c82760c340dc54f9b608d7d7e', iid='SOUPKIK12AF72A69E5', r_ui=0.0026455026455026454, est=0.00482227299840433, details={'was_impossible': False}),
 Prediction(uid='16f5dc37b96c153c462bf306ceef36112d36346e', iid='SOQVMPG12A6701E8AA', r_ui=0.0026455026455026454, est=0.011488280950885969, details={'was_impossible': False}),
 Prediction(uid='25fc0200450bbf726c8511fabe31ecfdb81732eb', iid='SOILFUU12AB017C75F', r_ui=0.021164021164021163, est=0.025138727953387195, details={'was_impossible': False}),
 Prediction(uid='6df748807f979d27c638229f6dd1d39a9a0b7f61', iid='SOZDGEW12A8C13E748', r_ui=0.0026455026455026454, est=0.0

In [120]:
max(triplets['rating'])

5.0

In [9]:
recs = pd.DataFrame(columns=['songID', 'r_ui', 'est', 'details'])
for i in songs:
    prediction = model.predict(userID, i)
    print('User', userID, 'SongID', i, prediction)
    recs = recs.append({'songID': i, 'r_ui': prediction[2], 'est': prediction[3], 'details': prediction[4]}, ignore_index=True)

User baf47ed8da24d607e50d8684cde78b923538640f SongID SOAKIMP12A8C130995 user: baf47ed8da24d607e50d8684cde78b923538640f item: SOAKIMP12A8C130995 r_ui = None   est = 0.01   {'was_impossible': False}
User baf47ed8da24d607e50d8684cde78b923538640f SongID SOAPDEY12A81C210A9 user: baf47ed8da24d607e50d8684cde78b923538640f item: SOAPDEY12A81C210A9 r_ui = None   est = 0.00   {'was_impossible': False}
User baf47ed8da24d607e50d8684cde78b923538640f SongID SOBBMDR12A8C13253B user: baf47ed8da24d607e50d8684cde78b923538640f item: SOBBMDR12A8C13253B r_ui = None   est = 0.03   {'was_impossible': False}
User baf47ed8da24d607e50d8684cde78b923538640f SongID SOBFNSP12AF72A0E22 user: baf47ed8da24d607e50d8684cde78b923538640f item: SOBFNSP12AF72A0E22 r_ui = None   est = 0.00   {'was_impossible': False}
User baf47ed8da24d607e50d8684cde78b923538640f SongID SOBFOVM12A58A7D494 user: baf47ed8da24d607e50d8684cde78b923538640f item: SOBFOVM12A58A7D494 r_ui = None   est = 0.00   {'was_impossible': False}
User baf47ed8da

In [126]:
print(recs['est'].min(), recs['est'].max())
print(triplets['rating'].min(), triplets['rating'].max())

0.0026455026455026454 0.5765286778617237
0.0026455026455026454 5.0


In [20]:
from surprise.model_selection import cross_validate

triplets =  pd.read_csv('../data/raw/TasteProfile/train_triplets.txt', sep='\t', names=['userID', 'itemID', 'rating'], nrows=500000)


#Usamos Reader() del paquete Surprise para poner los datos en el formato que nos piden los algoritmos
reader = Reader(rating_scale=(max(triplets['rating']), min(triplets['rating'])), sep='\t')

data = Dataset.load_from_df(triplets, reader=reader)

# We can now use this dataset as we please, e.g. calling cross_validate
#cross_validate(SVDpp(), data, verbose=True)

In [22]:
trainset = data.build_full_trainset()

In [18]:
model.test(test)

[Prediction(uid='e9b86ca8cd62f072419112174f1a9427dd40aed7', iid='SONWEHY12A58A796B5', r_ui=8.0, est=5, details={'was_impossible': False}),
 Prediction(uid='818d363c191b02180f51d8569eb65ee8f2bdf888', iid='SOLRGNF12AB0187CF4', r_ui=1.0, est=5, details={'was_impossible': False}),
 Prediction(uid='a91f68a73e9bda71a0d6813733357bfeb3fb84e4', iid='SOENSSK12A8AE478BF', r_ui=1.0, est=5, details={'was_impossible': False}),
 Prediction(uid='2e9d0027ffacde0c33060f4bc214664215804e9c', iid='SOXHUQJ12A8C13C50E', r_ui=3.0, est=5, details={'was_impossible': False}),
 Prediction(uid='26a456d4c8de2376f69a87e6adb581e5a948479a', iid='SOEVOXS12A8C1398CE', r_ui=1.0, est=5, details={'was_impossible': False}),
 Prediction(uid='9889f7c0af7388bad9e91800476e2716866e6417', iid='SOEYSPT12A6D4F76AF', r_ui=4.0, est=5, details={'was_impossible': False}),
 Prediction(uid='a520488fcf049bbb5cd847cfa4f884c740692780', iid='SOXXXTF12A6D4F99BF', r_ui=1.0, est=5, details={'was_impossible': False}),
 Prediction(uid='c9c71a6525

---------------------------------------

In [None]:
df_rating =  pd.read_csv('../data/raw/TasteProfile/train_triplets.txt', sep='\t', names=['userID', 'songID', 'playCount'])

In [None]:
df_rating_table = df_rating[:10000].pivot_table(index='userID', columns='songID', values='playCount')
df_rating_table

In [11]:
#% de la tabla con valores vacios
df_rating_table.isna().values.sum() / df_rating_table.size

0.9926355627203347

In [39]:
#Saco el test de la tabla
test_table = df_rating_table.iloc[:10, :235].copy().values

In [40]:
test_table[~np.isnan(test_table)]

array([1., 1., 4., 1., 1., 1., 7., 1., 1., 3., 1., 1., 2., 3., 1., 1., 1.])

In [36]:
~test_table.isna()

AttributeError: 'numpy.ndarray' object has no attribute 'isna'

In [2]:
import math as mt
import csv
from sparsesvd import sparsesvd #used for matrix factorization
import numpy as np
from scipy.sparse import csc_matrix #used for sparse matrix
from scipy.sparse.linalg import * #used for matrix multiplication

In [3]:
#constants defining the dimensions of our User Rating Matrix (URM)
MAX_PID = 4
MAX_UID = 5

#Compute SVD of the user ratings matrix
def computeSVD(urm, K):
    U, s, Vt = sparsesvd(urm, K)

    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        S[i,i] = mt.sqrt(s[i])

    U = csc_matrix(np.transpose(U), dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)

    return U, S, Vt

#Compute estimated rating for the test user
def computeEstimatedRatings(urm, U, S, Vt, uTest, K, test):
    rightTerm = S*Vt

    estimatedRatings = np.zeros(shape=(MAX_UID, MAX_PID), dtype=np.float16)
    for userTest in uTest:
        prod = U[userTest, :]*rightTerm
        #we convert the vector to dense format in order to get the indices
        #of the movies with the best estimated ratings
        estimatedRatings[userTest, :] = prod.todense()
        recom = (-estimatedRatings[userTest, :]).argsort()[:250]
    return recom

In [4]:
#Used in SVD calculation (number of latent factors)
K=2

#Initialize a sample user rating matrix
urm = np.array([[3, 1, 2, 3],[4, 3, 4, 3],[3, 2, 1, 5], [1, 6, 5, 2], [5, 0,0 , 0]])
urm = csc_matrix(urm, dtype=np.float32)

#Compute SVD of the input user ratings matrix
U, S, Vt = computeSVD(urm, K)

#Test user set as user_id 4 with ratings [0, 0, 5, 0]
uTest = [4]
print("User id for whom recommendations are needed: %d" % uTest[0])

#Get estimated rating for test user
print("Predictied ratings:")
uTest_recommended_items = computeEstimatedRatings(urm, U, S, Vt, uTest, K, True)
print(uTest_recommended_items)

User id for whom recommendations are needed: 4
Predictied ratings:
[0 3 2 1]
