## Filtro Colaborativo

COSINE
K NEAREST NEIGHBOURS
USER BASED

In [93]:
import pandas as pd

df = pd.read_csv('dataSetLimpo.csv')

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import KNNWithMeans
from surprise.model_selection import train_test_split

# Utiliza o objeto Reader para converter o dataframe em um objeto Dataset da biblioteca surprise
reader = Reader()
data = Dataset.load_from_df(df[['UserID', 'TrackID', 'ImplicitRating']], reader)

# Constroi o set de treino
trainSet = data.build_full_trainset()

sim_options = {'name': 'cosine',
                'user_based': True}

model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()
print(simsMatrix)
print()

k = 20
testSubject = 0

# só serão escolhidos usuarios que compartilharem pelo menos uma musica com o usuario teste
minThresholdSongs = 3

# Get top N similar users to our test subject
testUserInnerID = trainSet.to_inner_uid(testSubject)
similarityRow = simsMatrix[testUserInnerID]

# Build a dictionary of stuff the user has already seen
watched = {}
for itemID, rating in trainSet.ur[testUserInnerID]:
    watched[itemID] = 1

similarUsers = []
for innerID, score in enumerate(similarityRow):
    if innerID != testUserInnerID:
        theirRatings = trainSet.ur[innerID]
        commonSongs = [rating[0] for rating in theirRatings if rating[0] in watched]
        if len(commonSongs) >= minThresholdSongs:
            similarUsers.append((innerID, score))

print(f'after applying threshold: {len(similarUsers)}')

kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
print()

# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
candidates = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainSet.ur[innerID]
    for rating in theirRatings:
        candidates[rating[0]] += (rating[1]) * userSimilarityScore

numberOfTracks = 30
# Get top-rated items from similar users:
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        trackID = trainSet.to_raw_iid(itemID)
        print(df.loc[df['TrackID'] == trackID, ['TrackName', 'ArtistName', 'TrackTags', 'ArtistTags', 'SimilarArtists']].values[0], ratingSum)
        pos += 1
        if (pos > numberOfTracks):
            break

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 1. 0. 1.]
 [0. 0. 1. ... 0. 1. 0.]
 [0. 0. 0. ... 1. 0. 1.]]

after applying threshold: 49

['seigfried' 'frankocean' list(['dreampop', 'rnb', 'neosoul'])
 list(['rnb', 'soul', 'hiphop'])
 list(['tyler', 'thecreator', 'childishgambino'])] 6.440473979636765
['thinkinboutyou' 'frankocean' list(['rnb', 'soul'])
 list(['rnb', 'soul', 'hiphop'])
 list(['tyler', 'thecreator', 'childishgambino'])] 6.1600650405434765
['lost' 'frankocean' list(['rnb', 'soul']) list(['rnb', 'soul', 'hiphop'])
 list(['tyler', 'thecreator', 'childishgambino'])] 5.636129090527508
['earfquake' 'tyler,thecreator' list(['neosoul', 'soul', 'alternative'])
 list(['hiphop', 'rap', 'ofwgkta'])
 list(['ofwgkta', 'kendricklamar', 'childishgambino'])] 4.881731608578837
['badreligion

### Evaluation of User Based

In [94]:
from surprise.model_selection import LeaveOneOut
import pandas as pd
import numpy as np
from surprise import KNNBasic
import heapq
from collections import defaultdict
from operator import itemgetter
import math



# Utiliza o objeto Reader para converter o dataframe em um objeto Dataset da biblioteca surprise
reader = Reader()
data = Dataset.load_from_df(df[['UserID', 'TrackID', 'NormalizedCounts']], reader)


sim_options = {'name': 'cosine',
            'user_based': True}

model = KNNBasic(sim_options=sim_options)

# Create a test set with leave-one-out strategy
loo = LeaveOneOut(n_splits=10)
hits = 0
total = 0


for i in range(100):
    for trainSet, testSet in loo.split(data):

        # Build the model using the training set
        model.fit(trainSet)
        simsMatrix = model.compute_similarities()
        print(simsMatrix)
        print()

        # Get the user from the test set
        testUser = testSet[i][0]

        k = 20

        # só serão escolhidos usuarios que compartilharem pelo menos uma musica com o usuario teste
        minThresholdSongs = 3

        # Get top N similar users to our test subject
        testUserInnerID = trainSet.to_inner_uid(testUser)
        similarityRow = simsMatrix[testUserInnerID]

        # Build a dictionary of stuff the user has already seen
        watched = {}
        for itemID, rating in trainSet.ur[testUser]:
            watched[itemID] = 1

        similarUsers = []
        for innerID, score in enumerate(similarityRow):
            if innerID != testUserInnerID:
                theirRatings = trainSet.ur[innerID]
                commonSongs = [rating[0] for rating in theirRatings if rating[0] in watched]
                if len(commonSongs) >= minThresholdSongs:
                    similarUsers.append((innerID, score))

        kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
        print()

        # Calculate the hit rate for the current test user
        userHits = 0
        userTotal = 0

        # Get top-rated items from similar users
        candidates = defaultdict(float)
        for similarUser in kNeighbors:
            innerID = similarUser[0]
            userSimilarityScore = similarUser[1]
            theirRatings = trainSet.ur[innerID]
            for rating in theirRatings:
                candidates[rating[0]] += (rating[1]) * userSimilarityScore


        numberOfTracks = 30
        pos = 0
        for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
            if not itemID in watched:
                trackID = trainSet.to_raw_iid(itemID)
                print(df.loc[df['TrackID'] == trackID, ['TrackName', 'ArtistName', 'ArtistTags', 'TrackTags']].values[0], ratingSum)

                if pos < numberOfTracks:
                    if trackID == testSet[0][1]:
                        userHits += 1
                    pos += 1
                else:
                    break

        userTotal += 1

        # Accumulate the hits and total across all test users
        hits += userHits
        total += userTotal

# Calculate the overall hit rate
hit_rate = hits / total
print(f"Overall Hit rate: {hit_rate}")


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 1. 0. 1.]
 [0. 0. 1. ... 0. 1. 0.]
 [0. 0. 0. ... 1. 0. 1.]]


['seigfried' 'frankocean' list(['rnb', 'soul', 'hiphop'])
 list(['dreampop', 'rnb', 'neosoul'])] 3.5892680350813246
['thinkinboutyou' 'frankocean' list(['rnb', 'soul', 'hiphop'])
 list(['rnb', 'soul'])] 3.280130573620124
['earfquake' 'tyler,thecreator' list(['hiphop', 'rap', 'ofwgkta'])
 list(['neosoul', 'soul', 'alternative'])] 2.69767498235859
['seeyouagain(feat.kaliuchis)' 'tyler,thecreator'
 list(['hiphop', 'rap', 'ofwgkta']) list(['rap', 'rnb'])] 2.662250448649376
['badreligion' 'frankocean' list(['rnb', 'soul', 'hiphop'])
 list(['beautiful', 'rnb', 'soul'])] 2.65
['lost' 'frankocean' list(['rnb', 'soul', 'hiphop']) list(['rnb', 'soul'])] 2.5891222579282007
['smallworlds' 'mac

COSINE
K NEIGHBOURHOOD
ITEM BASED


Complexidade para um unico usuario:
calcular pesos entre esse usuario e todos os outros (O(MN))
calcular scores  para todos os items O(MN) (M items, somar N termos para cada item)
Sort os scores O(MlogM)
Total = O(MN) + O(MlogM)

Calcular similaridade é de complexidade O(MN)

48.2 GB de RAM (impossivel)

Ideia: Subset com top n users e top m musicas
Outra ideia: Precomputar User-User Weights
Outra ideia: Para o score, apenas somar os scores de top K users (K Neighbours  )