In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [None]:
# Load the dataset from a CSV located relative to the notebook's working directory.
# Later cells read the columns TrackTags, AlbumTags, ArtistTags, SimilarArtists,
# UserID, TrackID and ImplicitRating from this frame.
# NOTE(review): the file name suggests this is an already-cleaned export -- confirm its provenance.
df = pd.read_csv('dataSetLimpo.csv')

def convert_to_list(string):
    """Parse a stringified tag list into at most three normalized tags.

    The input is expected to look like ``"['Hip-Hop', 'indie rock']"``. The
    text is split on commas; brackets and single quotes are removed, and every
    tag is lower-cased with spaces and hyphens stripped out.

    Parameters
    ----------
    string : str or float or None
        Raw cell value. Anything that is not a string (for example the float
        NaN pandas uses for missing cells, or None) is treated as "no tags".

    Returns
    -------
    list of str
        The first three non-empty normalized tags, in their original order.
    """
    # Missing cells arrive as float NaN; any non-text value simply has no tags.
    if not isinstance(string, str):
        return []

    cleaned = []
    for element in string.strip().split(','):
        element = element.strip()
        element = element.replace('[', '').replace(']', '').replace("'", '')
        element = element.lower().replace(" ", "").replace("-", "")
        # Inputs such as "[]" or "['rock', '']" leave empty leftovers behind;
        # keeping them would turn '' into a real tag downstream.
        if element:
            cleaned.append(element)

    # Only the first three tags are kept.
    return cleaned[:3]

# Replace the raw string in every tag-like column with the list of cleaned
# tags produced by convert_to_list.
for tag_column in ('TrackTags', 'AlbumTags', 'ArtistTags', 'SimilarArtists'):
    df[tag_column] = df[tag_column].apply(convert_to_list)

In [None]:
# Build the vocabulary of track tags.
# - Tracks whose tag list is empty explode to NaN; NaN must not become a column.
# - Empty strings are not real tags either.
# - Sorting keeps the column order reproducible between runs (the iteration
#   order of a set of strings is not).
trackTagValues = df['TrackTags'].explode().dropna()
uniqueTagList = sorted(tag for tag in set(trackTagValues) if tag)
# The first column of the tag matrix holds the track identifier.
uniqueTagList.insert(0, "TrackID")

print(uniqueTagList)

['TrackID', 'trap', 'hardrock', 'femalevocalists', 'punkrock', 'soul', 'postpunk', 'industrial', 'dancepop', 'latin', 'cloudrap', 'pop', '2020s', 'chamberpop', 'disco', 'jpop', 'funk', 'idm', 'singersongwriter', 'british', 'dreampop', 'cover', '90s', 'indiepop', 'psychedelic', 'psychedelicrock', 'piano', 'poprap', 'numetal', 'chill', 'beautiful', 'experimental', 'instrumental', 'poppunk', 'classicrock', 'korean', 'punk', 'kpop', 'house', 'alternativemetal', 'psychedelicpop', 'experimentalhiphop', 'neosoul', 'deathmetal', 'jazz', 'love', 'downtempo', 'thrashmetal', 'ambient', 'triphop', 'rock', 'posthardcore', 'blackmetal', 'hardcore', 'sad', 'progressiverock', 'newwave', '2010s', 'bossanova', 'country', 'progressivemetal', 'heavymetal', 'acoustic', 'postrock', 'lofi', '80s', 'ethereal', 'brazilian', 'metalcore', 'brasil', 'shoegaze', 'rnb', 'folk', 'japanese', 'emo', 'artpop', 'soundtrack', 'electronic', 'softrock', 'seenlive', 'indierock', 'hiphop', 'chillout', 'reggae', 'samba', 'mpb

In [None]:
# Columns kept in the per-track table.
track_columns = ['TrackID', 'AlbumID', 'ArtistID', 'TrackName', 'TrackTags', 'SimilarArtists', 'ArtistName', 'AlbumName']

# One row per TrackID (the first occurrence wins), with the tag list stored as
# a tuple. Built as a single chain so that no intermediate frame is mutated.
track_df = (
    df.loc[:, track_columns]
    .assign(TrackTags=lambda frame: frame['TrackTags'].apply(tuple))
    .drop_duplicates(subset='TrackID', keep="first")
)

In [None]:
# Build the one-hot tag matrix: one row per track in track_df, with the TrackID
# column followed by a 0/1 column for every entry of uniqueTagList.
# Rows are collected in a plain list and the frame is created once; appending
# to a DataFrame row by row copies the frame on every insertion.
tagRows = []
for _, row in track_df.iterrows():
    trackTags = set(row['TrackTags'])
    tagRows.append([
        row['TrackID'] if column == "TrackID" else int(column in trackTags)
        for column in uniqueTagList
    ])

tag_df = pd.DataFrame(tagRows, columns=uniqueTagList)

# Later cells use tag_df index labels as row positions (for simMatrix and
# iloc), so keep the index contiguous after removing duplicates.
tag_df = tag_df.drop_duplicates(subset='TrackID', keep="first").reset_index(drop=True)

# Raw string: same path as before, without relying on '\T' being an
# unrecognised (and deprecated) escape sequence.
# NOTE(review): hard-coded absolute Windows path -- consider a configurable output directory.
tag_df.to_csv(r'C:\TCC\TagDataset.csv', index=False)

In [None]:
# Keep only the tag indicator columns. The identifier column is named
# 'TrackID'; filtering on 'TrackId' matches nothing, which would feed the raw
# track IDs into the cosine similarity as if they were a feature.
trackVector = tag_df.loc[:, tag_df.columns != 'TrackID'].astype(float)
sparseVector = sparse.csr_matrix(trackVector)

# simMatrix[i][j] is the cosine similarity between the tag vectors stored at
# row positions i and j of tag_df.
simMatrix = cosine_similarity(sparseVector)

In [None]:
def getArtistSimilarity(idEstimate, idCheck):
    """Return the artist-based weight between two tracks of ``track_df``.

    Parameters
    ----------
    idEstimate : TrackID of the track whose rating is being estimated.
    idCheck : TrackID of the track it is compared against.

    Returns
    -------
    1.5 when both tracks share the same ArtistID, 1 when the artist name of
    ``idCheck`` is listed in the SimilarArtists of ``idEstimate``, and 0.8
    otherwise.

    Raises
    ------
    IndexError
        If either id has no row in ``track_df``.
    """
    estimate_rows = track_df['TrackID'] == idEstimate
    check_rows = track_df['TrackID'] == idCheck

    artist_of_estimate = track_df.loc[estimate_rows, 'ArtistID'].iloc[0]
    artist_of_check = track_df.loc[check_rows, 'ArtistID'].iloc[0]
    similar_to_estimate = track_df.loc[estimate_rows, 'SimilarArtists'].iloc[0]
    name_of_check = track_df.loc[check_rows, 'ArtistName'].iloc[0]

    # Same artist outranks "listed as similar", which outranks unrelated.
    if artist_of_estimate == artist_of_check:
        return 1.5
    if name_of_check in similar_to_estimate:
        return 1
    return 0.8
    
def getAlbumSimilarity(idEstimate, idCheck):
    """Return the album-based weight between two tracks of ``track_df``.

    Parameters
    ----------
    idEstimate : TrackID of the track whose rating is being estimated.
    idCheck : TrackID of the track it is compared against.

    Returns
    -------
    1.5 when both tracks have the same AlbumID, 0.9 otherwise.

    Raises
    ------
    IndexError
        If either id has no row in ``track_df``.
    """
    def album_of(track_id):
        # AlbumID of the first row whose TrackID matches.
        return track_df.loc[track_df['TrackID'] == track_id, 'AlbumID'].iloc[0]

    album_of_estimate = album_of(idEstimate)
    album_of_check = album_of(idCheck)
    return 1.5 if album_of_estimate == album_of_check else 0.9

Cálculo das Métricas de Erro

In [None]:
# Leave-one-out evaluation of the rating estimator.
# For every (user, track) pair the track is held out and its rating is
# estimated as the similarity-weighted mean of the user's ratings for the
# k_elements most similar other tracks of that user. Estimates and true
# ratings are collected in `estimativas` and `realval`.
k_elements = 20
estimate = 0
userIdList = set(df['UserID'].explode().tolist())

estimativas = []
realval = []

# simMatrix is addressed by row position, so map every TrackID to its row
# position in tag_df once (tag_df holds one row per TrackID) instead of
# rescanning the frame for each track.
tagTrackIds = tag_df['TrackID'].tolist()
trackPosition = {trackId: position for position, trackId in enumerate(tagTrackIds)}

userCount = 0
for user in userIdList:
    print(userCount)  # progress counter
    userTracks_df = df.loc[df['UserID'] == user]
    userTracks_df = userTracks_df.drop_duplicates(subset='TrackID', keep="first")
    userTracksId = set(userTracks_df['TrackID'].explode().tolist())

    # First recorded rating of each track for this user.
    userRatings = dict(zip(userTracks_df['TrackID'].tolist(), userTracks_df['ImplicitRating'].tolist()))
    # Row positions of all the user's tracks; invariant across the inner loop.
    userTracksIndexAll = sorted(
        trackPosition[trackId] for trackId in userTracksId if trackId in trackPosition
    )

    for trackToEstimate in userTracksId:
        # Scalar row position of the held-out track (not a pandas Index).
        trackToEstimateIndex = trackPosition[trackToEstimate]
        userTracksIndex = [index for index in userTracksIndexAll if index != trackToEstimateIndex]

        simVector = []
        for index in userTracksIndex:
            trackId = tagTrackIds[index]
            tagSimilarity = simMatrix[trackToEstimateIndex, index]
            artistSimilarity = getArtistSimilarity(trackToEstimate, trackId)
            albumSimilarity = getAlbumSimilarity(trackToEstimate, trackId)
            similarity = tagSimilarity * artistSimilarity * albumSimilarity
            simVector.append(similarity)

        # Keep the k most similar neighbours.
        simSeries = pd.Series(simVector, index=userTracksIndex, dtype=float)
        simSeries = simSeries.sort_values(ascending=False)
        simSeries = simSeries[:k_elements]

        total = 0
        weightedSum = 0
        for index, value in simSeries.items():
            if (value > 0):
                # Weight the NEIGHBOUR's rating. Using the held-out rating of a
                # previously processed track here would leak the answer into
                # the estimate; metrics recorded with that behaviour are not
                # comparable with the ones produced by this loop.
                real = userRatings[tagTrackIds[index]]
                total += value
                weightedSum += real * value

        # With no positively similar neighbour the estimate falls back to 0.
        if (total == 0):
            estimate = 0
        else:
            estimate = weightedSum/total

        realRating = userRatings[trackToEstimate]

        estimativas.append(estimate)
        realval.append(realRating)

    userCount += 1

print(estimativas)
print(realval)

In [None]:
# Mean absolute error (MAE) and root-mean-square error (RMSE) between the
# true ratings in `realval` and the estimates in `estimativas`.
nEstimates = len(estimativas)
mae = 0
rmse = 0

for real, predicted in zip(realval, estimativas):
    error = real - predicted
    mae += abs(error)
    rmse += error**2

if nEstimates == 0:
    # Nothing was estimated: report NaN instead of dividing by zero.
    mae = float('nan')
    rmse = float('nan')
else:
    mae = mae/nEstimates
    rmse = math.sqrt(rmse/nEstimates)

print("MAE - "+str(mae))
print("RMSE - "+str(rmse))

MAE - 0.020835452680988905
RMSE - 0.06060688004908351


Recomendação de músicas para o usuário '0'

In [None]:
# Estimate, for one user, the rating of every track they have not listened to.
# Each estimate is the similarity-weighted mean of the user's ratings for the
# k_elements most similar tracks among the ones they did listen to.
user = 0
k_elements = 20
nRecomendations = 10

userTracks_df = df.loc[df['UserID'] == user]
userTracks_df = userTracks_df.drop_duplicates(subset='TrackID', keep="first")
userTracksId = set(userTracks_df['TrackID'].explode().tolist())

# Candidate tracks: listened to by someone else and never by this user.
nonUserTracks_df = df.loc[df['UserID'] != user]
nonUserTracks_df = nonUserTracks_df.loc[~nonUserTracks_df['TrackID'].isin(userTracksId)]
nonUserTracks_df = nonUserTracks_df.drop_duplicates(subset='TrackID', keep="first")
nonUserTracksId = set(nonUserTracks_df['TrackID'].explode().tolist())

# simMatrix is addressed by row position, so map every TrackID to its row
# position in tag_df once (tag_df holds one row per TrackID).
tagTrackIds = tag_df['TrackID'].tolist()
trackPosition = {trackId: position for position, trackId in enumerate(tagTrackIds)}

# Loop-invariant data about the user, computed once instead of per candidate.
userRatings = dict(zip(userTracks_df['TrackID'].tolist(), userTracks_df['ImplicitRating'].tolist()))
userTracksIndex = sorted(
    trackPosition[trackId] for trackId in userTracksId if trackId in trackPosition
)

progressEvery = 1000  # report progress every N candidates instead of on each one

estimateRatings = []
estimateIndex = []
for trackToEstimate in nonUserTracksId:
    if len(estimateIndex) % progressEvery == 0:
        print("Track - "+str(len(estimateIndex))+" de "+str(len(nonUserTracksId)))

    # Scalar row position of the candidate; the next cell uses these values
    # positionally to look the tracks up again.
    trackToEstimateIndex = trackPosition[trackToEstimate]
    estimateIndex.append(trackToEstimateIndex)

    simVector = []
    for index in userTracksIndex:
        trackId = tagTrackIds[index]
        tagSimilarity = simMatrix[trackToEstimateIndex, index]
        artistSimilarity = getArtistSimilarity(trackToEstimate, trackId)
        albumSimilarity = getAlbumSimilarity(trackToEstimate, trackId)
        similarity = tagSimilarity * artistSimilarity * albumSimilarity
        simVector.append(similarity)

    # Keep the k most similar tracks of the user.
    simSeries = pd.Series(simVector, index=userTracksIndex, dtype=float)
    simSeries = simSeries.sort_values(ascending=False)
    simSeries = simSeries[:k_elements]

    total = 0
    weightedSum = 0
    for index, value in simSeries.items():
        if (value > 0):
            trackRating = userRatings[tagTrackIds[index]]
            total += value
            weightedSum += trackRating * value

    # With no positively similar neighbour the estimate falls back to 0.
    if (total == 0):
        estimate = 0
    else:
        estimate = weightedSum/total

    estimateRatings.append(estimate)



In [None]:
# Rank the candidate tracks by estimated rating and keep the best nRecomendations.
ratingSeries = pd.Series(estimateRatings, index = estimateIndex)
ratingSeries = ratingSeries.sort_values(ascending = False)
topRatingSeries = ratingSeries[:nRecomendations]

# The index values are used as row positions into track_df.
# NOTE(review): this assumes tag_df's index equals its row positions and that
# tag_df rows follow track_df's row order -- confirm if either frame is rebuilt.
trackIdColumn = track_df.columns.get_loc('TrackID')
recomendedIdList = [
    track_df.iloc[position, trackIdColumn] for position in topRatingSeries.index
]

recomendation_df = df.loc[df['TrackID'].isin(recomendedIdList)]
recomendation_df = recomendation_df.drop_duplicates(subset='TrackID', keep="first")

# Present the table in ranking order (best estimate first) rather than in the
# order the tracks happen to appear in the dataset.
rankByTrack = {trackId: rank for rank, trackId in enumerate(recomendedIdList)}
rankOrder = recomendation_df['TrackID'].map(rankByTrack).to_numpy().argsort()
recomendation_df = recomendation_df.iloc[rankOrder]

print(recomendedIdList)
print(recomendation_df)

30495    0.749482
30497    0.749482
30496    0.749482
54920    0.749482
1433     0.734802
23346    0.734802
3755     0.734802
3409     0.734802
4235     0.734802
4245     0.734802
dtype: float64
[67736, 67738, 67737, 180197, 3987, 7201, 9372, 8328, 10839, 10848]
        UserID  ArtistName         AlbumName      TrackName  \
1544        54  beachhouse  depressioncherry      spacesong   
3814       116  beachhouse             bloom           myth   
4243       134  beachhouse         teendream       takecare   
4836       160  beachhouse            become    devil'spool   
4851       160  beachhouse            become         become   
36970     1388  beachhouse        beachhouse   masterofnone   
52201     1946  beachhouse                 7  girloftheyear   
52202     1946  beachhouse                 7            woo   
52207     1946  beachhouse                 7     l'inconnue   
128316    4511  beachhouse                 7    drunkinl.a.   

                                          A