### Pré-processamento

In [None]:
import pandas as pd
import numpy as np
import heapq
from collections import defaultdict
from operator import itemgetter
import math

# Load the raw listening-history export.
df = pd.read_csv('DatasetCompleto.csv')

# Rename the space-separated column headers to space-free CamelCase names.
# A new frame is assigned instead of using inplace=True so that re-running
# the cell never mutates an object another cell may still reference.
df = df.rename(columns={
    'User Id': 'UserId',
    'Artist Name': 'ArtistName',
    'Track ID': 'TrackId',
    'Album ID': 'AlbumId',
    'Album Name': 'AlbumName',
    'Track Name': 'TrackName',
    'Artist ID': 'ArtistId',
    'Artist Top Tracks': 'ArtistTopTracks',
    'Artist Listeners': 'ArtistListeners',
    'Artist Playcount': 'ArtistPlaycount',
    'Artist Tags': 'ArtistTags',
    'Artist Top Albums': 'ArtistTopAlbums',
    'Track Duration': 'TrackDuration',
    'Track Listeners': 'TrackListeners',
    'Track Playcount': 'TrackPlaycount',
    'Track Tags': 'TrackTags',
    'Album Listeners': 'AlbumListeners',
    'Album Playcount': 'AlbumPlaycount',
    'Album Tags': 'AlbumTags',
    'Similar Artists': 'SimilarArtists',
})

# Convert track duration to seconds (the raw value is divided by 1000,
# so it is presumably stored in milliseconds - TODO confirm).
df['TrackDuration'] = df['TrackDuration'].div(1000)

# Drop rows without an artist name - they cannot be recovered.
# read_csv parses empty fields as NaN by default, so comparing against ''
# alone never matches; missing values and blank strings are both removed.
artist_name = df['ArtistName']
has_artist = artist_name.notna() & (artist_name.astype(str).str.strip() != '')
df = df.loc[has_artist].copy()

Fatorização das colunas de ID

In [None]:
# Create a new integer user ID, numbered from 0 in order of first appearance
# (the original note expected the IDs to run up to roughly 5000).
user_codes, _ = pd.factorize(df['UserId'])
# Replace the original identifier column with the new one, placed first.
df = df.drop(columns='UserId')
df.insert(0, 'UserID', user_codes)

def lowerCase_removeSpaces(text):
    """Return ``text`` lower-cased with every space character removed.

    Any float input is mapped to an empty string (presumably the NaN that
    pandas uses for a missing cell). Other inputs must provide ``lower()``.
    """
    if not isinstance(text, float):
        return "".join(text.lower().split(" "))
    return ''

# Normalise the free-text name columns (lower case, no spaces) before they
# are used as keys for the new IDs.
for name_column in ('ArtistName', 'AlbumName', 'TrackName'):
    df[name_column] = df[name_column].apply(lowerCase_removeSpaces)

# Build new integer IDs by factorizing a combination of columns.
# The parts of each key are joined with a single space: the normalised names
# contain no spaces, so two different pairs (e.g. 'ab' + 'c' and 'a' + 'bc')
# can no longer collapse into the same key.
KEY_SEPARATOR = ' '
df['TrackID'] = pd.factorize(df['TrackName'] + KEY_SEPARATOR + df['ArtistName'])[0]
df['AlbumID'] = pd.factorize(df['ArtistName'] + KEY_SEPARATOR + df['AlbumName'])[0]
# .astype(str) converts the listener count row by row. str(Series) would
# instead yield the printed representation of the whole column, i.e. the same
# constant text appended to every artist name.
df['ArtistID'] = pd.factorize(
    df['ArtistName'] + KEY_SEPARATOR + df['ArtistListeners'].astype(str)
)[0]

# The original ID columns are superseded by the factorized ones.
df = df.drop(columns=['TrackId', 'AlbumId', 'ArtistId'])

Cria coluna "Counts" e "NormalizedCounts"

In [None]:
# Create the Counts column: how many times the user listened to the track,
# computed as the number of fully identical rows.
# dropna=False is required: by default groupby excludes every row that has a
# NaN in any key column, which would leave its count missing and make the
# fallback below report a single listen even for repeated ones.
counts = df.groupby(df.columns.tolist(), dropna=False).transform('size')
# Defensive fallback: any count that is still missing is treated as 1.
df['Counts'] = counts.fillna(1).astype(int)
# Each distinct row now carries its own count, so the repeats can go.
df = df.drop_duplicates()

# Maximum count value for each user, aligned to the rows of df.
max_counts = df.groupby('UserID')['Counts'].transform('max')

# NormalizedCounts = Counts divided by the user's maximum count.
df['NormalizedCounts'] = df['Counts'] / max_counts
df['MaxCounts'] = max_counts

In [None]:
def calculateRating(row):
    """Compute an implicit rating from a row's play counts.

    Reads ``row.loc['Counts']`` and ``row.loc['MaxCounts']`` and returns
    ``exp(-|MaxCounts - Counts| / 10)``: 1.0 when the two are equal, decaying
    towards 0 as the gap between them grows.
    """
    plays = row.loc['Counts']
    ceiling = row.loc['MaxCounts']
    return math.exp(-abs(ceiling - plays) / 10)

# Row-wise implicit rating derived from Counts and MaxCounts.
df['ImplicitRating'] = df.apply(calculateRating, axis=1)

Deixa colunas com caixa baixa e sem espaços

Converte tags em listas

In [None]:
def convert_to_list(string):
    """Parse a comma-separated tag string into at most three cleaned tags.

    Float inputs and blank strings yield ``[]``. Otherwise the text is split
    on commas and every piece is stripped of surrounding whitespace, has the
    characters ``[``, ``]``, ``'``, space and ``-`` removed, and is
    lower-cased. Pieces that end up empty are kept (``"[]"`` gives ``['']``).
    Only the first three pieces are returned.
    """
    # Floats stand for cells without a usable value -> no tags.
    if isinstance(string, float):
        return []

    trimmed = string.strip()
    if trimmed == "":
        return []

    # Characters deleted from every piece: list-literal punctuation, quotes,
    # inner spaces and hyphens.
    unwanted = str.maketrans('', '', "[]' -")
    cleaned = [
        piece.strip().translate(unwanted).lower()
        for piece in trimmed.split(',')
    ]
    return cleaned[:3]


# Turn the string-encoded tag / similar-artist columns into Python lists.
for list_column in ('TrackTags', 'AlbumTags', 'ArtistTags', 'SimilarArtists'):
    df[list_column] = df[list_column].apply(convert_to_list)


Padroniza algumas tags

In [None]:
# Some tags describe the same characteristic (e.g. brasil / brazilian /
# brazil) and are merged into one canonical tag.
# Keys are written in the form the tags already have at this point (lower
# case, no spaces or hyphens, as produced by convert_to_list). The former
# 'Hip-Hop', 'k-pop' and 'Kpop' keys could never match such tags and already
# equal their targets once normalised, so they are omitted.
# The dict is no longer called `map`, which shadowed the builtin.
tag_aliases = {'brasil': 'brazil',
               'brazilian': 'brazil',
               'rap': 'hiphop',
               'mb': 'mpb'}


def standardize_tags(tags):
    """Replace aliased tags by their canonical name.

    Order is preserved and a tag that appears more than once after the
    replacement (e.g. ['rap', 'hiphop']) is kept only once.
    """
    return list(dict.fromkeys(tag_aliases.get(tag, tag) for tag in tags))


# Write the standardised tags back into the DataFrame. Previously the mapping
# was applied only to a flattened copy of the values, so df['TrackTags'] -
# which the tag ranking further down is computed from - was left untouched.
df['TrackTags'] = df['TrackTags'].apply(standardize_tags)

# Flattened view of every (standardised) track tag, used for the tag statistics.
track_tags = df['TrackTags'].explode()
track_tags_flat = track_tags.values.flatten()


Filtra as top X tags

In [None]:
# Report the state before filtering. astype(str) turns missing values into
# the text 'nan' so that np.unique can sort the array.
track_tags_flat = track_tags_flat.astype(str)
unique_values, counts = np.unique(track_tags_flat, return_counts=True)
print(f'Antes da filtragem de track tags temos {len(unique_values)} tags diferentes')
print(f'Antes da filtragem de track tags, temos um dataframe de {len(df)} linhas')

# Number of most frequent tags kept for the later encoding step.
TOP_N_TAGS = 100

# Step 1: frequency of every tag over all rows.
value_counts = df['TrackTags'].explode().value_counts()

# Tags removed from the ranking before the top ones are selected.
droppable_tags = ['1001747063611',
                  '1001819731063',
                  'fip',
                  'loveatfirstlisten',
                  'myspotigrambot',
                  '']

# errors='ignore': a droppable tag that does not occur in the data must not
# abort the cell with a KeyError.
value_counts = value_counts.drop(droppable_tags, errors='ignore')

# Step 2: select the most frequent tags (value_counts is sorted descending).
top_100_values = value_counts.head(TOP_N_TAGS)

# Step 3: extract the tag names; the set is used for the membership tests.
top_100_index = top_100_values.index
top_tags = set(top_100_index)

# Step 4: rows that carry at least one of the top tags.
mask = df['TrackTags'].apply(lambda x: any(elem in top_tags for elem in x))

# Step 5: filter the DataFrame. The copy makes the column assignment below
# act on an independent frame instead of a slice of the old one.
df = df[mask].copy()

# Step 6: remove the tags that are not among the top ones from each row.
df['TrackTags'] = df['TrackTags'].apply(lambda x: [elem for elem in x if elem in top_tags])

# Report the state after filtering.
track_tags = df['TrackTags'].explode()
track_tags_flat = track_tags.values.flatten()

unique_values, counts = np.unique(track_tags_flat, return_counts=True)
print(f'Apos filtragem, existem {len(unique_values)} tags diferentes no dataframe')
print(f'Apos filtragem, temos dataframe de {len(df)} linhas')

Antes da filtragem de track tags temos 17373 tags diferentes
Antes da filtragem de track tags, temos um dataframe de 387220 linhas
Apos filtragem, existem 100 tags diferentes no dataframe
Apos filtragem, temos dataframe de 230091 linhas


Remove as linhas que contêm 0 Track Tags

In [None]:
# Keep only the rows whose TrackTags value is a non-empty list whose first
# tag is not blank. The explicit length check is needed because x[0] raises
# IndexError on an empty list - exactly the rows this cell must remove.
has_tags = df['TrackTags'].apply(
    lambda x: isinstance(x, list) and len(x) > 0 and x[0] != ''
)
# Copy so the column assignments below do not write into a slice.
df = df[has_tags].copy()
print(f'Apos filtragem temos {len(df)} linhas')

# Update the per-user statistics, since rows may have been removed.
# Maximum count value for each user, aligned to the rows of df.
max_counts = df.groupby('UserID')['Counts'].transform('max')

# NormalizedCounts = Counts divided by the user's maximum count.
df['NormalizedCounts'] = df['Counts'] / max_counts
df['MaxCounts'] = max_counts
df['ImplicitRating'] = df.apply(lambda row: calculateRating(row), axis=1)

Apos filtragem temos 230091 linhas


Remover usuários com menos de X músicas registradas

In [None]:
# Number of distinct users before the filter.
n_users_before = df['UserID'].nunique()
print(f'Antes do filtro, dataframe contem {n_users_before} usuarios diferentes')

x = 50  # Minimum number of distinct TrackIDs a user must have to be kept

# Distinct tracks per user, and the users that fall below the threshold.
user_counts = df.groupby('UserID')['TrackID'].nunique()
users_with_few_track_ids = user_counts.loc[user_counts.lt(x)].index.tolist()

print("Users with fewer than", x, "different TrackIds:")
print(users_with_few_track_ids)
print()
print(f'Existem {len(users_with_few_track_ids)} usuarios com numero de musicas registradas menor do que {x}')
print()

# Keep every row that does not belong to one of those users.
keep_row = ~df['UserID'].isin(users_with_few_track_ids)
df = df[keep_row]

n_users_after = df['UserID'].nunique()
print(f'Apos o filtro, o dataframe contem {n_users_after} usuarios diferentes')

Antes do filtro, dataframe contem 4935 usuarios diferentes
Users with fewer than 50 different TrackIds:
[1, 4, 7, 10, 14, 19, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 40, 44, 45, 46, 47, 49, 50, 51, 53, 55, 58, 66, 67, 70, 74, 76, 77, 84, 88, 93, 95, 96, 97, 98, 99, 100, 102, 103, 104, 105, 109, 113, 114, 115, 117, 119, 120, 121, 122, 123, 124, 125, 126, 127, 129, 130, 135, 136, 139, 140, 141, 142, 143, 145, 146, 148, 149, 150, 151, 154, 155, 156, 158, 161, 166, 167, 169, 174, 178, 181, 182, 184, 187, 188, 190, 195, 196, 198, 199, 201, 204, 205, 206, 208, 209, 210, 212, 215, 221, 224, 225, 226, 229, 231, 232, 233, 234, 235, 238, 242, 245, 246, 247, 248, 249, 250, 252, 254, 255, 256, 257, 258, 259, 260, 267, 268, 269, 270, 271, 272, 274, 279, 280, 282, 284, 285, 286, 287, 288, 289, 290, 291, 293, 294, 297, 298, 299, 300, 302, 305, 306, 309, 314, 315, 317, 319, 320, 323, 324, 326, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 341, 342, 343, 344, 345, 346,

In [None]:
# Write the cleaned DataFrame to 'dataSetLimpo.csv'; index=False leaves the
# row index out of the file.
df.to_csv('dataSetLimpo.csv', index=False)