## Importing the dependencies

In [1]:
import numpy as np
import distance

import os
import datetime
import sys 

import pandas as pd
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
from collections import Counter

sys.path.append('../src')
from utilities.utils1 import OneHotEncoder, clean_and_filter
from data.data_cleaner import merge_near_string

## Loading the interi dataset

In [2]:
df = pd.read_csv('../data/interim/lakh/lakh.csv', usecols=['id','song_name','album_name','artist_name','artist_mb','tag_echo','tag_mbz','year'])
df =  df.fillna('?')
df

Unnamed: 0,id,song_name,album_name,artist_name,artist_mb,tag_echo,tag_mbz,year
0,TRRRUFD12903CD7092,Wastelands,Alien 4,Hawkwind,5a28f8c2-31fb-4047-ae57-c5c326989262,space rock,british,1994
1,TRRRUTV12903CEA11B,Runaway,Songs of Del Shannon,Del Shannon,2e885bfb-1f59-49cf-8d51-e743445e1b48,ballad,classic pop and rock,1961
2,TRRRUJO128E07813E7,Have You Met Miss Jones? (Swing When Version),Swing When You're Winning,Robbie Williams,db4624cf-0e44-481e-a9dc-2142b833ec2f,british pop,pop,2001
3,TRRRIYO128F428CF6F,Goodbye,Bittersweet,Volebeats,eb567c55-368d-4b85-b969-ca9e3252f9cb,alternative country,?,0
4,TRRRILO128F422FFED,La Colegiala,Musica Tropical De Colombia 5,Rodolfo Y Su Tipica Ra7,ead8d6d9-e58b-4dd8-916f-cf7f359db38e,cumbia,?,1997
...,...,...,...,...,...,...,...,...
31029,TRWWYHD12903CC42B1,Gethsemane (I Only Want to Say) (Live-LP Version),In Concert,Michael Crawford,2c654643-cdbd-4634-be26-c4d1a90cfabd,opera,?,0
31030,TRWWYNJ128F426541F,Cold Feelings,Somewhere Between Heaven And Hell,Social Distortion,e1e05cce-3922-44e1-8f20-015abe5e309d,hard rock,punk,1992
31031,TRWWPSV128F4244C71,Ases Death,Dragonchaser,At Vance,17828264-0f4a-40b3-bfc5-8544f30debed,power metal,?,2001
31032,TRWWPBK128F42911E9,Drowned Maid,Chapters,Amorphis,efaefde1-e09b-4d49-9d8e-b1304d2ece8d,progressive metal,finnish,1993


## What are the number of unique genres in the dataset?

In [3]:
df['tag_mbz'].nunique()

485

In [4]:
#merge those genres which are similar
merge_near_string('tag_mbz', df)

{'funk rock': 'punk rock',
 'electronica': 'electronic',
 'italia': 'italian',
 'hip hop': 'hip-hop',
 'hiphop': 'hip-hop',
 'australian': 'australia',
 'australie': 'australia',
 'france': 'trance',
 'synth-pop': 'synthpop',
 'orchestral': 'orchestra',
 'popera': 'opera',
 'austria': 'austrian',
 'dark wave': 'darkwave',
 'post rock': 'post-rock',
 'argentine': 'argentina',
 'post-hardcore': 'post hardcore',
 'electropop': 'electro pop'}

In [5]:
#load the embeddings
emb = KeyedVectors.load_word2vec_format('../data/interim/lmd.bin', binary=False)

In [6]:
'''extracts embedding vectors from the emb dictionary based on the 'id' values
 present in the DataFrame df, and stores them in a numpy array called data'''
ids = [i for i in df.id.tolist() if i in emb]
data = np.array([emb[i] for i in ids])
data

array([[-0.17479022,  0.07378042,  0.49907172, ...,  0.31828856,
         0.5608369 ,  0.12442618],
       [-0.33292776,  0.02196981,  0.68885314, ..., -0.17462437,
         0.20948799,  0.12133545],
       [ 0.11866359, -0.3533733 ,  0.40039137, ..., -0.15811822,
        -0.12684418,  0.21303603],
       ...,
       [-0.07274715, -0.10548753,  0.2216787 , ..., -0.47450665,
         0.75326884,  0.48820785],
       [ 0.4209376 ,  0.41637152,  0.71112645, ...,  0.03877402,
         0.19865389, -0.05724512],
       [-0.09821663, -0.52757484,  0.2697266 , ...,  0.76983595,
         0.4365861 ,  0.08382966]], dtype=float32)

In [7]:
'''difference between the total number of 'id' values in df and 
the number of embedding vectors collected into the data array'''
len(df.id.tolist()) - len(data)

59

In [9]:
df = df[df.id.isin(ids)]
df.shape

(30975, 8)

In [10]:
data_dir = 'data'
processed_dir = os.path.join(data_dir, 'processed')
os.makedirs(processed_dir, exist_ok=True)
csv_file_path = os.path.join(processed_dir, 'processed.csv')
df.to_csv(csv_file_path, index=False)
print(f"DataFrame saved to: {csv_file_path}")

DataFrame saved to: data/processed/processed.csv
