In [None]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import ijson
import json
from decimal import Decimal

input_file = './json/song.json'
output_file = './json/song.parquet'

# Helper conversion functions
def convert_decimal(obj):
    if isinstance(obj, list):
        return [convert_decimal(i) for i in obj]
    elif isinstance(obj, dict):
        return {k: convert_decimal(v) for k, v in obj.items()}
    elif isinstance(obj, Decimal):
        return float(obj)
    else:
        return obj

# Explicit column casting
def enforce_types(df):
    # List columns to explicitly cast
    int_cols = ['rank', 'position']
    float_cols = ['confidence', 'duration', 'arousal_predicted', 'valence_predicted', 'gain', 'bpm']
    
    for col in int_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

    for col in float_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)

    # Convert all complex objects explicitly to strings
    json_cols = ['deezer_mapping', 'chords_metadata', 'availableCountries', 'animux_paths']
    for col in json_cols:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x)
    
    return df

batch_size = 50000
batch = []
schema = None
writer = None

with open(input_file, 'rb') as f:
    parser = ijson.items(f, 'item')

    for i, record in enumerate(parser, 1):
        record = convert_decimal(record)
        batch.append(record)

        if i % batch_size == 0:
            df_batch = pd.json_normalize(batch)

            df_batch = enforce_types(df_batch)

            if schema is None:
                table = pa.Table.from_pandas(df_batch, preserve_index=False)
                schema = table.schema
                writer = pq.ParquetWriter(output_file, schema, compression='snappy')
            else:
                # Add missing columns with None values
                for col in schema.names:
                    if col not in df_batch.columns:
                        df_batch[col] = None
                df_batch = df_batch[schema.names]
                table = pa.Table.from_pandas(df_batch, schema=schema, preserve_index=False)

            writer.write_table(table)
            batch = []

    # Final batch
    if batch:
        df_batch = pd.json_normalize(batch)
        df_batch = enforce_types(df_batch)

        if schema is None:
            table = pa.Table.from_pandas(df_batch, preserve_index=False)
            schema = table.schema
            writer = pq.ParquetWriter(output_file, schema, compression='snappy')
        else:
            for col in schema.names:
                if col not in df_batch.columns:
                    df_batch[col] = None
            df_batch = df_batch[schema.names]
            table = pa.Table.from_pandas(df_batch, schema=schema, preserve_index=False)

        writer.write_table(table)

# Close writer explicitly
if writer:
    writer.close()

In [33]:
import pandas as pd
import ijson

input_file = './json/song.json'
output_file = './json/song.csv'

# Open JSON file as stream
with open(input_file, "rb") as f:
    objects = ijson.items(f, "item")  # Adjust 'item' if JSON is nested

    batch_size = 50000
    batch = []
    
    for i, obj in enumerate(objects, 1):
        batch.append(obj)
        
        if i % batch_size == 0:
            df = pd.json_normalize(batch)
            df.to_csv(output_file, mode="a", index=False, header=(i == batch_size))  # Append, write header only once
            batch = []

    if batch:  # Final batch
        df = pd.json_normalize(batch)
        df.to_csv(output_file, mode="a", index=False, header=False)


In [1]:
import polars as pl

df = pl.read_parquet('./json/song.parquet')

In [13]:
df.head()

position,lengthAlbum,urlSong,lyrics,urlWikipedia,isClassic,urlAllmusic,urlMusicBrainz,title,publicationDateAlbum,albumTitle,deezer_mapping,id_song_deezer,isrc,length,explicitLyrics,rank,bpm,gain,preview,availableCountries,publicationDate,rdf,urlPandora,urlITunes,urlSpotify,urlYouTube,urlAmazon,urlHypeMachine,urlGoEar,urlLastFm,multitrack_path,multitrack_file,id_song_musicbrainz,disambiguation,language,begin,end,id_artist_deezer,id_album_deezer,urlDeezer,language_detect,name,title_accent_fold,animux_paths,arousal,arousal_predicted,album_genre,has_emotion_tags,has_social_tags,lastfm_id,valence,valence_predicted,_id.$oid,id_album.$oid,chords_metadata.confidence,chords_metadata.duration,chords_metadata.chordSequence,chords_metadata._id,summary,id_album
i64,str,str,str,str,bool,str,str,str,str,str,str,str,str,str,bool,i64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,f64,str,str,f64,f64,list[struct[3]],str,list[str],str
0,"""57:52""","""http://lyrics.wikia.com/A:Turn…","""Turn it up<br>I don&apos;t kno…","""""",False,"""http://www.allmusic.com/song/m…","""http://musicbrainz.org/recordi…","""Turn It Up""","""1997""","""How Ace Are Buildings""","""[[67354194, ""search-exact""]]""","""67354194""","""GBAAP9700050""","""93""",False,261631,77.0,-29.4,"""http://e-cdn-preview-8.deezer.…","""[""AD"", ""AE"", ""AG"", ""AI"", ""AL"",…","""1998-06-22""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""3db608e4-eb72-437e-bc52-872f0a…","""""","""""","""""","""""","""3412""","""6585846""","""http://www.deezer.com/track/67…","""english""","""A""","""Turn It Up""","""[]""","""""",0.469772,"""Alternative Rock""","""False""","""False""","""""","""""",0.657853,"""5714dec325ac0d8aee3804e7""","""5714debb25ac0d8aee34d59a""",0.746509,93.023107,"[{16.15,""Amaj"",0.0}, {18.05,""Dmaj"",16.15}, … {93.0,""Amaj7"",86.15}]","""deezer:67354194""",,
1,"""57:52""","""http://lyrics.wikia.com/A:Fogh…","""Sick of you, how old do you th…","""""",False,"""http://www.allmusic.com/song/m…","""http://musicbrainz.org/recordi…","""Foghorn""","""1997""","""How Ace Are Buildings""","""[[67354196, ""search-exact""]]""","""67354196""","""GBAAP9700051""","""184""",False,297455,106.0,-10.6,"""http://e-cdn-preview-0.deezer.…","""[""AD"", ""AE"", ""AG"", ""AI"", ""AL"",…","""1998-06-22""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""84feea0c-187f-40f1-bb29-4818da…","""""","""""","""""","""""","""3412""","""6585846""","""http://www.deezer.com/track/67…","""english""","""A""","""Foghorn""","""[]""","""""",0.044004,"""Alternative Rock""","""False""","""False""","""""","""""",-0.810233,"""5714dec325ac0d8aee3804e8""","""5714debb25ac0d8aee34d59a""",0.80717,184.007596,"[{1.65,""E7"",0.0}, {3.75,""Bmaj7"",1.65}, … {184.0,""Bmaj"",174.65}]","""deezer:67354196""",,
2,"""57:52""","""http://lyrics.wikia.com/A:Chee…","""My name is Jason<br>I&apos;ll …","""""",False,"""http://www.allmusic.com/song/m…","""http://musicbrainz.org/recordi…","""Cheeky Monkey""","""1997""","""How Ace Are Buildings""","""[[67354198, ""search-exact""]]""","""67354198""","""GBAAP9700052""","""216""",False,268232,185.0,-10.4,"""http://e-cdn-preview-e.deezer.…","""[""AD"", ""AE"", ""AG"", ""AI"", ""AL"",…","""1998-06-22""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""f9303efd-b512-4236-b83e-2fff0a…","""""","""""","""""","""""","""3412""","""6585846""","""http://www.deezer.com/track/67…","""english""","""A""","""Cheeky Monkey""","""[]""","""""",0.487166,"""Alternative Rock""","""False""","""False""","""""","""""",0.223842,"""5714dec325ac0d8aee3804e9""","""5714debb25ac0d8aee34d59a""",0.714021,216.007596,"[{1.55,""Emaj"",0.0}, {2.35,""Dmaj"",1.55}, … {216.0,""Bmin7"",212.55}]","""deezer:67354198""",,
3,"""57:52""","""http://lyrics.wikia.com/A:No._…","""Got to get out more<br>Get in …","""""",False,"""http://www.allmusic.com/song/m…","""http://musicbrainz.org/recordi…","""No. 1""","""1997""","""How Ace Are Buildings""","""[[67354199, ""search-exact""]]""","""67354199""","""GBAAP9700053""","""230""",False,308436,96.0,-10.6,"""http://e-cdn-preview-6.deezer.…","""[""AD"", ""AE"", ""AG"", ""AI"", ""AL"",…","""1998-06-22""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""52ef8537-c61d-4163-a6cc-737cac…","""""","""""","""""","""""","""3412""","""6585846""","""http://www.deezer.com/track/67…","""english""","""A""","""No. 1""","""[]""","""""",0.20963,"""Alternative Rock""","""False""","""False""","""""","""""",-0.016932,"""5714dec325ac0d8aee3804ea""","""5714debb25ac0d8aee34d59a""",0.767492,230.009229,"[{4.65,""Gmin7"",0.0}, {7.15,""Bmin7"",4.65}, … {230.0,""Gmaj"",217.45}]","""deezer:67354199""",,
4,"""57:52""","""http://lyrics.wikia.com/A:Bad_…","""Bad idea (x4) <br><br>Ninety-n…","""""",False,"""http://www.allmusic.com/song/m…","""http://musicbrainz.org/recordi…","""Bad Idea""","""1997""","""How Ace Are Buildings""","""[[67354200, ""search-exact""]]""","""67354200""","""GBAAP9700012""","""141""",True,273805,101.0,-9.9,"""http://e-cdn-preview-7.deezer.…","""[""AD"", ""AE"", ""AG"", ""AI"", ""AL"",…","""1998-06-22""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""53ae7abb-e5c8-4097-8c42-3182e2…","""""","""""","""""","""""","""3412""","""6585846""","""http://www.deezer.com/track/67…","""english""","""A""","""Bad Idea""","""[]""","""""",0.403591,"""Alternative Rock""","""False""","""False""","""""","""""",0.339134,"""5714dec325ac0d8aee3804eb""","""5714debb25ac0d8aee34d59a""",0.787385,141.010045,"[{1.75,""Amaj7"",0.0}, {4.55,""Bmin7"",1.75}, … {141.0,""Bmin7"",137.65}]","""deezer:67354200""",,


In [5]:
df.filter(pl.col('_id.$oid') == '60464733c2b2aa03d7e87a7e')

position,lengthAlbum,urlSong,lyrics,urlWikipedia,isClassic,urlAllmusic,urlMusicBrainz,title,publicationDateAlbum,albumTitle,deezer_mapping,id_song_deezer,isrc,length,explicitLyrics,rank,bpm,gain,preview,availableCountries,publicationDate,rdf,urlPandora,urlITunes,urlSpotify,urlYouTube,urlAmazon,urlHypeMachine,urlGoEar,urlLastFm,multitrack_path,multitrack_file,id_song_musicbrainz,disambiguation,language,begin,end,id_artist_deezer,id_album_deezer,urlDeezer,language_detect,name,title_accent_fold,animux_paths,arousal,arousal_predicted,album_genre,has_emotion_tags,has_social_tags,lastfm_id,valence,valence_predicted,_id.$oid,id_album.$oid,chords_metadata.confidence,chords_metadata.duration,chords_metadata.chordSequence,chords_metadata._id,summary,id_album
i64,str,str,str,str,bool,str,str,str,str,str,str,str,str,str,bool,i64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,f64,str,str,f64,f64,list[struct[3]],str,list[str],str


In [None]:
import json
import pandas as pd

with open('./json/topic-models.json', 'rb') as f:
    data = json.load(f)

    df = pd.json_normalize(data)

df.drop(columns=['_id.$oid'], inplace=True)

df.to_csv('./json/topic-models.csv', index=False)

df.to_parquet('./json/topic-models.parquet')
df.head()

Unnamed: 0,topic_id,terms,_id.$oid
0,0,"[never, wish, miss, know, really, ve, have, ha...",60464733c2b2aa03d7e87a7e
1,1,"[feel, fall, same, real, feeling, inside, when...",60464733c2b2aa03d7e87a7f
2,2,"[too, stop, much, talk, house, listen, when, j...",60464733c2b2aa03d7e87a80
3,3,"[man, woman, chorus, lady, verse, u, dem, mi, ...",60464733c2b2aa03d7e87a81
4,4,"[could, world, change, remember, whole, see, w...",60464733c2b2aa03d7e87a82


In [27]:
with open('./json/song-topic.json', 'rb') as f:
    data = json.load(f)

df = pd.json_normalize(data)

df.drop(columns=['_id.$oid', 'song_id.$oid'], inplace=True)

display(df.head())

df.to_csv('./json/song-topic.csv', index=False)
df.to_parquet('./json/song-topic.parquet', compression='snappy')

Unnamed: 0,id_song,topics
0,5714dec325ac0d8aee3804e7,"[{'topic': '0', 'probability': '0.012696832'},..."
1,5714dec325ac0d8aee3804e8,"[{'topic': '0', 'probability': '0.0122550735'}..."
2,5714dec325ac0d8aee3804e9,"[{'topic': '2', 'probability': '0.013548387'},..."
3,5714dec325ac0d8aee3804ea,"[{'topic': '1', 'probability': '0.045793533'},..."
4,5714dec325ac0d8aee3804eb,"[{'topic': '0', 'probability': '0.011736132'},..."


In [28]:
with open('./json/social-tags.json', 'rb') as f:
    data = json.load(f)

df = pd.json_normalize(data)

display(df.head())
df.to_csv('./json/social-tags.csv', index=False)
df.to_parquet('./json/social-tags.parquet')

Unnamed: 0,lastfm_id,socials,_id.$oid,song_id.$oid
0,TRRRRCH128F9342C72,"[{'social_tag': 'classic rock', 'nbr_tags': 10...",6042679fc2b2aa03d7e0e3e3,5714dee825ac0d8aee544a30
1,TRRRRMR128F145852B,"[{'social_tag': 'pop', 'nbr_tags': 100}, {'soc...",6042679fc2b2aa03d7e0e3e4,5714dedf25ac0d8aee4d9285
2,TRRRRFU12903CB3A45,"[{'social_tag': 'blues', 'nbr_tags': 100}, {'s...",6042679fc2b2aa03d7e0e3ea,5714deca25ac0d8aee3d80a7
3,TRRRRQS128F422EC78,"[{'social_tag': 'love', 'nbr_tags': 100}, {'so...",6042679fc2b2aa03d7e0e3ec,5714dec525ac0d8aee39cbff
4,TRRRRMN12903CAA3CB,"[{'social_tag': 'rock', 'nbr_tags': 100}, {'so...",6042679fc2b2aa03d7e0e3ee,5714dee525ac0d8aee5268a6


In [29]:
with open('./json/emotion-tags.json', 'rb') as f:
    data = json.load(f)

df = pd.json_normalize(data)

display(df.head())
df.to_csv('./json/emotion-tags.csv', index=False)
df.to_parquet('./json/emotion-tags.parquet')

Unnamed: 0,lastfm_id,emotions,_id.$oid,song_id.$oid
0,TRRRRCH128F9342C72,"[{'emotion_tag': 'happy', 'nbr_tags': 3}, {'em...",60426661c2b2aa03d7df9e94,5714dee825ac0d8aee544a30
1,TRRRRNA128F42948D2,"[{'emotion_tag': 'party', 'nbr_tags': 0}]",60426661c2b2aa03d7df9e95,5714dec525ac0d8aee39fb93
2,TRRRRYK128F93229FA,"[{'emotion_tag': 'soothing', 'nbr_tags': 4}, {...",60426661c2b2aa03d7df9e97,5714ded725ac0d8aee4751ea
3,TRRRRGT128F4288741,"[{'emotion_tag': 'smooth', 'nbr_tags': 66}, {'...",60426661c2b2aa03d7df9e98,5714deeb25ac0d8aee564d9a
4,TRRRUFD12903CD7092,"[{'emotion_tag': 'poignant', 'nbr_tags': 50}]",60426661c2b2aa03d7df9e99,5714ded225ac0d8aee43ff2d


In [None]:
with open('./json/artist-without-members.json', 'rb') as f:
    data = json.load(f)

df = pd.json_normalize(data)

display(df.head())
df.to_csv('./json/artist-without-members.csv', index=False)
# df.to_parquet('./json/artist-without-members.parquet')

In [None]:
with open('./json/artist-members.json', 'rb') as f:
    data = json.load(f)

df = pd.json_normalize(data)

display(df.head())
df.to_csv('./json/artist-members.csv', index=False)

In [None]:
with open('./json/album.json', 'rb') as f:
    data = json.load(f)

df = pd.json_normalize(data)

display(df.head())
df.to_csv('./json/album.csv', index=False)

In [1]:
import pandas as pd

df_song = pd.read_parquet('./csv/song.parquet')
df_song.head()

Unnamed: 0,position,lengthAlbum,urlSong,lyrics,urlWikipedia,isClassic,urlAllmusic,urlMusicBrainz,title,publicationDateAlbum,...,valence,valence_predicted,_id.$oid,id_album.$oid,chords_metadata.confidence,chords_metadata.duration,chords_metadata.chordSequence,chords_metadata._id,summary,id_album
0,0,57:52,http://lyrics.wikia.com/A:Turn_It_Up,Turn it up<br>I don&apos;t know where you&apos...,,False,http://www.allmusic.com/song/mt0013320473,http://musicbrainz.org/recording/3db608e4-eb72...,Turn It Up,1997,...,,0.657853,5714dec325ac0d8aee3804e7,5714debb25ac0d8aee34d59a,0.746509,93.023107,"[{'end': 16.150000000000002, 'label': 'Amaj', ...",deezer:67354194,,
1,1,57:52,http://lyrics.wikia.com/A:Foghorn,"Sick of you, how old do you think you are?<br>...",,False,http://www.allmusic.com/song/mt0013315202,http://musicbrainz.org/recording/84feea0c-187f...,Foghorn,1997,...,,-0.810233,5714dec325ac0d8aee3804e8,5714debb25ac0d8aee34d59a,0.80717,184.007596,"[{'end': 1.6500000000000001, 'label': 'E7', 's...",deezer:67354196,,
2,2,57:52,http://lyrics.wikia.com/A:Cheeky_Monkey,My name is Jason<br>I&apos;ll rock yer face in...,,False,http://www.allmusic.com/song/mt0013303555,http://musicbrainz.org/recording/f9303efd-b512...,Cheeky Monkey,1997,...,,0.223842,5714dec325ac0d8aee3804e9,5714debb25ac0d8aee34d59a,0.714021,216.007596,"[{'end': 1.55, 'label': 'Emaj', 'start': 0.0},...",deezer:67354198,,
3,3,57:52,http://lyrics.wikia.com/A:No._1,Got to get out more<br>Get in the place<br>Got...,,False,http://www.allmusic.com/song/mt0013319228,http://musicbrainz.org/recording/52ef8537-c61d...,No. 1,1997,...,,-0.016932,5714dec325ac0d8aee3804ea,5714debb25ac0d8aee34d59a,0.767492,230.009229,"[{'end': 4.65, 'label': 'Gmin7', 'start': 0.0}...",deezer:67354199,,
4,4,57:52,http://lyrics.wikia.com/A:Bad_Idea,Bad idea (x4) <br><br>Ninety-nine per cent is ...,,False,http://www.allmusic.com/song/mt0031820543,http://musicbrainz.org/recording/53ae7abb-e5c8...,Bad Idea,1997,...,,0.339134,5714dec325ac0d8aee3804eb,5714debb25ac0d8aee34d59a,0.787385,141.010045,"[{'end': 1.7500000000000002, 'label': 'Amaj7',...",deezer:67354200,,


In [3]:
columns_to_keep = ['_id.$oid',
                   'id_album.$oid',
                   'id_artist_deezer',
                   "title",
                   "publicationDate",
                   "language_detect",
                   "albumTitle",
                   "album_genre",
                   "publicationDateAlbum",
                   "bpm",
                   "gain",
                   'chords_metadata.chordSequence',
                   'chords_metadata.confidence',
                   'chords_metadata.duration',
                   'lyrics',
                   ]
df_song_new = df_song[columns_to_keep].copy()

In [4]:
df_song_new.rename(columns={'_id.$oid': 'song_id',
                            'id_album.$oid': 'album_id',
                            'albumTitle': 'album_title',
                            'album_genre': 'album_genre',
                            "publicationDate": "song_publication_date",
                            'publicationDateAlbum': 'album_publication_date',
                            'bpm': 'bpm',
                            'gain': 'gain',
                            'chords_metadata.chordSequence': 'chord_sequence',
                            'chords_metadata.confidence': 'chord_confidence',
                            'chords_metadata.duration': 'chord_duration',
                            'lyrics': 'lyrics',
                            }, inplace=True)

In [7]:
df_emotion = pd.read_csv('./csv/emotion-tags.csv')
df_emotion.drop(columns=['lastfm_id', '_id.$oid'], inplace=True)
df_emotion.rename(columns={'song_id.$oid': 'song_id'}, inplace=True)
df_song_emotion = pd.merge(df_song_new, df_emotion, on='song_id', how='left')
df_song_emotion

Unnamed: 0,song_id,album_id,id_artist_deezer,title,song_publication_date,language_detect,album_title,album_genre,album_publication_date,bpm,gain,chord_sequence,chord_confidence,chord_duration,lyrics,emotions
0,5714dec325ac0d8aee3804e7,5714debb25ac0d8aee34d59a,3412,Turn It Up,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,77.0,-29.4,"[{'end': 16.150000000000002, 'label': 'Amaj', ...",0.746509,93.023107,Turn it up<br>I don&apos;t know where you&apos...,
1,5714dec325ac0d8aee3804e8,5714debb25ac0d8aee34d59a,3412,Foghorn,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,106.0,-10.6,"[{'end': 1.6500000000000001, 'label': 'E7', 's...",0.807170,184.007596,"Sick of you, how old do you think you are?<br>...",
2,5714dec325ac0d8aee3804e9,5714debb25ac0d8aee34d59a,3412,Cheeky Monkey,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,185.0,-10.4,"[{'end': 1.55, 'label': 'Emaj', 'start': 0.0},...",0.714021,216.007596,My name is Jason<br>I&apos;ll rock yer face in...,
3,5714dec325ac0d8aee3804ea,5714debb25ac0d8aee34d59a,3412,No. 1,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,96.0,-10.6,"[{'end': 4.65, 'label': 'Gmin7', 'start': 0.0}...",0.767492,230.009229,Got to get out more<br>Get in the place<br>Got...,
4,5714dec325ac0d8aee3804eb,5714debb25ac0d8aee34d59a,3412,Bad Idea,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,101.0,-9.9,"[{'end': 1.7500000000000002, 'label': 'Amaj7',...",0.787385,141.010045,Bad idea (x4) <br><br>Ninety-nine per cent is ...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2079505,57c92594e5c453a411c77445,57c92594e5c453a411c7734d,,Your Not Ready,,english,Other Songs,,,,,,,,[Young Buck] (chorus)<br><br>You not ready for...,
2079506,57c92594e5c453a411c77446,57c92594e5c453a411c7734d,,Eminem & 50 Cent:Jimmy Crack Corn,,english,Other Songs,,,,,,,,"[em &amp; 50]Man, lets go.<br>[eminem]hey yo f...",
2079507,57c92594e5c453a411c77447,57c92594e5c453a411c7734d,,Eminem & 50 Cent:The Re-Up,,english,Other Songs,,,,,,,,"[Intro - Eminem] <br><br>[Beatboxing] Yeah, w...",
2079508,57c92594e5c453a411c77448,57c92594e5c453a411c7734d,13,"Eminem, 50 Cent, Ca$his & Lloyd Banks:You Don'...",2006-12-01,english,Other Songs,,,170.0,-6.8,"[{'end': 2.65, 'label': 'Ebmin', 'start': 0.0}...",0.462999,258.011429,[ Intro ]<br>Shady...Yeah..<br><br>Who run it?...,


In [8]:
df_topics = pd.read_parquet('./csv/song-topic.parquet')
df_topics.rename(columns={'id_song': 'song_id'}, inplace=True)
df_song_emotion_topics = pd.merge(df_song_emotion, df_topics, on='song_id', how='left')
df_song_emotion_topics

Unnamed: 0,song_id,album_id,id_artist_deezer,title,song_publication_date,language_detect,album_title,album_genre,album_publication_date,bpm,gain,chord_sequence,chord_confidence,chord_duration,lyrics,emotions,topics
0,5714dec325ac0d8aee3804e7,5714debb25ac0d8aee34d59a,3412,Turn It Up,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,77.0,-29.4,"[{'end': 16.150000000000002, 'label': 'Amaj', ...",0.746509,93.023107,Turn it up<br>I don&apos;t know where you&apos...,,"[{'probability': '0.012696832', 'topic': '0'},..."
1,5714dec325ac0d8aee3804e8,5714debb25ac0d8aee34d59a,3412,Foghorn,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,106.0,-10.6,"[{'end': 1.6500000000000001, 'label': 'E7', 's...",0.807170,184.007596,"Sick of you, how old do you think you are?<br>...",,"[{'probability': '0.0122550735', 'topic': '0'}..."
2,5714dec325ac0d8aee3804e9,5714debb25ac0d8aee34d59a,3412,Cheeky Monkey,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,185.0,-10.4,"[{'end': 1.55, 'label': 'Emaj', 'start': 0.0},...",0.714021,216.007596,My name is Jason<br>I&apos;ll rock yer face in...,,"[{'probability': '0.013548387', 'topic': '2'},..."
3,5714dec325ac0d8aee3804ea,5714debb25ac0d8aee34d59a,3412,No. 1,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,96.0,-10.6,"[{'end': 4.65, 'label': 'Gmin7', 'start': 0.0}...",0.767492,230.009229,Got to get out more<br>Get in the place<br>Got...,,"[{'probability': '0.045793533', 'topic': '1'},..."
4,5714dec325ac0d8aee3804eb,5714debb25ac0d8aee34d59a,3412,Bad Idea,1998-06-22,english,How Ace Are Buildings,Alternative Rock,1997,101.0,-9.9,"[{'end': 1.7500000000000002, 'label': 'Amaj7',...",0.787385,141.010045,Bad idea (x4) <br><br>Ninety-nine per cent is ...,,"[{'probability': '0.011736132', 'topic': '0'},..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2079505,57c92594e5c453a411c77445,57c92594e5c453a411c7734d,,Your Not Ready,,english,Other Songs,,,,,,,,[Young Buck] (chorus)<br><br>You not ready for...,,"[{'probability': '0.018409366', 'topic': '2'},..."
2079506,57c92594e5c453a411c77446,57c92594e5c453a411c7734d,,Eminem & 50 Cent:Jimmy Crack Corn,,english,Other Songs,,,,,,,,"[em &amp; 50]Man, lets go.<br>[eminem]hey yo f...",,
2079507,57c92594e5c453a411c77447,57c92594e5c453a411c7734d,,Eminem & 50 Cent:The Re-Up,,english,Other Songs,,,,,,,,"[Intro - Eminem] <br><br>[Beatboxing] Yeah, w...",,
2079508,57c92594e5c453a411c77448,57c92594e5c453a411c7734d,13,"Eminem, 50 Cent, Ca$his & Lloyd Banks:You Don'...",2006-12-01,english,Other Songs,,,170.0,-6.8,"[{'end': 2.65, 'label': 'Ebmin', 'start': 0.0}...",0.462999,258.011429,[ Intro ]<br>Shady...Yeah..<br><br>Who run it?...,,


In [21]:
df_aritist_without_members = pd.read_csv('./csv/artist-without-members.csv')
df_aritist_without_members = df_aritist_without_members[["_id.$oid", 'id_artist_deezer', 'name', 'locationInfo', "type", "gender"]]
df_aritist_without_members.rename(columns={'_id.$oid': 'artist_id',
                                           'name': 'artist_name',
                                           'locationInfo': "artist_location",
                                           'type': 'artist_type',
                                           'gender': 'artist_gender'}, inplace=True)
df_aritist_without_members.to_csv('./proc/artist.csv', index=False)

  df_aritist_without_members = pd.read_csv('./csv/artist-without-members.csv')


In [24]:
df_aritist_without_members

Unnamed: 0,artist_id,id_artist_deezer,artist_name,artist_location,artist_type,artist_gender
0,56d7e91b6b60c09814f93e4a,3412.0,A,"['England', 'West Yorkshire', 'Leeds']",Group,
1,56d7e91c6b60c09814f93e4c,,A (エース) (ACE),['Japan'],Group,
2,56d7e91d6b60c09814f93e4e,242156.0,A Balladeer,[],Group,
3,56d7e91e6b60c09814f93e50,4708137.0,A Beautiful Silence,"['United States', 'Michigan', 'Marquette']",,
4,56d7e91e6b60c09814f93e52,1006041.0,A Band Called Pain,"['United States', 'California', 'Oakland']",Group,
...,...,...,...,...,...,...
77487,56d997b1cc2ddd0c0f6bf2d2,96732.0,Zara,"['Turkey', 'Marmara Region', 'Istanbul']",Person,
77488,56d997b3cc2ddd0c0f6bf2d3,66370.0,Zebra,['United States'],Group,
77489,56d997b3cc2ddd0c0f6bf2d4,4098622.0,Zebra & Giraffe,"['South Africa', 'Gauteng']",Group,
77490,56d997b3cc2ddd0c0f6bf2d5,,Zearle,"['United States', 'Missouri']",Person,


In [10]:
df_album = pd.read_csv('./csv/album.csv')
df_album = df_album[["_id.$oid", 'title', "id_artist.$oid", 'publicationDate', "length", "genre", "country"]]
df_album.rename(columns={'_id.$oid': 'album_id', 'id_artist.$oid': 'artist_id', 'title': 'title', 'publicationDate': 'publication_date'}, inplace=True)
df_album.to_csv('./proc/album.csv', index=False)

  df_album = pd.read_csv('./csv/album.csv')


In [11]:
df_album

Unnamed: 0,album_id,title,artist_id,publication_date,length,genre,country
0,5714debb25ac0d8aee34d59a,How Ace Are Buildings,56d7e91b6b60c09814f93e4a,1997,57:52,Alternative Rock,GB
1,5714debb25ac0d8aee34d59b,A Vs. Monkey Kong,56d7e91b6b60c09814f93e4a,1999,50:29,Alternative Rock,GB
2,5714debb25ac0d8aee34d59c,Hi-Fi Serious,56d7e91b6b60c09814f93e4a,2002,45:36,Alternative Rock,GB
3,5714debb25ac0d8aee34d59d,Teen Dance Ordinance,56d7e91b6b60c09814f93e4a,2005,47:30,Alternative Rock,GB
4,5714debb25ac0d8aee34d59e,Non-Album Tracks,56d7e91b6b60c09814f93e4a,,,,
...,...,...,...,...,...,...,...
208738,57c92594e5c453a411c77328,5 (Murder By Numbers),57c92593e5c453a411c77256,2012,31:29,Hip Hop,US
208739,57c92594e5c453a411c77333,Animal Ambition,57c92593e5c453a411c77256,2014,39:14,Hip Hop,
208740,57c92594e5c453a411c77342,Street King Immortal,57c92593e5c453a411c77256,2014,,,
208741,57c92594e5c453a411c77348,Songs Featuring 50 Cent,57c92593e5c453a411c77256,,,,


In [12]:
df_song_emotion_topics.columns

Index(['song_id', 'album_id', 'id_artist_deezer', 'title',
       'song_publication_date', 'language_detect', 'album_title',
       'album_genre', 'album_publication_date', 'bpm', 'gain',
       'chord_sequence', 'chord_confidence', 'chord_duration', 'lyrics',
       'emotions', 'topics'],
      dtype='object')

In [None]:
df_song_emotion_topics_album = pd.merge(df_song_emotion_topics, df_album, on='album_id', how='left')

In [19]:
df_song_emotion_topics_album.columns

Index(['song_id', 'album_id', 'id_artist_deezer', 'title_x',
       'song_publication_date', 'language_detect', 'album_title',
       'album_genre', 'album_publication_date', 'bpm', 'gain',
       'chord_sequence', 'chord_confidence', 'chord_duration', 'lyrics',
       'emotions', 'topics', 'title_y', 'artist_id', 'publication_date',
       'length', 'genre', 'country'],
      dtype='object')

In [25]:
df_song_emotion_topics_album_artist = pd.merge(df_song_emotion_topics_album, df_aritist_without_members, on='artist_id', how='left')

In [27]:
df_song_emotion_topics_album_artist.columns

Index(['song_id', 'album_id', 'id_artist_deezer_x', 'title_x',
       'song_publication_date', 'language_detect', 'album_title',
       'album_genre', 'album_publication_date', 'bpm', 'gain',
       'chord_sequence', 'chord_confidence', 'chord_duration', 'lyrics',
       'emotions', 'topics', 'title_y', 'artist_id', 'publication_date',
       'length', 'genre', 'country', 'id_artist_deezer_y', 'artist_name',
       'artist_location', 'artist_type', 'artist_gender'],
      dtype='object')

In [None]:
df_song_emotion_topics_album_artist.drop(columns=['id_artist_deezer_x', 'id_artist_deezer_y'], inplace=True)

In [37]:
df_song_emotion_topics_album_artist.drop(columns=['title_y', 'publication_date', 'genre'], inplace=True)

In [38]:
df_song_emotion_topics_album_artist.columns

Index(['song_id', 'album_id', 'title_x', 'song_publication_date',
       'language_detect', 'album_title', 'album_genre',
       'album_publication_date', 'bpm', 'gain', 'chord_sequence',
       'chord_confidence', 'chord_duration', 'lyrics', 'emotions', 'topics',
       'artist_id', 'length', 'country', 'artist_name', 'artist_location',
       'artist_type', 'artist_gender'],
      dtype='object')

In [39]:
df_song_emotion_topics_album_artist.rename(columns={'title_x': 'song_title',
                                                    'length': 'album_length',
                                                    'country': 'album_country'}, inplace=True)

In [40]:
columns = ['song_id', 'song_title', 'song_publication_date', 'language_detect', 'bpm', 'gain', 'chord_sequence', 'chord_confidence', 'chord_duration', 'lyrics', 'emotions', 'topics',
            'artist_id', 'artist_name', 'artist_location', 'artist_type', 'artist_gender',
            'album_id', 'album_title', 'album_genre','album_publication_date','album_length', 'album_country']
df_song_emotion_topics_album_artist = df_song_emotion_topics_album_artist[columns]
df_song_emotion_topics_album_artist.head()

Unnamed: 0,song_id,song_title,song_publication_date,language_detect,bpm,gain,chord_sequence,chord_confidence,chord_duration,lyrics,...,artist_name,artist_location,artist_type,artist_gender,album_id,album_title,album_genre,album_publication_date,album_length,album_country
0,5714dec325ac0d8aee3804e7,Turn It Up,1998-06-22,english,77.0,-29.4,"[{'end': 16.150000000000002, 'label': 'Amaj', ...",0.746509,93.023107,Turn it up<br>I don&apos;t know where you&apos...,...,A,"['England', 'West Yorkshire', 'Leeds']",Group,,5714debb25ac0d8aee34d59a,How Ace Are Buildings,Alternative Rock,1997,57:52,GB
1,5714dec325ac0d8aee3804e8,Foghorn,1998-06-22,english,106.0,-10.6,"[{'end': 1.6500000000000001, 'label': 'E7', 's...",0.80717,184.007596,"Sick of you, how old do you think you are?<br>...",...,A,"['England', 'West Yorkshire', 'Leeds']",Group,,5714debb25ac0d8aee34d59a,How Ace Are Buildings,Alternative Rock,1997,57:52,GB
2,5714dec325ac0d8aee3804e9,Cheeky Monkey,1998-06-22,english,185.0,-10.4,"[{'end': 1.55, 'label': 'Emaj', 'start': 0.0},...",0.714021,216.007596,My name is Jason<br>I&apos;ll rock yer face in...,...,A,"['England', 'West Yorkshire', 'Leeds']",Group,,5714debb25ac0d8aee34d59a,How Ace Are Buildings,Alternative Rock,1997,57:52,GB
3,5714dec325ac0d8aee3804ea,No. 1,1998-06-22,english,96.0,-10.6,"[{'end': 4.65, 'label': 'Gmin7', 'start': 0.0}...",0.767492,230.009229,Got to get out more<br>Get in the place<br>Got...,...,A,"['England', 'West Yorkshire', 'Leeds']",Group,,5714debb25ac0d8aee34d59a,How Ace Are Buildings,Alternative Rock,1997,57:52,GB
4,5714dec325ac0d8aee3804eb,Bad Idea,1998-06-22,english,101.0,-9.9,"[{'end': 1.7500000000000002, 'label': 'Amaj7',...",0.787385,141.010045,Bad idea (x4) <br><br>Ninety-nine per cent is ...,...,A,"['England', 'West Yorkshire', 'Leeds']",Group,,5714debb25ac0d8aee34d59a,How Ace Are Buildings,Alternative Rock,1997,57:52,GB


In [41]:
df_song_emotion_topics_album_artist.to_csv('./proc/song.csv', index=False)
df_song_emotion_topics_album_artist.to_parquet('./proc/song.parquet')