In [None]:
'''
author: Alexander Staub
Date: 2024-11-07
Description: Script to create a random sample of 4500 albums (300 per country) for Alessio to use to test the new Spotify code
meant to scrape Spotify audio characteristics

'''

In [1]:
#install necessary packages

#library loading

import os
import pandas as pd
import json # to read json files

In [2]:
# Set seeds for reproducibility.
#
# pandas' DataFrame.sample() draws from NumPy's global random state when no
# `random_state` is passed; it never reads Python's `random` module. Seeding
# only `random` therefore leaves the album sample non-reproducible, so both
# generators are seeded here.
import random

import numpy as np

random.seed(420)
np.random.seed(420)

In [3]:
# Read the album-level MusicBrainz export into `albums`

# location of the release / country / label export (with artists)
file_path = r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_country_labels_80_10_w_artists.csv"
albums = pd.read_csv(filepath_or_buffer=file_path)

# plain-text preview of the first five rows
print(albums.head(n=5))

   Unnamed: 0  id_release  date_year  date_month  date_day  id_country  \
0           1     1968996       2010         NaN       NaN          14   
1           2     1427936       2005         NaN       NaN          14   
2           3       68494       1994         9.0      19.0          14   
3           4       97102       1997         NaN       NaN          14   
4           5      116365       1989         NaN       NaN          14   

  name_country          name_label      name_label_type  \
0      Austria             monkey.            Publisher   
1      Austria  Non Visual Objects  Original Production   
2      Austria  KOCH International           Production   
3      Austria  KOCH International           Production   
4      Austria              Amadeo  Original Production   

                           mbid_release        name_release  \
0  716787a0-b8e0-43fd-a056-c228015c1b75     Maximum Minisex   
1  69020faf-0f54-463b-adc7-852ebb1b693c           Bradycard   
2  fa29306e

In [4]:
# Read the release-recordings export, which is split across four CSV parts.
# Each part carries recording rows keyed by release ID so they can later be
# merged onto the albums.

file_path_csv_1, file_path_csv_2, file_path_csv_3, file_path_csv_4 = (
    r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_recordings_unique_80_10_part_1.csv",
    r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_recordings_unique_80_10_part_2.csv",
    r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_recordings_unique_80_10_part_3.csv",
    r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_recordings_unique_80_10_part_4.csv",
)

# one dataframe per part, read in order
recordings_1, recordings_2, recordings_3, recordings_4 = (
    pd.read_csv(part_path)
    for part_path in (file_path_csv_1, file_path_csv_2, file_path_csv_3, file_path_csv_4)
)

In [5]:
# Stack the four separate recordings parts into one single recordings table,
# renumbering the row index from 0
recording_parts = [recordings_1, recordings_2, recordings_3, recordings_4]
recordings = pd.concat(recording_parts, ignore_index=True)

In [6]:
# Draw a reproducible random sample of up to 300 albums per distinct country
# in the "name_country" column of the albums dataset.

# Sample size per country and the seed handed to pandas. DataFrame.sample()
# does not read the state of Python's `random` module, so the seed has to be
# passed explicitly through `random_state` for the draw to be repeatable.
N_PER_COUNTRY = 300
SAMPLE_SEED = 420

# get the unique countries; rows with a missing country can never satisfy the
# `==` filter below (it would select an empty frame), so they are left out
countries = albums["name_country"].dropna().unique()

# create a dictionary to store the samples
samples = {}

# loop over the countries and sample each one; a country with fewer than
# N_PER_COUNTRY rows contributes all of its rows instead of raising ValueError
for country in countries:
    country_albums = albums[albums["name_country"] == country]
    samples[country] = country_albums.sample(
        n=min(N_PER_COUNTRY, len(country_albums)),
        random_state=SAMPLE_SEED,
    )

# NOTE(review): rows are sampled, not unique releases — the later clean-up
# drops duplicate "id_release" values, so the final number of albums per
# country can end up below N_PER_COUNTRY. Confirm this is acceptable.

# concatenate the samples to one single dataframe
sample = pd.concat(list(samples.values()), ignore_index=True)

In [8]:
# First cleaning pass on the sampled albums

# keep only the rows whose release-group type is not the literal "Other"
# (rows with a missing type are kept at this stage)
not_other = sample["name_release_g_type"].ne("Other")
sample = sample[not_other]



# Creating the song-level dataset

The recordings need to be matched to the sampled albums via the release ID (`id_release`).

In [None]:
# Attach the recordings to the sampled albums via the shared "id_release" column
# NOTE(review): this merge uses `sample` before the album clean-up further down
# (duplicate releases, "Various Artists") has been applied — confirm that is intended.

# Keep only the columns needed from recordings and give the recording-level
# artist credit its own, unambiguous name
recordings_subset = (
    recordings
    .loc[:, ['id_release', 'name_recording', 'name_artist_credit']]
    .rename(columns={'name_artist_credit': 'name_artist_credit_recording'})
)

# Left join: every sampled album row is kept, with one row per matching recording
sample_songs = sample.merge(recordings_subset, how='left', on='id_release')

# Drop the "artist_credit_name" column that came from the album sample
sample_songs = sample_songs.drop(columns=["artist_credit_name"])

# Show the first few rows of the merged dataframe
sample_songs.head()

In [13]:
# Album-level clean-up of the sample

# In one pass:
#   - keep a single row per "id_release"
#   - discard rows credited to "Various Artists"
#   - discard rows with a missing artist credit or a missing release-group type
sample = (
    sample
    .drop_duplicates(subset="id_release")
    .loc[lambda frame: frame["artist_credit_name"] != "Various Artists"]
    .dropna(subset=["artist_credit_name", "name_release_g_type"])
)

# second name for the cleaned album table, to tell it apart from the
# song-level dataset built earlier
sample_albums = sample

In [14]:
# Remove duplicates from the songs dataframe.
#
# A song is treated as a duplicate when both its release name and its
# recording name repeat. The two columns are used as the key directly rather
# than through a temporary concatenated string column, for three reasons:
#   * the previous version dropped the helper column from `sample_songs`
#     instead of from the de-duplicated frame, so `sample_songs_clean` still
#     contained every duplicate;
#   * concatenated strings can collide ("ab" + "c" equals "a" + "bc");
#   * a missing recording name turned the whole concatenated key into NaN, so
#     all such rows looked like duplicates of one another.
# `sample_songs` itself is left untouched (no helper column is added to it).
sample_songs_clean = sample_songs.drop_duplicates(subset=['name_release', 'name_recording'])

In [None]:
# TODO: saving files — the steps below are placeholders; nothing is written to disk yet

#create the file paths


# save the "recordings" file as a json called "release_recordings_unique_80_10_complied.json" in the "sql_exports" folder


# save the "albums" file as a csv

# save the "songs" file as a csv

