In [None]:
'''
author: Alexander Staub
Date: 2024-09-26
Description: Script to create a random sample of albums for Alessio to use to test the new Spotify code,
which is meant to scrape Spotify audio characteristics

'''

In [1]:
#install necessary packages

#library loading

import os
import pandas as pd
import json # to read json files


In [2]:
# Set seeds for reproducibility.
# pandas' DataFrame.sample draws from NumPy's global random state when no
# random_state is passed, so seeding only Python's `random` module does not
# make pandas sampling repeatable. Both generators are seeded here.
import random

import numpy as np  # NumPy is a hard dependency of pandas, so it is available

random.seed(42)
np.random.seed(42)

In [3]:
# Work out where the project root sits relative to the notebook's location.
# Note: this only computes paths; the working directory itself is not changed.
current_dir = os.getcwd()

# Resolve the directory two levels ABOVE the current one, which is expected
# to be "music_data" (os.pardir is the '..' parent-directory component).
parent_steps = (os.pardir, os.pardir)
root_dir = os.path.abspath(os.path.join(current_dir, *parent_steps))

In [4]:
# Load the relevant data sets

# Album table: read the CSV export into a dataframe
file_path = r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_country_labels_80_10_w_artists.csv"
albums = pd.read_csv(filepath_or_buffer=file_path)


In [5]:
# Preview the first five rows of the album table
albums.head(n=5)

Unnamed: 0.1,Unnamed: 0,id_release,date_year,date_month,date_day,id_country,name_country,name_label,name_label_type,mbid_release,name_release,name_release_g_type,id_artist_credit,artist_credit_name
0,1,1968996,2010,,,14,Austria,monkey.,Publisher,716787a0-b8e0-43fd-a056-c228015c1b75,Maximum Minisex,Album,245346,Minisex
1,2,1427936,2005,,,14,Austria,Non Visual Objects,Original Production,69020faf-0f54-463b-adc7-852ebb1b693c,Bradycard,Album,466986,Heribert Friedl
2,3,68494,1994,9.0,19.0,14,Austria,KOCH International,Production,fa29306e-b128-4e19-a9ac-f0256696a688,Im Namen der Liebe,Album,49605,Brunner & Brunner
3,4,97102,1997,,,14,Austria,KOCH International,Production,587ae7f9-7432-3aec-bc3c-4c9b0f474a63,Who Dominates Who?,Album,210470,Accu§er
4,5,116365,1989,,,14,Austria,Amadeo,Original Production,89c82c55-11ad-43a0-b67a-58d882b4654f,Liagn & lochn,Album,56009,Ostbahn Kurti & Die Chefpartie


In [8]:
# Load the recordings data, which is stored as four separate CSV parts
file_path_csv_1 = r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_recordings_unique_80_10_part_1.csv"
file_path_csv_2 = r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_recordings_unique_80_10_part_2.csv"
file_path_csv_3 = r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_recordings_unique_80_10_part_3.csv"
file_path_csv_4 = r"Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\release_recordings_unique_80_10_part_4.csv"

# Read the parts in order (1 -> 4); each one becomes its own dataframe
recordings_1, recordings_2, recordings_3, recordings_4 = (
    pd.read_csv(part_path)
    for part_path in (file_path_csv_1, file_path_csv_2, file_path_csv_3, file_path_csv_4)
)


In [9]:
# Stack the four separate recordings parts into one table;
# ignore_index=True gives the combined table a fresh 0..n-1 index
recording_parts = [recordings_1, recordings_2, recordings_3, recordings_4]
recordings = pd.concat(recording_parts, ignore_index=True)

In [10]:
# Preview the first five rows of the combined recordings table
recordings.head(n=5)

Unnamed: 0,id_release,name_medium_format,id_track,mbid_track,id_recording,mbid_recording,name_recording,id_artist_credit,name_artist_credit
0,1275064,CD,41,8d13b868-aada-3aae-a31c-bc305528de52,19,823de184-19a6-4420-80b4-265afa81999c,Safe From Harm,4,Massive Attack
1,1275064,CD,42,6ba592c0-3788-3d6d-b900-ab5d64374054,18,032c4ce0-b1fd-442d-8bf1-b7777e4832e7,One Love,4,Massive Attack
2,1275064,CD,43,87cc2257-592a-3f54-ad0f-61380cb9b346,15,bef81f8f-4bcf-4308-bd66-e57018169a94,Blue Lines,4,Massive Attack
3,1275064,CD,44,7b6cd398-9867-3b70-9be8-8ee49f36bbb8,14,44bda0c6-7d93-4000-9b7f-b2b5a4025ad4,Be Thankful for What You’ve Got,4,Massive Attack
4,1275064,CD,45,8dc6d377-8f15-32d6-8545-1c13f33bbd40,11,b30b9943-9100-4d84-9ad2-69859ea88fbb,Five Man Army,4,Massive Attack


In [11]:
# Create the random sample of 300 albums.
# random_state is passed explicitly because DataFrame.sample draws from NumPy's
# random state; a seed set through Python's `random` module does not affect it,
# so without this argument every run returns a different set of rows.
albums_300_sample = albums.sample(n=300, random_state=42)

# Display the first few rows of the sampled dataframe
albums_300_sample.head()


Unnamed: 0.1,Unnamed: 0,id_release,date_year,date_month,date_day,id_country,name_country,name_label,name_label_type,mbid_release,name_release,name_release_g_type,id_artist_credit,artist_credit_name
466492,466493,604962,2007,4.0,10.0,222,United States,KOCH Records,Original Production,8c76f144-4ff6-41ce-ac8a-9131ca0c72c2,Angels of Shanghai,Album,3222,Bob James
593437,593438,1077739,1998,6.0,9.0,222,United States,Pablo,Original Production,2f9c7586-9db9-4714-a3d0-73f5b00c7872,At the Kosei Nenkin,Album,47596,Milt Jackson
145487,145488,2080443,1995,11.0,20.0,221,United Kingdom,Capitol Records,Imprint,a614660e-ffc5-4996-a582-9d2d9e1c6b77,For All the Cows,Single,176,Foo Fighters
100591,100592,405452,2001,,,81,Germany,Royal Bunker,Original Production,28e3deb8-9b7c-466f-9f0c-7af692d819e5,Neue Wahrheit,Album,976541,Justus
736705,736706,3312104,2005,11.0,7.0,105,Italy,[no label],Production,0cec7dd5-6fc3-4af5-8baa-cf236946e2e9,City Streets,Album,651315,Unhindered


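## Recreating the old sample as the seed doesn't seem to have worked
The original draw was seeded only through Python's `random` module, which pandas' `sample` does not use (it relies on NumPy's random state), so that draw could not be reproduced.
First I need to load in the old sample created.
Then I need to deduplicate to the album level and remove the columns 'name_recording' and 'name_artist_credit_recording'.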

In [17]:
# Read the previously created album/recording sample back in
filepath_old_sample = r"Z:\Data_alexander\data\interim_data\random_300_album_recording_samples\24_09\albums_300_recordings_matched_rand_full.csv"
albums_old_sample = pd.read_csv(filepath_or_buffer=filepath_old_sample)

# Preview the first five rows
albums_old_sample.head(n=5)

Unnamed: 0.1,Unnamed: 0,id_release,date_year,date_month,date_day,id_country,name_country,name_label,name_label_type,mbid_release,name_release,name_release_g_type,id_artist_credit,artist_credit_name,name_recording,name_artist_credit_recording
0,336951,9008,2001,,,222,United States,Beta-lactam Ring Records,Production,02a71e5d-0967-3e35-a271-b21f82a10d60,Kollabaris,Album,38426,The Legendary Pink Dots,A Small Swan Song,Karolinek
1,336951,9008,2001,,,222,United States,Beta-lactam Ring Records,Production,02a71e5d-0967-3e35-a271-b21f82a10d60,Kollabaris,Album,38426,The Legendary Pink Dots,Bring the Rain,Artwork
2,336951,9008,2001,,,222,United States,Beta-lactam Ring Records,Production,02a71e5d-0967-3e35-a271-b21f82a10d60,Kollabaris,Album,38426,The Legendary Pink Dots,Bump (Ver 2),The Tear Garden
3,336951,9008,2001,,,222,United States,Beta-lactam Ring Records,Production,02a71e5d-0967-3e35-a271-b21f82a10d60,Kollabaris,Album,38426,The Legendary Pink Dots,Catchy Tune,Mimir
4,336951,9008,2001,,,222,United States,Beta-lactam Ring Records,Production,02a71e5d-0967-3e35-a271-b21f82a10d60,Kollabaris,Album,38426,The Legendary Pink Dots,Message 3,The Tear Garden


In [19]:
# Collapse the old sample back to album level: drop the two recording-specific
# columns, then keep only the first row seen for each "id_release"
recording_only_columns = ["name_recording", "name_artist_credit_recording"]
albums_300_sample = (
    albums_old_sample
    .drop(columns=recording_only_columns)
    .drop_duplicates(subset=["id_release"])
)

albums_300_sample.head()

Unnamed: 0.1,Unnamed: 0,id_release,date_year,date_month,date_day,id_country,name_country,name_label,name_label_type,mbid_release,name_release,name_release_g_type,id_artist_credit,artist_credit_name
0,336951,9008,2001,,,222,United States,Beta-lactam Ring Records,Production,02a71e5d-0967-3e35-a271-b21f82a10d60,Kollabaris,Album,38426,The Legendary Pink Dots
8,1005896,426258,2000,4.0,11.0,222,United States,Shanachie,Original Production,490e090a-dd5a-4ac6-8b92-4d793d2dea91,Little Wooden Box,Album,8624,Wayne Toups & Zydecajun
18,451981,653065,1982,4.0,,105,Italy,Warner Bros. Records,Imprint,ebb0706c-a1b7-33c3-8a48-f79626d59c0b,Big Science,Album,10298,Laurie Anderson
24,519182,157383,2000,2.0,9.0,202,Sweden,EVA Records,Production,f47c615f-ca58-442c-bf87-ec81ce5a2733,Absolute Love 2000,Album,1,Various Artists
36,945546,2765872,1996,,,222,United States,Relativity Records,Original Production,80976978-0f90-4ae0-b743-88404e6c7771,The Bitch in Yoo / The Real Weight,Single,2761957,


In [None]:
# Attach the recordings information to the 300 sampled albums via "id_release"

# Keep only the columns needed from recordings; the artist-credit column is
# renamed to mark it as belonging to the recording
recordings_subset = recordings.loc[
    :, ['id_release', 'name_recording', 'name_artist_credit']
].rename(columns={'name_artist_credit': 'name_artist_credit_recording'})

# Left join on id_release: every sampled album row is kept, including albums
# that have no matching recording
albums_300_recordings_matched = albums_300_sample.merge(
    recordings_subset, on='id_release', how='left'
)

# Preview the first few rows of the matched table
albums_300_recordings_matched.head()

## Adding in the subset of albums Bollicine

Get all the albums with id_release = 2536476, 2739435 or 29801.
Merge in all the songs associated with id_release = 2536476, 2739435 or 29801.
Concatenate the resulting table to the 300-album sample.

In [21]:
# Pull the rows with id_release 2536476, 2739435 or 29801 out of the "albums" table
bollicine_release_ids = [2536476, 2739435, 29801]
is_bollicine_release = albums['id_release'].isin(bollicine_release_ids)
albums_bollicine = albums[is_bollicine_release]

In [22]:
# Left-join the recordings onto albums_bollicine via "id_release"
albums_bollicine_recordings_matched = albums_bollicine.merge(
    recordings_subset, on='id_release', how='left'
)

In [23]:
# Append the albums_bollicine rows below the 300-album rows and renumber the index.
# Note: the result overwrites albums_300_recordings_matched, so re-running this
# cell on its own appends the same rows a second time.
tables_to_stack = [albums_300_recordings_matched, albums_bollicine_recordings_matched]
albums_300_recordings_matched = pd.concat(tables_to_stack, ignore_index=True)

In [24]:
# Remove duplicate recordings from the matched table.
# A row counts as a duplicate when both "name_release" and "name_recording"
# repeat; the first occurrence is kept.
# The two columns are compared directly rather than through a concatenated
# helper string, because:
#   - string + NaN is NaN, so every album without a matched recording (possible
#     after the left joins) shared one NaN key and collapsed into a single row;
#   - without a separator, different pairs can concatenate to the same text
#     (e.g. "AB" + "C" and "A" + "BC");
#   - the helper column was written into albums_300_recordings_matched itself.
albums_300_recordings_matched_clean = albums_300_recordings_matched.drop_duplicates(
    subset=['name_release', 'name_recording']
)




In [25]:
# Draw three random subsets (100, 500 and 1500 rows) from the de-duplicated
# table. Every draw uses the same random_state.
def _draw_recording_rows(n_rows):
    """Return n_rows randomly chosen rows of the cleaned table (random_state=42)."""
    return albums_300_recordings_matched_clean.sample(n=n_rows, random_state=42)


# 100-row subset
albums_300_recordings_matched_rand_100 = _draw_recording_rows(100)

# 500-row subset
albums_300_recordings_matched_rand_500 = _draw_recording_rows(500)

# 1500-row subset
albums_300_recordings_matched_rand_1500 = _draw_recording_rows(1500)


In [27]:
# Write the three random subsets and the full cleaned table to disk
file_path_100 = r"Z:\Data_alexander\data\interim_data\random_300_album_recording_samples\24_10\albums_300_recordings_matched_rand_100.csv"
file_path_500 = r"Z:\Data_alexander\data\interim_data\random_300_album_recording_samples\24_10\albums_300_recordings_matched_rand_500.csv"
file_path_1500 = r"Z:\Data_alexander\data\interim_data\random_300_album_recording_samples\24_10\albums_300_recordings_matched_rand_1500.csv"
file_path_full = r"Z:\Data_alexander\data\interim_data\random_300_album_recording_samples\24_10\albums_300_recordings_matched_rand_full.csv"

# Save each dataframe as a CSV file without the index column,
# in the order 100 -> 500 -> 1500 -> full
for frame_to_save, target_path in (
    (albums_300_recordings_matched_rand_100, file_path_100),
    (albums_300_recordings_matched_rand_500, file_path_500),
    (albums_300_recordings_matched_rand_1500, file_path_1500),
    (albums_300_recordings_matched_clean, file_path_full),
):
    frame_to_save.to_csv(target_path, index=False)