	# Author: Alexander Staub
	## Last changed: 2025.02.20
	## Purpose: cutting down the large json with 11 million songs into a dataset to check the overlap with acousticbrainz with

# Purpose

- remove duplicate recording_id level information
- remove songs that don't have an associated record label
- remove entries that are not really songs

In [17]:
# load necessary packages
#installing packages
import pandas as pd
from datetime import datetime
import json
import re  # Import the regular expression module

In [5]:
# load in the json from Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports named release_recordings_unique_80_10_complied.json

file_path_mb_songs = 'Z:/Data_alexander/data/raw_data/musicbrainz/sql_exports/release_recordings_unique_80_10_complied.json'
file_path_mb_albums = 'Z:/Data_alexander/data/raw_data/musicbrainz/sql_exports/release_country_labels_80_10.csv'

mb_songs = pd.read_csv(file_path_mb_songs)
mb_releases = pd.read_csv(file_path_mb_albums)

In [6]:
#print heads of both dataframes
print(mb_songs.head())
print(mb_releases.head())

   id_release name_medium_format  id_track  \
0     1275064                 CD        41   
1     1275064                 CD        42   
2     1275064                 CD        43   
3     1275064                 CD        44   
4     1275064                 CD        45   

                             mbid_track  id_recording  \
0  8d13b868-aada-3aae-a31c-bc305528de52            19   
1  6ba592c0-3788-3d6d-b900-ab5d64374054            18   
2  87cc2257-592a-3f54-ad0f-61380cb9b346            15   
3  7b6cd398-9867-3b70-9be8-8ee49f36bbb8            14   
4  8dc6d377-8f15-32d6-8545-1c13f33bbd40            11   

                         mbid_recording                   name_recording  \
0  823de184-19a6-4420-80b4-265afa81999c                   Safe From Harm   
1  032c4ce0-b1fd-442d-8bf1-b7777e4832e7                         One Love   
2  bef81f8f-4bcf-4308-bd66-e57018169a94                       Blue Lines   
3  44bda0c6-7d93-4000-9b7f-b2b5a4025ad4  Be Thankful for What You’ve Got   


In [8]:
#create a a summary of counts of the column name_release_g_type
print(mb_releases['name_release_g_type'].value_counts())

name_release_g_type
Album        741201
Single       164270
EP            75771
Other         12343
Broadcast      1030
Name: count, dtype: int64


In [12]:
#create seperate dataframes of Broadcasts and Other type
mb_broadcasts = mb_releases[mb_releases['name_release_g_type'] == 'Broadcast']

mb_other = mb_releases[mb_releases['name_release_g_type'] == 'Other']

In [None]:
# Convert the "name_release" column to a string and print it
name_release_str = mb_other['name_release'].to_string(index=False)

# Print the string
print(name_release_str)

In [9]:
#create a summary of counts of the column name_label 
print(mb_releases['name_label'].value_counts())

name_label
[no label]            25070
Columbia              14034
Polydor                9049
EMI                    8372
Virgin                 7992
                      ...  
EsionGod                  1
Mediarock                 1
Qasida                    1
Empty Boat Records        1
Sambuca Records           1
Name: count, Length: 83137, dtype: int64


In [16]:
# remove all observations where the name_label == [no label]
mb_releases_filtered = mb_releases[mb_releases['name_label'] != '[no label]']

In [18]:
#remove all observations where the name_release_g_type == Broadcast
mb_releases_filtered = mb_releases_filtered[mb_releases_filtered['name_release_g_type'] != 'Broadcast']

In [28]:
#recategorize all observations where the name_release_g_type == Other to Album and matches the expressions below
def identify_and_recategorize_compilations(df):
    """
    Identifies potential compilation/greatest hits releases in a DataFrame
    and recategorizes them as "album".

    Args:
        df: pandas DataFrame containing MusicBrainz release data.  Must have
            columns named 'name_release' and 'name_release_g_type'.

    Returns:
        A tuple containing:
          1. A new pandas DataFrame with the identified releases recategorized.
          2. A list of the release names that were recategorized.
    """

    df_copy = mb_releases_filtered.copy()
    compilation_names = []  # Initialize an empty list to store identified names

    # --- Identification Logic (Regular Expressions) ---
    patterns = [
        r".*(greatest hits|best of|collection|anthology|years|sounds of|classics).*",  # Common keywords
        r".*(compilation|sampler).*", #compilation and sampler
        r".*(\bvol\b|\bvolume\b).*\d+.*", # Volumes
        r".*\d{4}[s]?\b.*", # Decades
        r".*(the essential|ultimate|definitive).*",
        r".*(gold|platinum|diamond|anniversary).*",
        r".*edition.*",
        r".*(\bHits\b).*"
    ]

    for index, row in df_copy.iterrows():
      name_release = row['name_release']
      if isinstance(name_release, str):  # Make sure the name_release is a string.
        name_release_lower = name_release.lower()
        for pattern in patterns:
            if re.match(pattern, name_release_lower):
                if (
                    "demo" not in name_release_lower and
                    "interview" not in name_release_lower and
                    #"live" not in name_release_lower and
                    "session" not in name_release_lower and
                    "promo" not in name_release_lower and
                    "soundtrack" not in name_release_lower and  # Exclude soundtracks of the *score*
                    "sampler" not in name_release_lower and
                    ("soundtrack" not in name_release_lower or (("various" in name_release_lower or "artists" in name_release_lower) and "soundtrack" in name_release_lower))
                ):
                    compilation_names.append(name_release)
                    break #once matched, no need to check other paterns

    # --- Recategorization (using .loc, as before) ---
    df_copy.loc[df_copy['name_release'].isin(compilation_names), 'name_release_g_type'] = 'Album'

    return df_copy, compilation_names

In [29]:
#Call the function to identify and recategorize
mb_releases_recategorized, identified_compilations = identify_and_recategorize_compilations(mb_releases_filtered)

In [30]:
#Verification, to check how many rows were changes, and to quickly visualize
#the changes
print("\nVerification")
print(mb_releases_recategorized[mb_releases_recategorized["name_release_g_type"]=="Album"][["name_release", "name_release_g_type"]])
print(f"\n Number of rows changed:  {(mb_releases_recategorized['name_release_g_type'] == 'Album').sum()}")
print(f"\n Number of compilations identified: {len(identified_compilations)}")


Verification
                  name_release name_release_g_type
0              Maximum Minisex               Album
1                    Bradycard               Album
2           Im Namen der Liebe               Album
3           Who Dominates Who?               Album
4                Liagn & lochn               Album
...                        ...                 ...
1009562  Southern Rock Masters               Album
1009563                  Actor               Album
1009564       Burning Memories               Album
1009565                  Karma               Album
1009566           Appassionato               Album

[733503 rows x 2 columns]

 Number of rows changed:  733503

 Number of compilations identified: 103901


In [32]:
#create a summary of counts of the column name_label 
print(mb_releases_recategorized['name_release_g_type'].value_counts())
print(mb_releases_filtered['name_release_g_type'].value_counts())

name_release_g_type
Album     733503
Single    159878
EP         67009
Other      10507
Name: count, dtype: int64
name_release_g_type
Album     725686
Single    162656
EP         69473
Other      11432
Name: count, dtype: int64


In [33]:
#remove all observations where the name_release_g_type == Other
mb_releases_recategorized = mb_releases_recategorized[mb_releases_recategorized['name_release_g_type'] != 'Other']

In [34]:
# create a random sample of 500 rows as a new dataframe named "random_sample_releases" from the mb_releases_recategorized dataframe
random_sample_releases = mb_releases_recategorized.sample(n=500)

# Removal of duplicate recording ids and the releases that were removed from the songs dataframe

In [36]:
#remove all duplicates of the recording_id column in the mb_songs dataframe
mb_songs_unique_id = mb_songs.drop_duplicates(subset='id_recording')

In [37]:
#remove all rows that don't have the release ids of the mb_releases_recategorized dataframe in the mb_songs_unique_id dataframe
# Get the list of unique id_release values from mb_releases_recategorized
valid_id_releases = mb_releases_recategorized['id_release'].unique()

# Filter mb_songs_unique_id to keep only rows with id_release values present in valid_id_releases
mb_songs_unique_id = mb_songs_unique_id[mb_songs_unique_id['id_release'].isin(valid_id_releases)]


In [38]:
# save both mb_songs_unique_id and mb_releases_recategorized as csv files in 
# the Z:\Data_alexander\data\raw_data\musicbrainz\sql_exports\musicbrainz_data_refined folder
mb_songs_unique_id.to_csv('Z:/Data_alexander/data/raw_data/musicbrainz/sql_exports/musicbrainz_data_refined/release_recordings_unique_80_10_refined.csv', index=False)
mb_releases_recategorized.to_csv('Z:/Data_alexander/data/raw_data/musicbrainz/sql_exports/musicbrainz_data_refined/release_country_labels_80_10_refined.csv', index=False)