In [1]:
import os
import pickle
import shutil
import zipfile

import pandas as pd
from huggingface_hub import snapshot_download

# Download the dataset (if it hasn't been downloaded already)
repo_id = "asigalov61/Annotated-MIDI-Dataset"
repo_type = 'dataset'
local_dir = "./Annotated-MIDI-Dataset"

# NOTE(review): this guard only checks that the folder exists, so a download
# that was interrupted part-way is never resumed. Confirm whether calling
# snapshot_download unconditionally would be acceptable here.
if not os.path.exists(local_dir):
    snapshot_download(repo_id, repo_type=repo_type, local_dir=local_dir)
else:
    print(f"Dataset already downloaded at {local_dir}")

# Unzip the main archive
zip_file_path = os.path.join(local_dir, "Annotated-MIDI-Dataset-CC-BY-NC-SA.zip")
extract_dir = os.path.join(local_dir, "unzipped_data")

if os.path.exists(extract_dir):
    print(f"Dataset already unzipped at {extract_dir}")
else:
    # Extract into a staging folder and rename it only after extractall()
    # succeeded. Creating extract_dir up front would make a failed or
    # interrupted extraction look like a finished one on the next run.
    staging_dir = extract_dir + ".partial"
    shutil.rmtree(staging_dir, ignore_errors=True)  # discard leftovers of an earlier failed attempt
    os.makedirs(staging_dir)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(staging_dir)
    os.rename(staging_dir, extract_dir)
    print(f"Dataset unzipped to {extract_dir}")

  from .autonotebook import tqdm as notebook_tqdm


Dataset already downloaded at ./Annotated-MIDI-Dataset
Dataset unzipped to ./Annotated-MIDI-Dataset/unzipped_data


In [56]:
# Paths into the extracted archive (extract_dir comes from the download/unzip cell)
data_dir = os.path.join(extract_dir, "DATA")
midis_dir = os.path.join(extract_dir, "MIDIs")

# --- 1. Load Pickle Files (lyrics_summaries_df and songs_lyrics_df) ---


def _load_pickle_columns(path, columns):
    """Load a pickled table and keep only the selected columns, renamed.

    Parameters
    ----------
    path : str
        Location of the pickle file.
    columns : dict
        Maps each column label to keep (e.g. ``0``) to the name that column
        carries in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``columns``, in the order given.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only point this at files from a source you trust.
    with open(path, "rb") as f:
        records = pickle.load(f)
    return pd.DataFrame(records)[list(columns)].rename(columns=columns)


lyrics_summaries_df = _load_pickle_columns(
    os.path.join(data_dir, "lyrics_summaries.pickle"),
    {0: "song_name", 5: "lyrics_summary"},
)
songs_lyrics_df = _load_pickle_columns(
    os.path.join(data_dir, "songs_lyrics.pickle"),
    {0: "song_name", 2: "lyrics"},
)

# --- 2. Create MIDI Files DataFrame with Song Names ---

MIDI_SUFFIX = ".mid"

# The song name is the file name without its ".mid" suffix.
midi_files = [
    {"midi_file": os.path.join(root, file), "song_name": file[:-len(MIDI_SUFFIX)]}
    for root, _dirs, files in os.walk(midis_dir)
    for file in files
    if file.endswith(MIDI_SUFFIX)
]

# Pin the columns so the frame still has a 'song_name' key to merge on
# when no MIDI file was found.
midi_files_df = pd.DataFrame(midi_files, columns=["midi_file", "song_name"])

# --- 3. Merge DataFrames ---

# Outer merge: keep every song that appears in either lyrics table.
merged_df = pd.merge(lyrics_summaries_df, songs_lyrics_df, on="song_name", how="outer")

# Left merge: keep every song from merged_df, with or without a MIDI file.
final_df = pd.merge(merged_df, midi_files_df, on="song_name", how="left")

# --- 4. Filter and Clean ---
# Example: drop rows where lyrics or midi_file is missing.
# final_df = final_df.dropna(subset=["lyrics", "midi_file"])

# --- 5. Inspect ---
# DataFrame.info() writes its own report and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
final_df.info()
final_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14250 entries, 0 to 14249
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   song_name       14250 non-null  object
 1   lyrics_summary  14250 non-null  object
 2   lyrics          14250 non-null  object
 3   midi_file       14250 non-null  object
dtypes: object(4)
memory usage: 445.4+ KB
None


Unnamed: 0,song_name,lyrics_summary,lyrics,midi_file
0,('Till) I Kissed You --- The Everly Brothers,The speaker discovered an intense longing and ...,Never felt like this until I kissed you \r\nHo...,./Annotated-MIDI-Dataset/unzipped_data/MIDIs/(...
1,(Theme From) The Monkees --- The Monkees,The Monkees sing about their carefree and adve...,Here we come\r\nWalkin' down the street\r\nWe ...,./Annotated-MIDI-Dataset/unzipped_data/MIDIs/(...
2,(What a) Wonderful World --- Sam Cooke,Despite lacking knowledge in various academic ...,Don't know much about history\r\nDon't know mu...,./Annotated-MIDI-Dataset/unzipped_data/MIDIs/(...
3,(You Make Me Feel Like a) Natural Woman --- Ar...,The singer expresses feeling uninspired and lo...,Looking out on the morning rain\r\nI used to f...,./Annotated-MIDI-Dataset/unzipped_data/MIDIs/(...
4,(You Make Me Feel Like a) Natural Woman.1 --- ...,The lyrics describe how the speaker felt unins...,Looking out on the morning rain\r\nI used to f...,./Annotated-MIDI-Dataset/unzipped_data/MIDIs/(...
