## Import

In [1]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import os
import pytorch_lightning as pl


from pitchclass2vec import encoding, model
from pitchclass2vec.pitchclass2vec import Pitchclass2VecModel

from tasks.segmentation.data import BillboardDataset, SegmentationDataModule
from tasks.segmentation.functional import LSTMBaselineModel

import pitchclass2vec.model as model
import pitchclass2vec.encoding as encoding
from pitchclass2vec.data import ChocoDataModule

from evaluate import load_pitchclass2vec_model

# Single seed value for the notebook, handed to Lightning's global seeding helper.
RANDOM_SEED = 42
pl.seed_everything(RANDOM_SEED)
print("done")

Global seed set to 42


done


## Train Embedding Model

#### Use `root-interval` as the encoding method and `fasttext` as the embedding model

In [2]:
import shlex


def build_cli_command(script, args):
    """Render ``script`` followed by ``--name value`` pairs as one shell command line.

    Args:
        script: Command prefix, inserted verbatim (e.g. ``"python /app/train.py"``).
        args: Mapping of option name to value. Each value is converted with
            ``str`` and shell-quoted, so spaces or shell metacharacters in a
            path or run name cannot split or alter the command.

    Returns:
        The full command as a single string, with options in the mapping's
        iteration order.
    """
    parts = [script]
    for name, value in args.items():
        parts.append(f"--{name} {shlex.quote(str(value))}")
    return " ".join(parts)


# Configuration of the embedding-model training run; every entry becomes a
# `--key value` option of /app/train.py.
train_args = {
    'choco': "/app/choco_dataset/v1.0.0/",  # path to the ChoCo dataset
    'out': "/app/out",  # output path for the trained embedding model
    'encoding': "root-interval",  # name of the encoding method (not a path)
    'model': "fasttext",  # name of the embedding model (not a path)

    'batch_size': 512,
    'context': 5,
    'negative_sampling_k': 20,
    'embedding_dim': 100,
    'seed': 42,
    'max_epochs': 10,
    # NOTE(review): -1 presumably disables early stopping; the previous comment
    # ("keep training for 2 more epochs") described a patience of 2, not -1.
    # Confirm against train.py.
    'early_stop_patience': -1,

    'wandb_run_name': "first_run_with_whole_ChocoDataSet"
}

# Build the shell command that the next cell executes.
command = build_cli_command("python /app/train.py", train_args)
print(command)

print("done!")


python /app/train.py --choco /app/choco_dataset/v1.0.0/ --out /app/out --encoding root-interval --model fasttext --batch_size 512 --context 5 --negative_sampling_k 20 --embedding_dim 100 --seed 42 --max_epochs 10 --early_stop_patience -1 --wandb_run_name first_run_with_whole_ChocoDataSet
done!


In [3]:
# Run the training command in a subshell through IPython's `!` escape; `{command}`
# is interpolated from the Python variable built in the previous cell.
# NOTE: `!` does not raise on a non-zero exit status, so "done" is printed even
# if the training script fails — check the captured output above it.
!{command}
print("done")

Global seed set to 42
[34m[1mwandb[0m: Currently logged in as: [33mcretaceousmart[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.12
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/wandb/run-20231102_175554-ifca5kqb[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfirst_run_with_whole_ChocoDataSet[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/cretaceousmart/pitchclass2vec[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/cretaceousmart/pitchclass2vec/runs/ifca5kqb[0m
Jie Log: data_path: /app/choco_dataset/v1.0.0/jams
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    

# Segmentation baseline

In [2]:
# Experiments to evaluate, one tuple each: (encoding, embedding model, checkpoint path).
# Disabled alternatives are kept as comments so they can be switched back on.
EXP = [
    #("text", "fasttext", "out/fasttext_best/model.ckpt"),
    # ("timed-root-interval", "emb-weighted-fasttext", "/app/out/rootinterval_best/model.ckpt"),
    #("rdf", "randomwalk-rdf2vec", "out/rdf2vec_best/model.ckpt"),
    (
        "root-interval",
        "fasttext",
        "/app/out/first_run_with_whole_ChocoDataSet.ckpt",
    ),
]

# Empty results table; the training cell appends one row per experiment.
experiments_df = pd.DataFrame(
    columns=[
        # what was evaluated
        "encoding",
        "model",
        "path",
        # test metrics
        "test_p_precision",
        "test_p_recall",
        "test_p_f1",
        "test_under",
        "test_over",
        "test_under_over_f1",
    ]
)
print("done")

done


In [3]:
import logging
import wandb
from pathlib import Path
# Suppress every log record at CRITICAL level and below for the rest of the session.
logging.disable(logging.CRITICAL)

# Hyperparameters and run options for the LSTM segmentation baseline.
segmentation_train_args = {
    "test_mode" : False,  # If True, only 3 tracks are used (quick test run)
    "wandb_run_name" : "first_run.ckpt",
    "disable_wandb" : False,
    "num_labels" : 11,
    "embedding_dim" : None,  # None means: use p2v.vector_size
    "hidden_size" : 256,
    "num_layers" : 5,
    "dropout" : 0.2,
    "learning_rate" : 0.001,
}

# Directory that receives the segmentation model checkpoints.
out = "/app/segmentation_out"

# The W&B run name doubles as the checkpoint file stem. It does not depend on the
# experiment, so it is resolved once here; str() yields "None" when the key is
# missing, exactly like the f-strings used previously.
# NOTE(review): with the current name "first_run.ckpt" the fallback checkpoint
# path built below ends in ".ckpt.ckpt" — confirm this is intended.
# NOTE(review): every entry of EXP shares this run name and file stem — confirm
# before enabling more than one experiment.
file_name = str(segmentation_train_args.get("wandb_run_name"))
use_wandb = not segmentation_train_args.get("disable_wandb", False)

for exp in tqdm(EXP):
    # exp is (encoding, model, checkpoint path), unpacked into the loader.
    p2v = load_pitchclass2vec_model(*exp)
    data = SegmentationDataModule(
        dataset_cls=BillboardDataset,
        pitchclass2vec=p2v,
        batch_size=256,
        test_mode=segmentation_train_args.get("test_mode", True),
    )

    # TODO: figure out whether num_labels should be 8 or 11 (paper said 11 that could be correct, because 8 is not something from Billboard dataset but ChoCo dataset)

    # Resolve the embedding size once so the model and the logged W&B config can
    # never disagree: an explicit config value wins, otherwise the size of the
    # loaded embedding model is used.
    embedding_dim = segmentation_train_args["embedding_dim"] or p2v.vector_size

    lstm_model = LSTMBaselineModel(
        num_labels=segmentation_train_args["num_labels"],
        embedding_dim=embedding_dim,
        hidden_size=segmentation_train_args["hidden_size"],
        num_layers=segmentation_train_args["num_layers"],
        dropout=segmentation_train_args["dropout"],
        learning_rate=segmentation_train_args["learning_rate"],
    )

    if use_wandb:
        wandb.init(
            # Project under which this run is logged
            project="pitchclass2vec_Segmentation",
            name=file_name,
            # Hyperparameters tracked with the run
            config={
                "num_labels": segmentation_train_args["num_labels"],
                "embedding_dim": embedding_dim,
                "hidden_size": segmentation_train_args["hidden_size"],
                "num_layers": segmentation_train_args["num_layers"],
                "dropout": segmentation_train_args["dropout"],
                "learning_rate": segmentation_train_args["learning_rate"],
            },
        )
        wandb.watch(lstm_model)

    # Keep only the single best checkpoint according to the monitored metric.
    # NOTE(review): the results table produced by this notebook contains a
    # "train_loss" column — confirm the model really logs a metric named
    # "train/loss", otherwise the monitor key will not match.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        save_top_k=1,
        monitor="train/loss",
        mode="min",
        dirpath=out,
        filename=file_name,
        every_n_epochs=1,
    )

    trainer = pl.Trainer(
        max_epochs=150,
        accelerator="auto",
        devices=1,
        enable_progress_bar=False,
        callbacks=[checkpoint_callback],
    )

    trainer.fit(lstm_model, data)

    if use_wandb:
        # Upload the checkpoint only when a W&B run exists; calling wandb.save
        # with W&B disabled would fail because no run was initialised.
        # Prefer the path the callback actually wrote and fall back to the
        # conventional "<out>/<file_name>.ckpt" location if it reports none.
        checkpoint_path = checkpoint_callback.best_model_path or str(
            Path(out) / f"{file_name}.ckpt"
        )
        wandb.save(checkpoint_path)

    test_metrics = trainer.test(lstm_model, data)
    # Append this experiment's metrics to the results table defined in the
    # previous cell (rows accumulate across re-runs of this cell).
    new_row_df = pd.DataFrame([{
        "encoding": exp[0], "model": exp[1], "path": exp[2], **test_metrics[0]
    }])
    experiments_df = pd.concat([experiments_df, new_row_df], ignore_index=True)

    if use_wandb:
        # Close the run so the next experiment starts a fresh one.
        wandb.finish()
    print("done")

  0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Currently logged in as: [33mcretaceousmart[0m. Use [1m`wandb login --relogin`[0m to force relogin




Track 974 not parsable


100%|██████████| 890/890 [00:01<00:00, 780.77it/s]


ValueError: prefetch_factor option could only be specified in multiprocessing.let num_workers > 0 to enable multiprocessing.

In [4]:
# Display the results table; the training cell adds one row per evaluated experiment.
experiments_df

Unnamed: 0,encoding,model,path,test_p_precision,test_p_recall,test_p_f1,test_under,test_over,test_under_over_f1,train_loss
0,root-interval,fasttext,/app/out/first_run_with_whole_ChocoDataSet.ckpt,0.424819,0.575204,0.488704,0.473333,0.637517,0.543292,1.457962
