In [None]:
# Project identifier for this notebook; the setup cell copies it into the
# PROJECT_NAME environment variable.
PROJECT_NAME = "spotify_mpc"

In [None]:
# Notebook setup: enable autoreload, make the repository packages importable,
# import the orchestration helpers, and export the project name.

# Reload imported modules before each cell runs so edits to the project code
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# These entries are relative, so they resolve against the kernel's working
# directory. NOTE(review): presumably the notebook is expected to run from a
# folder two levels below the repository root -- confirm.
# This must happen before the project imports below.
sys.path.extend([
    "../../",
    "../../execution",
    "../../orchestration",
])

import os
from orchestration.spotify_mpc import orchestrate as orch
from orchestration.submit import submit_job

# Export the project name through the environment.
# NOTE(review): presumably read by the build scripts and/or job submission
# code; the consumer is not visible in this notebook -- confirm.
os.environ['PROJECT_NAME'] = PROJECT_NAME

In [None]:
# Build Local
os.environ['K8S_ENV'] = 'minikube'
os.environ['DATA_DIR'] = '/'.join(os.getcwd().split("/")[:-2] + ['data'])
! ../../build_scripts/build_local.sh

In [None]:
# Build the standardization job for parts 0-99 of the raw dataset directory,
# then submit it.
all_parts = list(range(100))
standardize_job = orch.standardize_data(
    input_directory="spotify_mpc/raw/spotify_million_playlist_dataset/data",
    output_directory="spotify_mpc/standardized",
    parts=all_parts,
)
submit_job(standardize_job)

In [None]:
# Build Remote
import boto3
os.environ['AWS_ACCOUNT_ID'] = boto3.client("sts").get_caller_identity()["Account"]
os.environ['K8S_ENV'] = 'eks'
os.environ['DATA_DIR'] = 's3://kube-transform-data-bucket'
! ../../build_scripts/build_eks.sh

In [None]:
# Recursively copy the locally standardized data into the S3 bucket under the
# same relative path (spotify_mpc/standardized).
# NOTE(review): presumably needed because the jobs below run with DATA_DIR
# pointing at the bucket -- confirm.
! aws s3 cp ../../data/spotify_mpc/standardized s3://kube-transform-data-bucket/spotify_mpc/standardized --recursive

In [None]:
# Create the data used for feature generation (parts 0-95) and a 5,000-case
# challenge set, taken from part 99, to evaluate ourselves against.
training_parts = list(range(96))
contest_job = orch.create_contest(
    standardized_input_directory="spotify_mpc/standardized",
    train_parts=training_parts,
    track_df_output_path="spotify_mpc/MPC/track_df.parquet",
    train_output_directory="spotify_mpc/MPC/train",
    test_part=99,
    n_test_cases=5000,
    challenge_set_output_path="spotify_mpc/MPC/challenge_set.parquet",
)
submit_job(contest_job)

In [None]:
# Create 30k challenge set playlists to train on:
# one job per part (96, 97, 98), each with 10,000 test cases.
for test_part in (96, 97, 98):
    training_output_path = f"spotify_mpc/MPC/challenge_set_training_{test_part}.parquet"
    training_challenge_job = orch.create_challenge_set(
        standardized_input_directory="spotify_mpc/standardized",
        test_part=test_part,
        track_df_path="spotify_mpc/MPC/track_df.parquet",
        n_test_cases=10000,
        output_path=training_output_path,
    )
    submit_job(training_challenge_job)

In [None]:
# Copy the original challenge_set.json into the S3 bucket under the same
# relative path that the renumber_existing_challenge_set job is given below.
! aws s3 cp ../../data/spotify_mpc/raw/spotify_million_playlist_dataset_challenge/challenge_set.json s3://kube-transform-data-bucket/spotify_mpc/raw/spotify_million_playlist_dataset_challenge/challenge_set.json

In [None]:
# Create a challenge set from the real test data.
# We'll use this to create the file we submit to AIcrowd.
# The job is given the original challenge_set.json and the track_df created
# earlier, and writes challenge_set_real.parquet.
real_challenge_job = orch.renumber_existing_challenge_set(
    challenge_set_json_path="spotify_mpc/raw/spotify_million_playlist_dataset_challenge/challenge_set.json",
    track_df_path="spotify_mpc/MPC/track_df.parquet",
    # Plain string: the original used an f-string prefix with no placeholders.
    output_path="spotify_mpc/MPC/challenge_set_real.parquet",
)
submit_job(real_challenge_job)

In [None]:
# Build the partial co-dict job over the training directory and all five
# challenge sets (evaluation, three training sets, and the real one), then
# submit it.
challenge_set_suffixes = ["", "_training_96", "_training_97", "_training_98", "_real"]
all_challenge_df_paths = [
    f"spotify_mpc/MPC/challenge_set{set_suffix}.parquet"
    for set_suffix in challenge_set_suffixes
]
co_dicts_job = orch.generate_co_dicts(
    train_directory="spotify_mpc/MPC/train",
    challenge_df_paths=all_challenge_df_paths,
    partial_co_dict_output_directory="spotify_mpc/MPC/pco",
)
submit_job(co_dicts_job)


In [None]:
# Reduce the partial co-dicts written to MPC/pco into the MPC/co directory.
reduce_job = orch.reduce_co_partials(
    partial_co_dict_directory="spotify_mpc/MPC/pco",
    co_dict_output_directory="spotify_mpc/MPC/co",
)
submit_job(reduce_job)

In [None]:
# Submit one identify_artist_playlists job per challenge set; the suffix
# selects both the input challenge set and the matching output JSON.
for suffix in ("", "_training_96", "_training_97", "_training_98", "_real"):
    challenge_path = f"spotify_mpc/MPC/challenge_set{suffix}.parquet"
    artist_pids_path = f"spotify_mpc/MPC/artist_pids{suffix}.json"
    artist_job = orch.identify_artist_playlists(
        challenge_df_path=challenge_path,
        track_df_path="spotify_mpc/MPC/track_df.parquet",
        output_path=artist_pids_path,
    )
    submit_job(artist_job)

In [None]:
# Submit one generic-feature job per challenge set. The three suffix-dependent
# paths are computed first; the track_df and co-dict inputs are shared.
for suffix in ("", "_training_96", "_training_97", "_training_98", "_real"):
    challenge_path = f"spotify_mpc/MPC/challenge_set{suffix}.parquet"
    artist_pids_path = f"spotify_mpc/MPC/artist_pids{suffix}.json"
    generic_features_dir = f"spotify_mpc/MPC/generic_features_fcnn{suffix}"
    generic_features_job = orch.generate_generic_features_fcnn_mfe(
        challenge_df_path=challenge_path,
        track_df_path="spotify_mpc/MPC/track_df.parquet",
        artist_playlists_path=artist_pids_path,
        co_dict_directory="spotify_mpc/MPC/co",
        output_directory=generic_features_dir,
    )
    submit_job(generic_features_job)


In [None]:
# Submit one track-feature job per challenge set. Each job is given that set's
# generic-feature directory and writes to the matching samples_fcnn directory.
for suffix in ("", "_training_96", "_training_97", "_training_98", "_real"):
    challenge_path = f"spotify_mpc/MPC/challenge_set{suffix}.parquet"
    generic_features_dir = f"spotify_mpc/MPC/generic_features_fcnn{suffix}"
    samples_dir = f"spotify_mpc/MPC/samples_fcnn{suffix}"
    track_features_job = orch.generate_track_features_fcnn_mfe(
        challenge_df_path=challenge_path,
        track_df_path="spotify_mpc/MPC/track_df.parquet",
        challenge_df_generic_features_directory=generic_features_dir,
        co_dict_directory="spotify_mpc/MPC/co",
        output_directory=samples_dir,
    )
    submit_job(track_features_job)

NOTE: At this point, train your model using the Colab notebook. Then download the resulting submission file and continue below.

You can train on:
* samples_fcnn_training_96
* samples_fcnn_training_97
* samples_fcnn_training_98

Then infer on:
* samples_fcnn

Then evaluate that submission (as you did for the small contest) to see how the model performs.

Then infer on:
* samples_fcnn_real

Finally, export that result in the format expected by AIcrowd.

In [None]:
# Evaluate the submission in MPC/submission_fcnn against the evaluation
# challenge set; results go to MPC/evaluation_fcnn.
evaluation_job = orch.evaluate_submission(
    challenge_df_path="spotify_mpc/MPC/challenge_set.parquet",
    track_df_path="spotify_mpc/MPC/track_df.parquet",
    submission_directory="spotify_mpc/MPC/submission_fcnn",
    output_directory="spotify_mpc/MPC/evaluation_fcnn",
)
submit_job(evaluation_job)

In [None]:
### Create a submission file for AIcrowd ###
# Maps the suggestions in submission_fcnn_real.parquet back to track URIs and
# writes them as a CSV: a "team_info" first line, then one row per playlist
# consisting of the pid followed by its suggested track URIs.

import os

from execution.generic import file_system_util as fs
import pandas as pd

TEAM_NAME = 'KUBE_TRANSFORM'
TEAM_EMAIL = 'KUBE_TRANSFORM@example.com'

output_filename = "ai_crowd_submission.csv"
sub_df = fs.load_data('spotify_mpc/MPC/submission_fcnn_real.parquet')
challenge_df = fs.load_data('spotify_mpc/MPC/challenge_set_real.parquet')
track_df = fs.load_data('spotify_mpc/MPC/track_df.parquet')

# Lookup from track_df index label to track URI.
# NOTE(review): assumes the ids in `suggested` are track_df index labels -- confirm.
track_id_to_uri = track_df.track_uri.to_dict()

# Index by pid (keeping the column) so the assignment below aligns on pid.
# NOTE(review): this relies on sub_df itself being indexed by pid -- confirm.
challenge_df = challenge_df.set_index('pid', drop=False)
challenge_df['suggested'] = sub_df['suggested']

# Index alignment leaves NaN for any pid that is absent from sub_df. Fail here
# with a clear message instead of a TypeError inside the lambda below.
missing_suggestions = challenge_df['suggested'].isna()
if missing_suggestions.any():
    raise ValueError(
        f"{int(missing_suggestions.sum())} challenge playlists have no suggestions "
        "after aligning sub_df on pid; check that sub_df is indexed by pid."
    )

challenge_df['suggested_track_uris'] = challenge_df.suggested.apply(
    lambda tracks: [track_id_to_uri[track] for track in tracks]
)

out = challenge_df[['pid', 'suggested_track_uris']].reset_index(drop=True)

# Expand the list into separate columns; rows with fewer suggestions are
# padded with empty strings.
df_expanded = pd.DataFrame(out['suggested_track_uris'].to_list()).fillna('')
df_expanded.insert(0, 'pid', out['pid'])

# Define the custom first line
custom_line = f"team_info,{TEAM_NAME},{TEAM_EMAIL}"

# Write to CSV.
# newline="" is required when handing a text file object to DataFrame.to_csv;
# without it, universal-newline translation doubles the carriage return on
# Windows. The first line therefore uses os.linesep explicitly so it matches
# the line ending pandas uses by default for the rows.
csv_filename = output_filename
with open(csv_filename, "w", newline="", encoding="utf-8") as f:
    f.write(custom_line + os.linesep)  # Write the custom first line
    df_expanded.to_csv(f, index=False, header=False)