In [None]:
import pandas as pd
import os
from tensorflow.keras import layers, models
import numpy as np

In [None]:
# Run this (or a similar command) if needed to unzip the sample data.
# The archive path below is relative to the current working directory;
# adjust it if the zip file lives somewhere else.
! unzip -o drive/MyDrive/spotify-mpc-samples.zip

In [None]:
# Directories whose parquet samples are used to fit the model.
TRAINING_DIRS = ['samples_fcnn_training']

# Directories whose parquet samples are scored to build the submission.
TEST_DIRS = ['samples_fcnn']

In [None]:
def prepare_data(directories, count_scale=60_000):
    """Load parquet samples from ``directories`` and build model inputs.

    Every entry listed in each directory is read with ``pd.read_parquet`` and
    the resulting frames are concatenated. Normalized ``*_norm`` feature
    columns are then added, and the identifier, label and un-normalized
    columns are dropped to form the feature matrix.

    Args:
        directories: Iterable of directory paths whose entries are parquet
            files.
        count_scale: Divisor applied to the occurrence and popularity
            columns. Defaults to 60_000; use 950_000 for the full contest
            dataset.

    Returns:
        A tuple ``(df, X, y)``: the concatenated frame (including the added
        ``*_norm`` columns), the feature matrix as a float array, and the
        ``is_hidden_track`` labels as a float array.

    Raises:
        ValueError: If the given directories contain no files at all.
    """
    all_paths = []
    for directory in directories:
        # os.listdir returns entries in arbitrary order; sort them so the row
        # order of the result is reproducible between runs.
        all_paths.extend(
            os.path.join(directory, fn) for fn in sorted(os.listdir(directory))
        )
    if not all_paths:
        raise ValueError(f'No sample files found in {list(directories)!r}')

    # Read every collected path, not only those of the last directory.
    df = pd.concat([pd.read_parquet(path) for path in all_paths])

    # (source column, divisor) pairs; the order fixes the order of the
    # appended *_norm columns and therefore of the feature matrix columns.
    normalizations = [
        ('mean_seed_track_occurrences', count_scale),
        ('mean_ngram_occurrences', count_scale),
        ('num_seed_tracks', 100),
        ('num_ngrams', 100),
        ('candidate_track_popularity', count_scale),
        ('candidate_artist_popularity', count_scale),
    ]
    for column, divisor in normalizations:
        df[f'{column}_norm'] = df[column] / divisor

    # Identifiers, the label, and the raw columns replaced by their *_norm
    # counterparts are not fed to the model.
    features = df.drop(columns=[
        'is_hidden_track',
        'candidate_track_id',
        'num_seed_tracks',
        'num_ngrams',
        'pid',
        'challenge_type',
        'last_seed',
        'last_seed_pair',
        'last_seed_triple',
        'mean_seed_track_occurrences',
        'mean_ngram_occurrences',
        'candidate_track_popularity',
        'candidate_artist_popularity',
    ])
    print(features.columns)
    X_train = features.astype(float).values
    y_train = df['is_hidden_track'].astype(float).values
    return df, X_train, y_train

In [None]:
# Define the model class
class FCNNModel(models.Model):
    """Fully connected binary classifier.

    The network stacks two ReLU hidden layers (128 and 64 units) and a single
    sigmoid output unit, so ``call`` returns one value in [0, 1] per input row.
    """

    def __init__(self, input_dim):
        """Create the layers.

        Args:
            input_dim: Number of input features. Accepted so existing callers
                keep working; it is not forwarded to the layers because a
                subclassed model's layers take their input width from the
                first batch they are called on, and passing ``input_dim`` to
                a layer triggers a warning in Keras 3.
        """
        super().__init__()
        # Hidden layers
        self.dense1 = layers.Dense(128, activation='relu')
        self.dense2 = layers.Dense(64, activation='relu')
        # Single sigmoid unit for the binary target
        self.output_layer = layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        """Run the forward pass and return the sigmoid output."""
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)

# Load the training samples as a feature matrix and a label vector
df, X_train, y_train = prepare_data(TRAINING_DIRS)

# The feature count is the second dimension of the feature matrix
input_dim = X_train.shape[1]
model = FCNNModel(input_dim)

# Binary cross-entropy loss with the Adam optimizer, reporting accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs, with 10% of the training data used for validation
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=1024,
    validation_split=0.1,
)

In [None]:
# Load the test samples. NOTE: this rebinds df, X_train and y_train, so from
# this cell on those names hold the TEST data, not the training data.
df, X_train, y_train = prepare_data(TEST_DIRS)

In [None]:
# Score every row of the currently loaded feature matrix with the trained
# model and store the score next to the row it belongs to.
df['prediction'] = model.predict(X_train, batch_size=32 * 1024)

# Make sure both identifier columns are stored as integers.
df['pid'] = df['pid'].astype(int)
df['candidate_track_id'] = df['candidate_track_id'].astype(int)

In [None]:
def get_suggestions(group):
    """Return the ids of the highest-scoring candidates in ``group``.

    Args:
        group: DataFrame with ``prediction`` and ``candidate_track_id``
            columns.

    Returns:
        Array of ``candidate_track_id`` values ordered by descending
        ``prediction``, holding at most 500 entries.
    """
    ranked = group.sort_values('prediction', ascending=False)
    top_candidates = ranked[:500]
    return top_candidates['candidate_track_id'].values

# Group by 'pid' and collect the top suggestions of each group.
# Only the two columns get_suggestions reads are selected after the groupby:
# applying over the whole frame would also operate on the grouping column,
# which pandas >= 2.2 reports with a DeprecationWarning.
result = (
    df.groupby('pid')[['prediction', 'candidate_track_id']]
    .apply(get_suggestions)
)

# One row per pid, with the array of suggested track ids in 'suggested'.
suggested = result.to_frame(name='suggested')
suggested.index.name = None

In [None]:
# Name of the submission file; the path is relative, so the file is written
# to the current working directory.
fn = 'submission_MPC_fcnn.parquet'
# Persist the per-pid suggestions frame as a parquet file.
suggested.to_parquet(fn)

# Once this completes, download the submission file