In [16]:
# Install the third-party packages this notebook imports into the kernel's environment.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
%pip install xgboost scikit-learn gensim numpy pandas

Note: you may need to restart the kernel to use updated packages.


In [None]:
import gensim.downloader
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

# Number of SVD components kept per genre; one `genre_svd_<i>` feature column each.
# NOTE(review): the saved output of the data-loading cell lists genre_svd_0..genre_svd_3,
# i.e. it was produced with a value of 4 -- confirm the intended value and re-run.
SVD_EMBEDDING_DIM = 1
# Name of the pre-trained model requested from gensim's downloader.
MODEL_NAME = "glove-wiki-gigaword-50"
# Cache of genre -> reduced embedding vector; filled on the first prepare_X() call.
EMBEDDING_MAP = None

# Load pre-trained word vectors. Any load error propagates as-is; the previous
# `try/except Exception as e: raise e` wrapper only re-raised and added nothing.
wv = gensim.downloader.load(MODEL_NAME)


def embed_genres(data: pd.DataFrame, random_state: int = 42) -> dict[str, np.ndarray]:
    """Build a reduced word-vector embedding for every distinct genre in ``data``.

    Each non-null ``track_genre`` value is lower-cased and split on whitespace.
    The vectors of the tokens present in ``wv`` are averaged (an all-zero vector
    is used when none of the tokens is known), and the averaged vectors are then
    reduced to ``SVD_EMBEDDING_DIM`` components with ``TruncatedSVD``.

    Args:
        data: Frame containing a ``track_genre`` column.
        random_state: Seed handed to ``TruncatedSVD`` so that the reduced
            vectors are reproducible from one run to the next.

    Returns:
        Mapping from each unique genre value to its reduced vector
        (a 1-D array of length ``SVD_EMBEDDING_DIM``).
    """
    unique_genres = data["track_genre"].dropna().unique()
    tokenized = pd.Series(unique_genres).str.lower().str.split().to_list()

    def avg_vector(tokens):
        # Only tokens known to the word-vector model contribute to the mean.
        vectors = [wv[t] for t in tokens if t in wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(wv.vector_size)

    vectors = np.vstack([avg_vector(tokens) for tokens in tokenized])

    # Seed the solver: without an explicit random_state the decomposition draws
    # from the global NumPy RNG and the resulting features can differ per run.
    svd = TruncatedSVD(n_components=SVD_EMBEDDING_DIM, random_state=random_state)
    svd_vectors = svd.fit_transform(vectors)

    return {genre: svd_vectors[i] for i, genre in enumerate(unique_genres)}


def prepare_X(data: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``track_genre`` column with numeric ``genre_svd_<i>`` columns.

    The genre embedding is built by ``embed_genres`` from the frame passed to the
    first call and cached in the module-level ``EMBEDDING_MAP``; later calls
    reuse that cached mapping.

    Rows whose genre is missing or absent from the cached mapping (for example a
    label that only occurs in a later frame) receive NaN in every embedding
    column instead of breaking the frame construction.

    Args:
        data: Frame containing a ``track_genre`` column. It is not modified.

    Returns:
        A new frame with ``track_genre`` dropped and ``SVD_EMBEDDING_DIM``
        embedding columns appended, indexed like ``data``.
    """
    global EMBEDDING_MAP  # noqa: PLW0603

    if EMBEDDING_MAP is None:
        EMBEDDING_MAP = embed_genres(data)

    col_names = [f"genre_svd_{i}" for i in range(SVD_EMBEDDING_DIM)]

    # Placeholder row for genres without an embedding. Series.map would yield a
    # scalar NaN for those, which cannot be mixed with per-row arrays.
    missing = np.full(SVD_EMBEDDING_DIM, np.nan)
    rows = [EMBEDDING_MAP.get(genre, missing) for genre in data["track_genre"]]
    # np.vstack rejects an empty list, so an empty frame gets an explicit (0, dim) array.
    matrix = np.vstack(rows) if rows else np.empty((0, SVD_EMBEDDING_DIM))

    vec_df = pd.DataFrame(matrix, columns=col_names, index=data.index)

    return pd.concat([data.drop(columns=["track_genre"]), vec_df], axis=1)

In [148]:
# Read the training set and order it from most to least popular
data = pd.read_csv("data/train_data.csv")
data = data.sort_values("popularity", ascending=False)

# Features: every prepared column except the target. Target: raw popularity values.
X = prepare_X(data).drop(columns=["popularity"])
Y = data["popularity"].values

X.head()

Unnamed: 0,row_id,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre_svd_0,genre_svd_1,genre_svd_2,genre_svd_3
76986,20001,156943,False,0.714,0.472,2,-7.375,1,0.0864,0.013,5e-06,0.266,0.238,131.121,4,4.543162,1.343973,-1.038152,-0.481384
43208,81051,156943,False,0.714,0.472,2,-7.375,1,0.0864,0.013,5e-06,0.266,0.238,131.121,4,4.761148,1.123931,-0.792162,-0.516733
82897,51664,198937,False,0.621,0.782,2,-5.548,1,0.044,0.0125,0.033,0.23,0.55,128.033,4,4.671812,-0.418623,0.189786,0.291518
49138,89411,162637,False,0.835,0.679,7,-5.329,0,0.0364,0.583,2e-06,0.218,0.85,124.98,4,3.887619,-1.421323,0.190487,-1.944378
14902,20008,175238,True,0.561,0.965,7,-3.673,0,0.0343,0.00383,7e-06,0.371,0.304,128.04,4,4.543162,1.343973,-1.038152,-0.481384


In [150]:
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit a gradient-boosted regressor on the training portion
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=10000,
    n_jobs=-1,
)
model.fit(X_train, Y_train)

# Score the held-out rows
Y_pred = model.predict(X_test)
print("R^2:", r2_score(Y_test, Y_pred))

R^2: 0.5097866058349609


In [140]:
# Retrain the same estimator on the complete training set (X, Y), i.e. including
# the rows that were held out for the R^2 check. The call is the cell's last
# expression, so the fitted estimator is what the cell displays.
model.fit(X, Y)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [141]:
import datetime
from pathlib import Path

# Load test data
data_test = pd.read_csv("data/test_data.csv")

# Prepare test data (reuses the genre embedding cached by the earlier prepare_X call)
X_test = prepare_X(data_test)

# Make predictions
Y_pred = model.predict(X_test)

# Collect predictions in a frame indexed by row_id
Y_pred_df = pd.DataFrame(Y_pred, columns=["popularity"])
Y_pred_df.index = data_test["row_id"]

# Save CSV under a timestamped name so earlier prediction files are not overwritten
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = "pred/pred_" + timestamp + ".csv"
# to_csv does not create missing parent directories; make sure "pred/" exists first.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
Y_pred_df.to_csv(filename, index_label="row_id")
