# Training Model with Padded Investors

This notebook was run on Colab in order to fit all the data in RAM. This dataset includes all the investor data as well as the synthetic investors that were added to normalise the distribution of investor counts.

The validation set was pulled from the clean, non-padded dataset so that we don't pollute our metrics with synthetic data.

In [1]:
import json
import re
import random
import matplotlib.pyplot as plt
import lightgbm as lgb
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime
from collections import defaultdict
from sqlalchemy import select, func
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
from google.colab import drive

# Mount Google Drive; the database, training log and saved model all live under /content/drive.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Paths to files in Drive
db_path = "/content/drive/MyDrive/Colab Notebooks/database.db"
# NOTE(review): the two ID files are read from /content rather than the mounted Drive —
# presumably they are uploaded for each session; confirm.
train_ids_path = "/content/train_ids.csv"
val_ids_path = "/content/validation_ids.csv"

Pull whole dataset from SQL db into pandas frame.

In [4]:
# Load the whole feature matrix into memory (db_path may also point at a local copy).
# The connection is closed as soon as the query finishes, even if it fails, because
# no later cell reuses this handle.
conn = sqlite3.connect(db_path)
try:
    full_df = pd.read_sql_query("SELECT * FROM feature_matrix", conn)
finally:
    conn.close()

Saved the training and validation IDs in a CSV so we can split the data set.

In [6]:
# Read the ID lists for the split: drop blank cells, cast to int, and materialise
# as plain Python lists. val_ids is matched against clean_row_id when splitting.
train_ids = pd.read_csv(train_ids_path)["train_ids"].dropna().astype(int).tolist()
val_ids = pd.read_csv(val_ids_path)["validation_ids"].dropna().astype(int).tolist()

The validation dataset is pulled from the full set; the remaining rows form the training set. This ensures that we don't train on the validation set.

In [7]:
# Split by clean_row_id: rows whose id is in the validation list become val_df,
# everything else stays in full_df as the training set.
val_ids_set = set(val_ids)
is_val_row = full_df["clean_row_id"].isin(val_ids_set)

# full_df is overwritten below, so running this cell a second time would find no
# validation rows and silently produce an empty val_df. Fail loudly instead.
if not is_val_row.any():
    raise ValueError(
        "No validation rows found in full_df. If this cell was already run, "
        "reload full_df from the database before splitting again."
    )

# .copy() makes val_df an independent frame, so writing the "score" column to it
# later does not raise pandas' SettingWithCopyWarning.
val_df = full_df[is_val_row].copy()
full_df = full_df[~is_val_row]

This was pulled from the initial model development notebook.

In [8]:
def compute_ranking_metrics(df, k=3):
    """Compute Accuracy@1, Recall@k and MRR for scored ranking groups.

    Rows are grouped by ``clean_row_id`` and ranked inside each group by
    descending ``score``. Ties keep their original row order (stable sort).
    Everything is computed from a single sort plus ``cumcount`` instead of
    ``groupby.apply``, which avoids the pandas warnings about operating on
    grouping columns and does not depend on ``apply`` keeping the group key.

    Args:
        df: DataFrame with ``clean_row_id``, ``score`` and ``label`` columns.
            ``label == 1`` marks a correct candidate. The frame is not modified.
        k: Cut-off used for the recall metric.

    Returns:
        Tuple ``(acc1, recall_k, mrr)``:
          * ``acc1``     - share of groups whose top-scored row has ``label == 1``.
          * ``recall_k`` - mean over groups of the highest label among the top-k rows.
          * ``mrr``      - mean over groups of 1 / rank of the first ``label == 1``
                           row; a group with no such row contributes 0.
        All three are NaN when ``df`` contains no groups.
    """
    ranked = (
        df.dropna(subset=["clean_row_id"])  # groupby would ignore these rows anyway
        .sort_values("score", ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )
    group_ids = ranked["clean_row_id"]
    n_groups = group_ids.nunique()
    if n_groups == 0:
        nan = float("nan")
        return nan, nan, nan

    # 1-based position of every row inside its own group, best score first.
    rank = ranked.groupby("clean_row_id", sort=False).cumcount() + 1

    # Accuracy@1
    acc1 = (ranked.loc[rank == 1, "label"] == 1).mean()

    # Recall@k
    recall_k = ranked[rank <= k].groupby("clean_row_id")["label"].max().mean()

    # MRR: groups without any hit are absent from first_hit_rank and so add 0 to the sum.
    is_hit = ranked["label"] == 1
    first_hit_rank = rank[is_hit].groupby(group_ids[is_hit]).min()
    mrr = (1.0 / first_hit_rank).sum() / n_groups

    return acc1, recall_k, mrr

This is so we can save logs to a file in case Colab logs us out and we lose our printouts.

In [9]:
# Paths
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_path = f"/content/drive/MyDrive/lgb_log_{timestamp}.txt"
model_path = f"/content/drive/MyDrive/lgb_model_{timestamp}.txt"

# Logger
log_file = open(log_path, "w")


def log_callback(period=10):
    """Create a training callback that mirrors evaluation results to ``log_file``.

    The returned callable expects an object exposing ``iteration``,
    ``end_iteration`` and ``evaluation_result_list`` (4-tuples of data name,
    metric name, value and one ignored field). On every ``period``-th iteration
    (by 0-based index) and on the final iteration it prints one summary line,
    writes the same line to the module-level ``log_file`` and flushes it, so
    the log survives a dropped session.
    """

    def _callback(env):
        due = env.iteration % period == 0 or env.iteration + 1 == env.end_iteration
        if not due:
            return

        pieces = [
            f"{data_name} {eval_name}: {result_val:.5f}  "
            for data_name, eval_name, result_val, _ in env.evaluation_result_list
        ]
        line = f"[{env.iteration}] " + "".join(pieces)

        print(line)
        log_file.write(line + "\n")
        log_file.flush()

    return _callback

From the model development notebook, changed slightly to fit our dataframes.

In [12]:
# Identifier and target columns that must never be passed to the model as features.
_NON_FEATURE_COLUMNS = ["label", "clean_row_id", "investor", "firm", "template_id"]


def _build_ranking_dataset(frame):
    """Build a LightGBM ranking Dataset from a feature-matrix frame.

    LightGBM's ``group`` argument is a list of consecutive block sizes, so the
    rows of each ``clean_row_id`` must be contiguous and the sizes must be
    listed in row order. A plain ``groupby(...).size()`` lists sizes in sorted
    key order, which only matches the rows if the frame is itself sorted. The
    frame is therefore stably sorted by ``clean_row_id`` when it is not already
    (skipping the sort avoids an extra copy of a large frame), and group sizes
    are taken in that same order.

    Returns:
        Tuple ``(ordered_frame, features, dataset)``; ``ordered_frame`` is
        ``frame`` itself when no sort was needed.
    """
    if not frame["clean_row_id"].is_monotonic_increasing:
        frame = frame.sort_values("clean_row_id", kind="mergesort")
    features = frame.drop(columns=_NON_FEATURE_COLUMNS)
    group_sizes = frame.groupby("clean_row_id", sort=False).size().tolist()
    dataset = lgb.Dataset(
        features, label=frame["label"], group=group_sizes, free_raw_data=False
    )
    return frame, features, dataset


def train_model(
    train_ids,
    val_ids,
    parameters: dict,
    n_rounds: int = 100,
    lr_decay_gamma: float = 0.95,
):
    """Train a LightGBM ranker on ``full_df`` and evaluate it on ``val_df``.

    The data comes from the module-level frames ``full_df`` (training) and
    ``val_df`` (validation), not from the ID arguments.

    Args:
        train_ids: Unused; kept so existing calls keep working.
        val_ids: Unused; kept so existing calls keep working.
        parameters: LightGBM parameter dict. Must contain ``"learning_rate"``,
            which is the starting value of the decaying schedule.
        n_rounds: Maximum number of boosting rounds.
        lr_decay_gamma: Per-round multiplicative learning-rate decay.

    Returns:
        The trained ``lgb.Booster``.

    Raises:
        KeyError: If ``parameters`` has no ``"learning_rate"`` entry.

    Side effects:
        * Writes the validation predictions into ``val_df["score"]`` (later
          cells read that column).
        * Prints Accuracy@1, Recall@3 and MRR for the validation set.
        * ``log_callback`` appends progress lines to the module-level ``log_file``.
    """
    # Validation and training sets, each ordered so rows and group sizes line up.
    val_ordered, X_val, lgb_val = _build_ranking_dataset(val_df)
    _, _, lgb_train = _build_ranking_dataset(full_df)

    # Learning rate schedule
    base_lr = parameters["learning_rate"]

    def lr_decay(current_round):
        return base_lr * (lr_decay_gamma**current_round)

    # Train model
    model = lgb.train(
        params=parameters,
        train_set=lgb_train,
        num_boost_round=n_rounds,
        valid_sets=[lgb_train, lgb_val],
        valid_names=["train", "val"],
        callbacks=[
            lgb.reset_parameter(learning_rate=lr_decay),
            lgb.early_stopping(stopping_rounds=20),
            lgb.log_evaluation(period=1),
            log_callback(10),
        ],
    )

    # Predict and Evaluate
    preds = model.predict(X_val, num_iteration=model.best_iteration)
    # preds follow the row order of val_ordered; assigning through the index puts
    # each score back on its own row (assumes val_df's index is unique, as it is
    # for a filtered read_sql_query result).
    val_df["score"] = pd.Series(preds, index=val_ordered.index)

    acc1, recall3, mrr = compute_ranking_metrics(val_df, k=3)

    print("\nEvaluation Metrics (Validation Set):")
    print(f"Accuracy@1 : {acc1:.4f}")
    print(f"Recall@3   : {recall3:.4f}")
    print(f"MRR        : {mrr:.4f}")

    return model

In [10]:
# LightGBM parameters for the ranking model passed to train_model.
params = {
    "objective": "lambdarank",
    "metric": ["ndcg"],
    "eval_at": [1, 3],  # NDCG cut-offs; these give the ndcg@1 / ndcg@3 columns in the training log
    "label_gain": [0, 1],  # gain per integer label value — assumes labels are only 0 or 1; confirm
    "learning_rate": 0.1,  # starting rate; train_model decays it every round by lr_decay_gamma
    "num_leaves": 31,
    "max_depth": 6,
    "min_split_gain": 1e-3,
    "min_child_weight": 1e-2,
    "max_delta_step": 1.0,
    # NOTE(review): LightGBM documents scale_pos_weight as used only by the binary and
    # multiclassova objectives, so it is probably ignored under lambdarank — confirm before tuning it.
    "scale_pos_weight": 294,
    "verbosity": -1,
    "boosting_type": "gbdt",
    "force_row_wise": True,
}

In [13]:
# Retrain
model = train_model(train_ids, val_ids, params, n_rounds=100)
log_file.close()
model.save_model(model_path)

[0] train ndcg@1: 0.88226  train ndcg@3: 0.94278  val ndcg@1: 0.71994  val ndcg@3: 0.84847  
[1]	train's ndcg@1: 0.882255	train's ndcg@3: 0.942779	val's ndcg@1: 0.719942	val's ndcg@3: 0.848469
Training until validation scores don't improve for 20 rounds
[2]	train's ndcg@1: 0.888038	train's ndcg@3: 0.946874	val's ndcg@1: 0.739981	val's ndcg@3: 0.861974
[3]	train's ndcg@1: 0.895414	train's ndcg@3: 0.955466	val's ndcg@1: 0.766191	val's ndcg@3: 0.892616
[4]	train's ndcg@1: 0.895566	train's ndcg@3: 0.955594	val's ndcg@1: 0.767092	val's ndcg@3: 0.893209
[5]	train's ndcg@1: 0.906558	train's ndcg@3: 0.961495	val's ndcg@1: 0.803148	val's ndcg@3: 0.913123
[6]	train's ndcg@1: 0.906558	train's ndcg@3: 0.961495	val's ndcg@1: 0.803148	val's ndcg@3: 0.913123
[7]	train's ndcg@1: 0.906558	train's ndcg@3: 0.961468	val's ndcg@1: 0.803148	val's ndcg@3: 0.913149
[8]	train's ndcg@1: 0.914675	train's ndcg@3: 0.965774	val's ndcg@1: 0.832201	val's ndcg@3: 0.929222
[9]	train's ndcg@1: 0.915017	train's ndcg@3: 0

  top1 = grouped.apply(lambda g: g.loc[g["score"].idxmax()]).reset_index(drop=True)
  topk = grouped.apply(lambda g: g.nlargest(k, "score")).reset_index(drop=True)


\Evaluation Metrics (Validation Set):
Accuracy@1 : 0.8500
Recall@3   : 0.9940
MRR        : 0.9206


  mrr = grouped.apply(reciprocal_rank).mean()


<lightgbm.basic.Booster at 0x7b50c17edbd0>

This is a drastic improvement.

NDCG@1: 0.84995 - the template ranked in the top spot is the correct one almost 85% of the time.

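NDCG@3: 0.93865 - the position-discounted ranking quality over the top 3 is almost 94%. Unlike Recall@3, this is not a hit rate: a correct template ranked 2nd or 3rd earns less credit than one ranked 1st.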

Accuracy@1: 0.8500, Recall@3: 0.9940 - these tell the same story: in 99.4% of validation groups our top three ranked predictions contain the correct template.

Next thing to do is pull the incorrectly predicted ids and try to identify any trends in them.


In [14]:
def get_failed_topk_ids(df, k=3):
    """Return the ``clean_row_id`` of every group with no hit in its top-k rows.

    Rows are ranked inside each ``clean_row_id`` group by descending ``score``
    (ties keep their original order). A group "fails" when the highest
    ``label`` among its top-k rows is 0. Uses a stable sort plus
    ``groupby.head`` instead of ``groupby.apply``, which avoids the pandas
    warning about operating on grouping columns and does not depend on
    ``apply`` keeping the group key.

    Args:
        df: DataFrame with ``clean_row_id``, ``score`` and ``label`` columns.
            The frame is not modified.
        k: Number of top-scored rows inspected per group.

    Returns:
        List of failing ``clean_row_id`` values in ascending order.
    """
    ranked = df.sort_values("score", ascending=False, kind="mergesort")

    # Top-k rows of every group, still carrying their clean_row_id column.
    topk = ranked.groupby("clean_row_id", sort=False).head(k)

    # Highest label among each group's top-k rows: 0 means the group missed.
    best_label = topk.groupby("clean_row_id")["label"].max()
    return best_label[best_label == 0].index.tolist()

In [15]:
# Relies on val_df already carrying the "score" column that train_model writes,
# so the training cell must have been run first.
failed_ids = get_failed_topk_ids(val_df, k=3)
print(
    f"{len(failed_ids)} validation groups failed to rank the correct template in top {3}."
)

87 validation groups failed to rank the correct template in top 3.


Time to save the failed IDs so that I can analyse them offline.

In [16]:
# NOTE(review): relative path, so the file lands in the current working directory rather
# than on Drive like the log and model — confirm it is downloaded before the session ends.
pd.DataFrame({"clean_row_id": failed_ids}).to_csv("failed_top3_ids.csv", index=False)