In [1]:
import wandb
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import f1_score
from wandb.lightgbm import wandb_callback, log_summary



In [2]:
# Authenticate this kernel session with Weights & Biases; the training
# function defined in this notebook opens a run with wandb.init().
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33md-a-pop[0m ([33mmidigpt[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

## Hyperparameter Tuning

In [3]:
def parse_data(n=None):
    """Load the training and validation splits and prepare them for training.

    Both "train.csv" and "val.csv" are read from the current working
    directory and receive identical preprocessing: the identifier columns
    are cast to pandas ``category`` dtype and the free-text ``review_text``
    column is dropped.

    Parameters
    ----------
    n : int or None, optional
        Maximum number of rows to read from each file (forwarded to
        ``pd.read_csv(nrows=...)``). ``None`` reads every row.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The prepared ``(train, val)`` frames.
    """
    id_columns = ["user_id", "review_id", "book_id"]

    def _load_split(path):
        # Read one split and apply the shared preprocessing, so the two
        # splits can never drift apart.
        frame = pd.read_csv(path, nrows=n)
        frame = frame.astype({column: "category" for column in id_columns})
        return frame.drop("review_text", axis=1)

    return _load_split("train.csv"), _load_split("val.csv")

In [4]:
def evaluate_macroF1_lgb(truth, predictions):
    """Custom LightGBM evaluation metric: macro-averaged F1 for multi-class.

    Parameters
    ----------
    truth : array-like of shape (n_samples,)
        True class labels of the evaluation set.
    predictions : numpy array
        Per-class scores. Either a flat 1-D array grouped by class first and
        by row second (score of row i for class j at ``j * n_samples + i``),
        or a 2-D array of shape ``(n_samples, n_classes)``.

    Returns
    -------
    tuple
        ``('macroF1', value, True)``; the trailing ``True`` marks the metric
        as higher-is-better.
    """
    # this follows the discussion in https://github.com/Microsoft/LightGBM/issues/1483
    scores = np.asarray(predictions)
    if scores.ndim == 2:
        # Already laid out as one row of class scores per sample.
        pred_labels = scores.argmax(axis=1)
    else:
        # Derive the layout from the number of samples rather than from the
        # number of distinct labels in `truth`: an evaluation split may lack
        # a class the model was trained on, which would make a reshape based
        # on np.unique(truth) fail or yield the wrong number of predictions.
        pred_labels = scores.reshape(-1, len(truth)).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)

In [5]:
def train(config):
    """Train one LightGBM classifier and track the run in Weights & Biases.

    Parameters
    ----------
    config : mapping
        Must provide the keys ``boosting_type``, ``num_leaves``,
        ``max_depth``, ``learning_rate``, ``n_estimators``,
        ``min_child_samples``, ``subsample``, ``colsample_bytree``,
        ``random_state``, ``reg_alpha`` and ``reg_lambda`` (forwarded to
        ``lgb.LGBMClassifier``), plus ``log_preds``; when ``log_preds`` is
        truthy the validation predictions are uploaded as a wandb table.

    Returns
    -------
    None
        Results are reported through the wandb run only.
    """
    # perform one training iteration
    train_df, val_df = parse_data(100)
    train_features, train_target = train_df.drop("rating", axis=1), train_df["rating"]
    val_features, val_target = val_df.drop("rating", axis=1), val_df["rating"]

    model = lgb.LGBMClassifier(
        boosting_type=config["boosting_type"],
        num_leaves=config["num_leaves"],
        max_depth=config["max_depth"],
        learning_rate=config["learning_rate"],
        n_estimators=config["n_estimators"],
        min_child_samples=config["min_child_samples"],
        subsample=config["subsample"],
        colsample_bytree=config["colsample_bytree"],
        random_state=config["random_state"],
        reg_alpha=config["reg_alpha"],
        reg_lambda=config["reg_lambda"],
    )

    # Log the fully resolved parameter set (including library defaults), not
    # just the keys that were passed in.
    train_config = model.get_params()
    print("TRAIN CONFIG")
    print(train_config)

    # The context manager closes the run even if fitting or logging raises,
    # so a failed attempt does not leave a dangling active run behind.
    with wandb.init(project="Goodreads Books Reviews", entity="d-a-pop",
                    job_type="training", config=train_config) as run:
        # NOTE(review): review_id is presumably unique per row; treating it
        # as a categorical feature may only encourage memorisation - confirm.
        gbm = model.fit(
            train_features,
            train_target,
            callbacks=[wandb_callback()],
            categorical_feature=["user_id", "book_id", "review_id"],
            eval_metric=evaluate_macroF1_lgb,
            eval_set=[(train_features, train_target), (val_features, val_target)],
            eval_names=["training", "validation"],
        )

        log_summary(gbm.booster_)

        if config["log_preds"]:
            # model.predict returns the original rating labels; an argmax over
            # predict_proba would only give the column index, which differs
            # from the label whenever the ratings are not exactly 0..k-1.
            # .assign builds a new frame instead of writing into a slice of
            # val_df (avoids pandas' SettingWithCopyWarning).
            predictions = val_df[["review_id", "rating"]].assign(
                pred=model.predict(val_features)
            )
            run.log({"pred_table": wandb.Table(dataframe=predictions)})
    


In [6]:
# Baseline settings for a single training run. "log_preds" controls whether
# the validation predictions are uploaded as a wandb table; every other
# entry is passed to lgb.LGBMClassifier by train().
default_config = dict(
    log_preds=False,
    boosting_type="gbdt",
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    min_child_samples=20,
    subsample=1.0,
    colsample_bytree=1.0,
    random_state=42,
    reg_alpha=0,
    reg_lambda=0,
)

In [7]:
# Run a single training job with the baseline settings in default_config;
# per-iteration metrics are printed below and logged to the wandb run.
train(default_config)

[34m[1mwandb[0m: Currently logged in as: [33md-a-pop[0m. Use [1m`wandb login --relogin`[0m to force relogin


TRAIN CONFIG
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 0, 'silent': True, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}


New categorical_feature is ['book_id', 'review_id', 'user_id']


[1]	training's multi_logloss: 1.46046	training's macroF1: 0.131291	validation's multi_logloss: 1.45455	validation's macroF1: 0.114435
[2]	training's multi_logloss: 1.40053	training's macroF1: 0.219698	validation's multi_logloss: 1.4424	validation's macroF1: 0.139367
[3]	training's multi_logloss: 1.35607	training's macroF1: 0.255532	validation's multi_logloss: 1.4389	validation's macroF1: 0.150138
[4]	training's multi_logloss: 1.31125	training's macroF1: 0.301984	validation's multi_logloss: 1.43731	validation's macroF1: 0.182005
[5]	training's multi_logloss: 1.27333	training's macroF1: 0.300347	validation's multi_logloss: 1.4401	validation's macroF1: 0.184669
[6]	training's multi_logloss: 1.23517	training's macroF1: 0.30904	validation's multi_logloss: 1.44159	validation's macroF1: 0.190694
[7]	training's multi_logloss: 1.20381	training's macroF1: 0.3264	validation's multi_logloss: 1.44237	validation's macroF1: 0.183278
[8]	training's multi_logloss: 1.16806	training's macroF1: 0.342346	v

[77]	training's multi_logloss: 0.238007	training's macroF1: 1	validation's multi_logloss: 1.96966	validation's macroF1: 0.190264
[78]	training's multi_logloss: 0.231922	training's macroF1: 1	validation's multi_logloss: 1.97782	validation's macroF1: 0.19055
[79]	training's multi_logloss: 0.227993	training's macroF1: 1	validation's multi_logloss: 1.99091	validation's macroF1: 0.190446
[80]	training's multi_logloss: 0.223555	training's macroF1: 1	validation's multi_logloss: 1.99308	validation's macroF1: 0.191009
[81]	training's multi_logloss: 0.218513	training's macroF1: 1	validation's multi_logloss: 1.99851	validation's macroF1: 0.190757
[82]	training's multi_logloss: 0.214624	training's macroF1: 1	validation's multi_logloss: 2.00162	validation's macroF1: 0.190757
[83]	training's multi_logloss: 0.210244	training's macroF1: 1	validation's multi_logloss: 2.01028	validation's macroF1: 0.190757
[84]	training's multi_logloss: 0.206373	training's macroF1: 1	validation's multi_logloss: 2.0146	v

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_macroF1,▁▂▂▃▃▃▄▅▅▅▆▇▇▇▇█████████████████████████
training_multi_logloss,█▇▇▆▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
validation_macroF1,▁▃▅▅██▇▇▇▇▆▆▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
validation_multi_logloss,▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██

0,1
best_iteration,0.0
iteration,99.0
training_macroF1,1.0
validation_macroF1,0.18651
