In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import mxnet as mx
from mxnet import gluon
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import math

import sys
import os
# Put the notebook's working directory on the import path so that packages
# living next to the notebook can be imported by later cells.
# (os.path.join with a single argument was a no-op, so it is dropped.)
module_path = os.path.abspath('./')
if module_path not in sys.path:
    sys.path.append(module_path)

# Seed the Python and NumPy global random number generators once, up front,
# so that "Restart Kernel -> Run All" gives repeatable results.
# NOTE(review): the pair sampling and DataLoader shuffling further down are
# assumed to draw from these generators - confirm against rank_utils / MXNet.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

### Load example data (source: https://github.com/AmgadMansour/RankNET)

In [3]:
# Read the raw article table and report its size before any reshaping.
data = pd.read_csv('data/example1//article_train_clean.csv', index_col=False)
print(data.shape)

# Give the unnamed leading column and the label column descriptive names, then
# make article_id the row index so rows can be looked up by article id.
data = (
    data
    .rename(columns={'Unnamed: 0': 'article_id', 'shared': 'n_shares'})
    .set_index('article_id')
)

(20005, 60)


In [4]:
# Load the query id of every article: the 'q_id' column of the spreadsheet,
# converted to a plain list with one entry per row of `data`.
q_col = pd.read_excel('data/example1/query_column.xlsx')['q_id'].tolist()
print(len(q_col))

# DataFrame.insert raises ValueError when the column already exists, which
# made this cell fail on a second run. Drop any q_id column left over from a
# previous execution so the cell can be re-run safely.
if 'q_id' in data.columns:
    data = data.drop(columns=['q_id'])
data.insert(1, 'q_id', q_col)

20005


In [5]:
# Spot-check that query ids line up with the articles. Expected layout:
#   query 1 -> articles 1-24
#   query 2 -> articles 25-46 (its last article has n_shares = 1100)
#   query 3 -> articles 47 onwards (its first article has n_shares = 776)
nq = data.q_id.nunique()  # 360 total queries

# Article 46 should therefore be the last article of query 2.
art_id = 46
print(data.loc[art_id, ['n_shares', 'q_id', 'c1']])

n_shares    1100.000000
q_id           2.000000
c1             0.333333
Name: 46, dtype: float64


In [6]:
# Held-out queries (q_id 1..72): keep the label together with q_id in y_test,
# and everything except the label in X_test.
X_test = data.query('1 <= q_id <= 72')
y_test = X_test[['n_shares', 'q_id']]
X_test = X_test.drop(columns=["n_shares"])

# Training queries (q_id 73..288), split the same way.
X_train = data.query('73 <= q_id <= 288')
y_train = X_train[['n_shares', 'q_id']]
X_train = X_train.drop(columns=["n_shares"])

# Training data prep

In [7]:
def _arrays_by_query(features, labels, qids):
    """Build ``{qid: {'X': ..., 'y': ...}}`` for every query id in ``qids``.

    ``features`` and ``labels`` are DataFrames that both carry a ``q_id``
    column. For each query, ``'X'`` is the float32 matrix of that query's
    feature rows (with ``q_id`` removed) and ``'y'`` is the float32 array of
    the ``n_shares`` values of the same query.
    """
    per_query = dict()
    for qid in qids:
        query_features = features[features['q_id'] == qid].drop(["q_id"], axis=1)
        query_labels = labels[labels['q_id'] == qid]['n_shares']
        per_query[qid] = {
            'X': query_features.values.astype('float32'),
            'y': query_labels.values.astype('float32'),
        }
    return per_query


# Query ids in order of first appearance in each split.
train_qid = X_train.q_id.unique()
val_qid = X_test.q_id.unique()

# One entry per query, training queries first and validation queries after.
# The two splits previously used copy-pasted loops; both now go through the
# same helper.
data_lists = dict()
data_lists.update(_arrays_by_query(X_train, y_train, train_qid))
data_lists.update(_arrays_by_query(X_test, y_test, val_qid))

In [8]:
# Number of per-query entries collected (training plus validation queries).
print(len(data_lists))

288


In [9]:
from rank_utils.data import list2pairs, batcify_func
# Value passed to list2pairs as its third argument for every query.
n_samples = 5000

# Concatenate the pairs produced for each training query into one flat list.
train_pairs = [
    pair
    for qid in train_qid
    for pair in list2pairs(data_lists[qid]['X'], data_lists[qid]['y'], n_samples)
]

# Same construction for the validation queries.
val_pairs = [
    pair
    for qid in val_qid
    for pair in list2pairs(data_lists[qid]['X'], data_lists[qid]['y'], n_samples)
]

print('n_train', len(train_pairs))
print('n_val', len(val_pairs))

n_train 1080000
n_val 360000


In [10]:
batch_size = 1024

# Wrap the pair lists as Gluon datasets.
train_ds = gluon.data.SimpleDataset(train_pairs)
val_ds = gluon.data.SimpleDataset(val_pairs)

# Training loader: reshuffled on every pass; an incomplete final batch is
# discarded.
train_iter = gluon.data.DataLoader(
    dataset=train_ds,
    batch_size=batch_size,
    shuffle=True,
    last_batch='discard',
    batchify_fn=batcify_func,
    num_workers=1,
)

# Validation loader: fixed order.
# NOTE(review): last_batch='discard' also drops the trailing validation
# pairs - confirm that is intended wherever val_iter is consumed.
val_iter = gluon.data.DataLoader(
    dataset=val_ds,
    batch_size=batch_size,
    shuffle=False,
    last_batch='discard',
    batchify_fn=batcify_func,
    num_workers=1,
)

# Simple training loop

## Model definition

In [11]:
from rank_utils.models import HParamsMLP, ModelMLP, RankNet
# Hyperparameters for the scoring MLP; mlp_layers is set to two entries of 5.
hp = HParamsMLP(
    mlp_layers=[5, 5],
)

# Build the MLP from the hyperparameters and wrap it in RankNet.
mlp = ModelMLP(hp)
model = RankNet(mlp)

Hyperparameters:
{'mlp_act': 'softrelu',
 'mlp_layers': [5,
                5],
 'output_act': None}


## Model initialization and trainer setup

In [12]:
# Seed MXNet's random number generators so the Uniform initializer below draws
# the same starting weights on every fresh run (previously unseeded, so each
# run started from different weights).
mx.random.seed(42)

# Device the parameters are placed on.
model_ctx = mx.cpu()  # can be mx.gpu(1), mx.gpu(2), etc

# Initialise all parameters with mx.init.Uniform(.01) on that device.
model.collect_params().initialize(mx.init.Uniform(.01), ctx=model_ctx)

In [13]:
# Adam optimiser with a 1e-3 learning rate and no weight decay.
optimizer_name = 'adam'
optimizer_settings = dict(learning_rate=1e-3, wd=0)

# The trainer updates every parameter collected from the model.
trainer = gluon.Trainer(
    params=model.collect_params(),
    optimizer=optimizer_name,
    optimizer_params=optimizer_settings,
)

## Training and evaluation loop

In [14]:
from mxnet import autograd
from tqdm import tqdm_notebook
from rank_utils.evaluations import ndcg
ndcg10 = ndcg(10)
n_epochs = 5


def _ndcg_per_query(qids):
    """Return the list of ``ndcg10`` values, one per query id in ``qids``.

    Each query's feature matrix from ``data_lists`` is scored with
    ``model.scorer`` and compared with the query's true ``n_shares`` values.
    The input array is created on ``model_ctx`` so evaluation still works when
    the model is moved off the default CPU context.
    """
    values = []
    for qid in qids:
        features = mx.nd.array(data_lists[qid]['X'], ctx=model_ctx)
        scores = model.scorer(features).asnumpy()
        values.append(ndcg10(y_true=data_lists[qid]['y'], y_pred=scores))
    return values


for epoch in range(n_epochs):

    # One optimisation pass over the training pairs.
    # NOTE(review): batches are fed to the model as produced by the loader;
    # if model_ctx is changed to a GPU they presumably need moving to that
    # context as well - confirm against batcify_func.
    train_loss = []
    for b in tqdm_notebook(train_iter):
        with autograd.record():
            loss = model(*b)
        loss.backward()
        # Trainer.step normalises the gradient by 1/batch_size.
        trainer.step(batch_size)
        train_loss.append(np.mean(loss.asnumpy()))

    # Ranking quality on the training and validation queries; both splits go
    # through the same helper instead of two copy-pasted loops.
    train_ndcgs = _ndcg_per_query(train_qid)
    test_ndcgs = _ndcg_per_query(val_qid)

    print(
        'epoch {}: loss (train) {:.4f}; ndcg@10 (train): {:.2f}, ndcg@10 (val): {:.2f}'
        .format(epoch, np.mean(train_loss), np.mean(train_ndcgs), np.mean(test_ndcgs)))

HBox(children=(IntProgress(value=0, max=1054), HTML(value='')))


epoch 0: loss (train) 0.8681; ndcg@10 (train): 0.39, ndcg@10 (val): 0.39


HBox(children=(IntProgress(value=0, max=1054), HTML(value='')))


epoch 1: loss (train) 0.8377; ndcg@10 (train): 0.40, ndcg@10 (val): 0.40


HBox(children=(IntProgress(value=0, max=1054), HTML(value='')))


epoch 2: loss (train) 0.8308; ndcg@10 (train): 0.41, ndcg@10 (val): 0.41


HBox(children=(IntProgress(value=0, max=1054), HTML(value='')))


epoch 3: loss (train) 0.8253; ndcg@10 (train): 0.41, ndcg@10 (val): 0.41


HBox(children=(IntProgress(value=0, max=1054), HTML(value='')))


epoch 4: loss (train) 0.8211; ndcg@10 (train): 0.41, ndcg@10 (val): 0.41
