# Deep NN Implementation 

In [55]:
%reload_ext autoreload
import sys
import os
sys.path.append(os.path.abspath("../../ucl_irdm2017_project2_group1"))

from ltr.data_load import make_rank_data_csv
import ltr.utils
import ltr.evals 

import pandas as pd
import numpy as np

from itertools import combinations
# from collections import Counter
import collections 
import tensorflow as tf 
import time
from sklearn import datasets
from sklearn.model_selection import train_test_split

### Data Preprocessing

* Load in the data and write to csv's
* Take in the features and normalise, so that $x_i \in [-1,1] $
* Remove outlier querys with very high or low associated documents

In [5]:
# Specify the fold from the MSLR-10K dataset you wish to import 
fpath = '../../input/'
fold_no = 1
dataset = ['train', 'vali', 'test']

In [None]:
# Load in dataset and get pandas DataFrames
train = make_rank_data_csv(fpath, fold_no, 'train')
vali = make_rank_data_csv(fpath, fold_no, 'vali')
test = make_rank_data_csv(fpath, fold_no, 'test')

In [None]:
# Make a dataset of all data for normalisation and reset indices
full_data = pd.concat([train,vali,test])
full_data.index = range(full_data.shape[0])

In [None]:
# Get list of unique query ids 
unique_qry = full_data["query_id"].unique()

In [None]:
# Find stats about features in order to normalise
mean_params = []
max_params = []
min_params = []

for q_id in unique_qry:
    query = full_data[full_data['query_id'] == str(q_id)].drop(['label', 'query_id'], axis=1).astype(float)
    average = list(query.mean())
    max_values = list(query.max())
    min_values = list(query.min())
    mean_params.append([q_id] + average)
    max_params.append([q_id] + max_values)
    min_params.append([q_id] + min_values)

In [None]:
mean_cleaned = pd.DataFrame(mean_params)
mean_cleaned.columns = ["query_id"] + ["mean_" + col for col in full_data.columns[2:]]

max_cleaned = pd.DataFrame(max_params)
max_cleaned.columns = ["query_id"] + ["max_" + col for col in full_data.columns[2:]]

min_cleaned = pd.DataFrame(min_params)
min_cleaned.columns = ["query_id"] + ["min_" + col for col in full_data.columns[2:]]

### Normalising the features by query parititons 

In [None]:
norm_features = []
a = full_data.copy()

for index,row in enumerate(a.iterrows()):
    # Getting Query ID
    q_id = row[1]['query_id']
    
    # Normalisation formula: 2*(x - min)/(max - min) - 1
    x = np.array(row[1][2:].astype(float))
    min_val = np.array(min_cleaned[min_cleaned["query_id"] == str(q_id)].drop(["query_id"],axis=1).astype(float))
    max_val = np.array(max_cleaned[max_cleaned["query_id"] == str(q_id)].drop(["query_id"],axis=1).astype(float))
    
    norm_row = (2*(x - min_val)/(max_val - min_val)) - 1

    # Nans indicate division by zero, which means max == min, so setting to zero, Naive fix
    norm_row[np.isnan(norm_row)] = 0.0
    norm_features.append([q_id] + list(norm_row[0]))
    if index%10000 == 0:
        print(index)

### Create pandas dataframe from normalised data, adding label column

In [None]:
cols = list(full_data.columns)
cols.remove('label')

In [None]:
norm_cleaned = pd.DataFrame(norm_features, columns=cols)
norm_cleaned.insert(0,'label', full_data['label'])

### Splitting data back into train / validation / test sets 

In [None]:
clean_train = norm_cleaned.iloc[0:train.shape[0]+1]
clean_val = norm_cleaned.iloc[train.shape[0]+1:train.shape[0]+vali.shape[0]+1]
clean_test = norm_cleaned.iloc[train.shape[0]+vali.shape[0]+1:]

### Getting Filter Keys

In [None]:
clean_train_val = norm_cleaned.iloc[0:(train.shape[0]+vali.shape[0]+1)]
qid_counts = dict(Counter(list(clean_train_val['query_id'])))
filtered_qid_dict = {int(k): v for k, v in qid_counts.items() if v <= 200 and v >= 70}
filtered_qids = filtered_qid_dict.keys()

clean_train_filtered = clean_train[clean_train['query_id'].isin(filtered_qids)]
clean_val_filtered = clean_val[clean_val['query_id'].isin(filtered_qids)]

### Saving splits 

In [None]:
fpath = '../../output'

clean_train_filtered.to_csv("{fpath}/normalised_mslr_train_filtered_fld{fold_no}.csv".format(fpath=fpath,fold_no=fold_no), index=False)
clean_val_filtered.to_csv("{fpath}/normalised_mslr_vali_filtered_fld{fold_no}.csv".format(fpath=fpath,fold_no=fold_no), index=False)

clean_train.to_csv("{fpath}/normalised_mslr_train_fld{fold_no}.csv".format(fpath=fpath,fold_no=fold_no), index=False)
clean_val.to_csv("{fpath}/normalised_mslr_vali_fld{fold_no}.csv".format(fpath=fpath,fold_no=fold_no), index=False)
clean_test.to_csv("{fpath}/normalised_mslr_test_fld{fold_no}.csv".format(fpath=fpath,fold_no=fold_no), index=False)

# DNN 

In [7]:
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)

In [15]:
def init_weights(shape):
    """ Weight initialization """
    weights = tf.random_normal(shape,stddev=0.8)
    return tf.Variable(weights)

def forwardprop(X, w_1, w_2, w_3, w_4, w_5, biases_h, biases_h1, biases_h2, biases_h3, biases_y):
    """
    Forward-propagation.
    IMPORTANT: yhat is not softmax since TensorFlow's softmax_cross_entropy_with_logits() does that internally.
    """
    h   = tf.nn.dropout(tf.nn.relu(tf.matmul(X, w_1) + biases_h), keep_prob)  # The \sigma function
    h1 = tf.nn.dropout(tf.nn.relu(tf.matmul(h,w_2) + biases_h1), keep_prob)
    h2 = tf.nn.dropout(tf.nn.relu(tf.matmul(h1,w_3) + biases_h2), keep_prob)
    h3 = tf.nn.dropout(tf.nn.relu(tf.matmul(h2,w_4) + biases_h3), keep_prob)
    
    yhat = tf.matmul(h3, w_5) + biases_y  # The \varphi function
    return yhat

def unison_shuffled_copies(a, b, c):
    assert len(a) == len(b)
    assert len(b) == len(c)
    p = np.array(np.random.permutation(len(a)))
    return a[p], b[p], c[p]

def get_doc_data(fpath, fold_no, sample_lim=5255):
    train_data = pd.read_csv("{fpath}/normalised_mslr_train_fld{fold_no}.csv".format(fpath=fpath, fold_no=fold_no))
    train_data = train_data.dropna()
    
    validation_data = pd.read_csv("{fpath}/normalised_mslr_vali_fld{fold_no}.csv".format(fpath=fpath, fold_no=fold_no))
    validation_data = validation_data.dropna()
    
    data = pd.concat([train_data,validation_data])
    
    zero_label = data[data["label"]==0].iloc[0:sample_lim]
    one_label = data[data["label"]==1].iloc[0:sample_lim]
    two_label = data[data["label"]==2].iloc[0:sample_lim]
    three_label = data[data["label"]==3].iloc[0:sample_lim]
    four_label = data[data["label"]==4].iloc[0:sample_lim]
    data = pd.concat([zero_label,one_label,two_label,three_label,four_label]).sample(frac=1, random_state=0)
    all_X = data[data.columns.difference(["label","query_id"])]
    all_Y = data[["label","query_id"]]
    num_classes = len([int(i) for i in list(all_Y["label"].unique())])

    vec_Y = []
    for index,label in all_Y["label"].iteritems():
        vec_Y.append([0 if int(label) != element else 1 for element in range(num_classes)])
    vec_qid = np.array(all_Y["query_id"])
    vec_X = np.array(all_X)
    return vec_X, vec_Y, vec_qid

def get_test_data(fpath, fold_no):
    test_full_data = pd.read_csv("{fpath}/normalised_mslr_test_fld{fold_no}.csv".format(fold_no=fold_no, fpath=fpath))
    data = test_full_data.dropna()
    
    all_X = data[data.columns.difference(["label","query_id"])]
    all_Y = data[["label","query_id"]]
    num_classes = len([int(i) for i in list(all_Y["label"].unique())])

    #vec_Y = all_Y.apply(lambda x: [0 if x != label else 1 for label in range(num_classes)])
    vec_Y = []
    for index,label in all_Y["label"].iteritems():
        vec_Y.append([0 if int(label) != element else 1 for element in range(num_classes)])
    vec_qid = np.array(all_Y["query_id"])
    vec_X = np.array(all_X)
    
    return vec_X, vec_Y, vec_qid

def adaptiveLearningRate(n, err_tmin1, err_t, rep):
    # Set parameters for adapting learning rate. Sets threshholds
    n_inc_ratio = 1
    n_inc = 1.005
    n_max = 1e3
    n_dec_ratio = 1.05
    n_dec = 0.3
    n_min = 1e-6
    rep_max = 10
    
    # Finds of previous and current error
    R = err_tmin1 / err_t
    print('R: ', R)

    # update weights
    # new error if lower, learning rate increased
    # new error is greater, learning rate decreased
    if R < n_inc_ratio:
        if n < n_max:
            n = min(n_max, n*n_inc)
    elif R > n_dec_ratio:
        if n > n_min:
            if rep < rep_max:
                # RESTORE OLD WEIGHTS
                rep += 1
                n = max(n_min, n*n_dec)
                
    return n, rep 

### Load in Normalised Values from csv Files 

In [17]:
fpath = '../../output'
fold_no = 1

vec_X,vec_Y,vec_qid  = get_doc_data(fpath, fold_no)
test_x, test_y, test_qid = get_test_data(fpath, fold_no)

### Set model parameters 

In [64]:
# Epoch and btch sizes
batch_size = 200
num_epochs = 1

# Layer's sizes
x_size = vec_X.shape[1]   # Number of input nodes: 4 features and 1 bias
h_size = 50               # Number of hidden nodes in first hidden layer
h1_size = 40                # Number of hidden nodes in first hidden layer
h2_size = 30                # Number of hidden nodes in third hidden laye
h3_size  = 20
beta= 0.1
y_size = len(vec_Y[0])   # Number of outcomes

# Dropout = 1 - keep probability (kp)
kp = 0.5

# Set adaptive_lr to 1 if you want to use adaptive learning rate 
adaptive_lr = 0
lr = 0.0004

# Variable
biases_h = tf.Variable(tf.zeros([h_size]))
biases_h1 = tf.Variable(tf.zeros([h1_size]))
biases_h2 = tf.Variable(tf.zeros([h2_size]))
biases_h3 = tf.Variable(tf.random_normal([h3_size]))
biases_y = tf.Variable(tf.zeros([y_size]))

# Symbols
X = tf.placeholder("float", shape=[None, x_size])
y = tf.placeholder("float", shape=[None, y_size])
keep_prob = tf.placeholder(tf.float32)
learning_rate = tf.placeholder(tf.float32, shape=[])

# Weight initializations
w_1 = init_weights((x_size, h_size))
w_2 = init_weights((h_size, h1_size))
w_3 = init_weights((h1_size, h2_size))
w_4 = init_weights((h2_size, h3_size))
w_5 = init_weights((h3_size, y_size))

# Forward propagation
yhat    = forwardprop(X, w_1, w_2, w_3, w_4, w_5, biases_h, biases_h1, biases_h2, biases_h3, biases_y)
confidence = tf.nn.softmax(yhat)
predict = tf.argmax(yhat, axis=1)

# Backward propagation
cost    = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=confidence) +
    beta*tf.nn.l2_loss(w_1) + beta*tf.nn.l2_loss(biases_h) +
    beta*tf.nn.l2_loss(w_2) + beta*tf.nn.l2_loss(biases_h1) + 
    beta*tf.nn.l2_loss(w_3) + beta*tf.nn.l2_loss(biases_h2) +
    beta*tf.nn.l2_loss(w_4) + beta*tf.nn.l2_loss(biases_h3) +
    beta*tf.nn.l2_loss(w_5) + beta*tf.nn.l2_loss(biases_y))
    
# Optimiser     
updates = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

### Running of Model

In [65]:
sess = tf.Session()
init = tf.global_variables_initializer()

# Initialiasation for adaptive learnign weights
rep = 0
err_min1 = 0

# Creating saver
saver = tf.train.Saver()
sess.run(init)

for epoch in range(0, num_epochs):
    # New Permutation
    temp_vec_X,temp_vec_Y, tempvec_qid = unison_shuffled_copies(np.array(vec_X),np.array(vec_Y), np.array(vec_qid))

    # Train with each example
    for i in range(len(vec_X)):
        sess.run(updates, feed_dict={X: vec_X[i: i + batch_size], y: vec_Y[i: i + batch_size], learning_rate: lr, keep_prob: kp})
    
    # Get accuracy results for train and test data 
    train_predictions = sess.run(predict, feed_dict={X: vec_X, y: vec_Y, keep_prob: 1})
    train_accuracy = np.mean(np.argmax(vec_Y, axis=1) == train_predictions)
    train_pred_composition = collections.Counter(train_predictions)
    
    test_predictions = sess.run(predict, feed_dict={X: test_x, y: test_y, keep_prob: 1})
    test_accuracy  = np.mean(np.argmax(test_y, axis=1) == test_predictions)
    test_pred_composition = collections.Counter(test_predictions)

    comp = zip(list(np.argmax(test_y, axis=1)),list(test_predictions))
    print(comp[0:100])
    right = sum([1 if x[0] == 4 and x[1] == 4 else 0 for x in comp])
    wrong = sum([1 if (x[0] == 4 or x[1] == 4) and x[0] != x[1] else 0 for x in comp])
    
    print("Epoch = %d, train accuracy = %.2f%%, test accuracy = %.10f%%, 4 Label Accuracy = % .2f%%"
          % (epoch + 1, 100. * train_accuracy, 100. * test_accuracy, 100. * (right/(right+wrong))))

    print("Training Pred Composition")
    print([(x,train_pred_composition[x]) for x in range(5)])
    print("Test pred Composition")
    print([(x,test_pred_composition[x]) for x in range(5)])
    
    # Update Learning Rate
    if adaptive_lr == 1:
        err = (1 - train_accuracy)
        lr, rep = adaptiveLearningRate(lr, err_min1, err, rep)
        err_min1 = err
        print('error rate: ', err, 'lr: ', lr)
    else:
        lr = lr
    
    # Take final run and save predicitions for later analysis
    if epoch == num_epochs-1:
        print('we are here')
        predictions = sess.run(confidence, feed_dict={X: test_x, y: test_y, keep_prob: 1.0})
        raw_output = sess.run(yhat, feed_dict={X: test_x, y: test_y, keep_prob: 1.0})
        
        # Getting confidence levels
        confi = list(np.max(predictions,axis=1))

        # Getting label predictions
        predictions = [list(x) for x in predictions]
        pred_labels = [pred.index(max(pred)) for pred in predictions]

        # Getting target labels
        true_labels = list(np.argmax(test_y, axis=1))

        # Get query ID's
        test_qid = [int(x) for x in test_qid]

        # Creating dataframe out model outputs
        model_result = pd.DataFrame({"qid": test_qid, "label_true": true_labels,"label_pred": pred_labels, "confidence": confi})
        
        # Sorting them into rank lists using predicted label and confidence
        sorted_predictions = model_result.sort_values(["qid", "label_pred","confidence"], ascending=False)
        sorted_predictions.index = range(sorted_predictions.shape[0])

        # Saving the results to file
        sorted_predictions.to_csv("../../output/Model_Predictions.csv", index=0) 

sess.close()

[(1, 3), (3, 3), (1, 3), (0, 3), (0, 3), (1, 3), (0, 4), (0, 4), (2, 4), (1, 1), (1, 3), (2, 1), (2, 4), (1, 4), (2, 4), (1, 1), (2, 4), (0, 4), (1, 3), (0, 3), (1, 1), (0, 4), (0, 4), (1, 4), (0, 4), (0, 3), (2, 4), (2, 3), (2, 4), (1, 4), (1, 4), (0, 4), (0, 4), (1, 4), (0, 3), (0, 1), (1, 1), (3, 4), (0, 3), (1, 4), (1, 4), (1, 3), (0, 4), (1, 4), (1, 4), (1, 3), (0, 4), (2, 4), (0, 4), (2, 3), (1, 1), (0, 4), (0, 4), (0, 1), (2, 4), (1, 4), (1, 4), (1, 1), (3, 3), (2, 4), (0, 3), (0, 3), (1, 4), (0, 3), (2, 4), (1, 4), (2, 1), (1, 4), (2, 4), (1, 3), (0, 4), (1, 4), (3, 4), (1, 3), (0, 3), (3, 4), (0, 4), (0, 4), (1, 3), (1, 4), (0, 3), (1, 3), (1, 4), (0, 1), (1, 4), (3, 4), (1, 3), (2, 3), (2, 4), (0, 4), (0, 3), (2, 4), (0, 3), (0, 4), (2, 3), (1, 3), (2, 4), (1, 4), (2, 1), (1, 4)]
Epoch = 1, train accuracy = 18.26%, test accuracy = 11.2930606161%, 4 Label Accuracy =  0.00%
Training Pred Composition
[(0, 0), (1, 8543), (2, 2), (3, 5588), (4, 12142)]
Test pred Composition
[(0, 0

### Evaluate Model

In [71]:
def score_predictions(model_result):
#     Get list of query ids
    unique_qid = model_result['qid'].unique()
    
    # Create data frame to store results for each q_id
    columns = ['qid','ndcg', 'err']
    score_rec = pd.DataFrame(columns=columns)
    
    for qid in unique_qid:
        # Get list of all documents for a given qid
        qid_list = model_result[model_result['qid']==qid]
        qid_list = qid_list.sort_values(by=['label_pred', 'confidence'], ascending=[0, 0])

        ndcg = ltr.evals.ndcg_at_rank_N(qid_list['label_true'], 10)
        err = ltr.evals.err(qid_list['label_true'], 10)
        row = [qid, ndcg, err]

        score_rec = score_rec.append(pd.Series(row, index=columns), ignore_index=True)
    
    return score_rec

In [74]:
model_result = pd.read_csv("../../output/Model_Predictions.csv")

In [75]:
score_rec = score_predictions(model_result)
score_rec

Unnamed: 0,qid,ndcg,err
0,29998.0,0.322430,0.299049
1,29983.0,0.126168,0.210681
2,29968.0,0.144978,0.167099
3,29953.0,0.271724,0.186647
4,29938.0,0.100717,0.112075
5,29923.0,0.025387,0.097872
6,29908.0,0.108498,0.147453
7,29893.0,0.000000,0.085073
8,29878.0,0.216891,0.251053
9,29863.0,0.157981,0.180079


In [76]:
ndcg_mean = np.mean(score_rec['ndcg'])
ndcg_std = np.std(score_rec['ndcg'])

err_mean = np.mean(score_rec['err'])
err_std = np.std(score_rec['err'])

print('ndcg: ', ndcg_mean, ' +- ', ndcg_std)
print('err: ', err_mean, ' +- ', err_std)

('ndcg: ', 0.18066535450125656, ' +- ', 0.15397212837600135)
('err: ', 0.20427385157148484, ' +- ', 0.12588851062884993)
