# Deep NN Implementation 

In [58]:
%reload_ext autoreload
import sys
import os
sys.path.append(os.path.abspath("../../ucl_irdm2017_project2_group1"))

from ltr.data_load import make_rank_data_csv
import ltr.dnn_utils
# import ltr.evals

import pandas as pd
import numpy as np

from itertools import combinations
from collections import Counter
import tensorflow as tf 
# from IPython.core.display import clear_ouptput
import time
from sklearn import datasets
from sklearn.model_selection import train_test_split

### Data Preprocessing

* Load in the data and write to csvs
* Take in the features and normalise, scaling from 0-1
* Remove outlier querys with very high or low associated documents

In [4]:
# Specify the fold from the MSLR-10K dataset you wish to import 
fpath = '../../input/'
fold_no = 1
dataset = ['train', 'vali', 'test']

In [5]:
# Load in dataset and get pandas DataFrames
train = make_rank_data_csv(fpath, fold_no, 'train')
vali = make_rank_data_csv(fpath, fold_no, 'vali')
test = make_rank_data_csv(fpath, fold_no, 'test')

In [12]:
# Make a dataset of all data for normalisation and reset indices
full_data = pd.concat([train,vali,test])
full_data.index = range(full_data.shape[0])

In [13]:
# Get list of unique query ids 
unique_qry = full_data["query_id"].unique()

In [None]:
# Find stats about features in order to normalise
mean_params = []
max_params = []
min_params = []

for q_id in unique_qry:
    query = full_data[full_data['query_id'] == q_id].drop(['label', 'query_id'], axis=1)
    average = list(query.mean())
    max_values = list(query.max())
    min_values = list(query.min())
    mean_params.append([q_id] + average)
    max_params.append([q_id] + max_values)
    min_params.append([q_id] + min_values)

In [None]:
mean_cleaned = pd.DataFrame(mean_params)
mean_cleaned.columns = ["query_id"] + ["mean_" + col for col in full_data.columns[2:]]

max_cleaned = pd.DataFrame(max_params)
max_cleaned.columns = ["query_id"] + ["max_" + col for col in full_data.columns[2:]]

min_cleaned = pd.DataFrame(max_params)
min_cleaned.columns = ["query_id"] + ["max_" + col for col in full_data.columns[2:]]


In [92]:
mean_cleaned

Unnamed: 0,query_id,mean_covered query term number - body,mean_covered query term number - anchor,mean_covered query term number - title,mean_covered query term number - url,mean_covered query term number - whole document,mean_covered query term ratio - body,mean_covered query term ratio - anchor,mean_covered query term ratio - title,mean_covered query term ratio - url,...,mean_Length of URL,mean_Inlink number,mean_Outlink number,mean_PageRank,mean_SiteRank,mean_QualityScore,mean_QualityScore2,mean_Query-url click count,mean_url click count,mean_url dwell time
0,1,,,,,,,,,,...,,,,,,,,,,
1,16,,,,,,,,,,...,,,,,,,,,,
2,31,,,,,,,,,,...,,,,,,,,,,
3,46,,,,,,,,,,...,,,,,,,,,,
4,61,,,,,,,,,,...,,,,,,,,,,
5,76,,,,,,,,,,...,,,,,,,,,,
6,91,,,,,,,,,,...,,,,,,,,,,
7,106,,,,,,,,,,...,,,,,,,,,,
8,121,,,,,,,,,,...,,,,,,,,,,
9,136,,,,,,,,,,...,,,,,,,,,,


### Normalising the features by query parititons 

In [84]:
c = full_data.ix[0:10]

for index,row in enumerate(c.iterrows()):
    q_id = row[1]['query_id']
    
    max_vals = np.array(max_cleaned[max_cleaned['query_id']==q_id].drop(['query_id'],axis=1))
    min_vals = np.array(max_cleaned[max_cleaned['query_id']==q_id].drop(['query_id'],axis=1))
        
    norm_row = ((2*(row[1][2:])) / (max_vals - min_vals)) - 1
    
for v in max_vals[0]: print(type(v))

covered query term number - body                             3
covered query term number - anchor                           3
covered query term number - title                            0
covered query term number - url                              0
covered query term number - whole document                   3
covered query term ratio - body                              1
covered query term ratio - anchor                            1
covered query term ratio - title                             0
covered query term ratio - url                               0
covered query term ratio - whole document                    1
stream length - body                                       156
stream length - anchor                                       4
stream length - title                                        0
stream length - url                                          7
stream length - whole document                             167
IDF(Inverse document frequency) - body                6

TypeError: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [89]:
norm_features = []

for index,row in enumerate(full_data.iterrows()):
    # Getting Query ID
    q_id = row[1]['query_id']
    
    for elem in row:
        print(elem, type(elem))
    
    print(q_id)
    # Normalisation formula: 2*(x - min)/(max - min) - 1
    norm_row = np.array(2*((np.array(row[1][2:]) - \
    np.array(min_cleaned[min_cleaned["query_id"]==q_id].drop(["query_id"],axis=1))[0])) / \
    ((np.array(max_cleaned[max_cleaned["query_id"] == q_id].drop(["query_id"],axis=1))[0]) - \
    (np.array(min_cleaned[min_cleaned["query_id"] == q_id].drop(["query_id"],axis=1))[0]))-([1]*136))

    # Nans indicate division by zero, which means max == min, so setting to zero, Naive fix
    norm_row[np.isnan(norm_row)] = 0.0
    norm_features.append([q_id] + list(norm_row))
    if index%10000 == 0:
        print(index)

(0, <type 'numpy.int64'>)
(label                                                        2
query_id                                                     1
covered query term number - body                             3
covered query term number - anchor                           3
covered query term number - title                            0
covered query term number - url                              0
covered query term number - whole document                   3
covered query term ratio - body                              1
covered query term ratio - anchor                            1
covered query term ratio - title                             0
covered query term ratio - url                               0
covered query term ratio - whole document                    1
stream length - body                                       156
stream length - anchor                                       4
stream length - title                                        0
stream length - url         

TypeError: unsupported operand type(s) for -: 'str' and 'float'

### Create pandas dataframe from normalised data, adding label column

In [None]:
norm_cleaned = pd.DataFrame(norm_features)

In [None]:
norm_cleaned.insert(0,'label', cleaned['label'])
norm_cleaned.insert(0,'Unnamed: 0', cleaned['Unnamed: 0'])
norm_cleaned.columns = cleaned.columns
del norm_cleaned['Unnamed: 0']

### Splitting data back into train / validation / test sets 

In [None]:
clean_train = norm_cleaned.iloc[0:train.shape[0]+1]
clean_val = norm_cleaned.iloc[train.shape[0]+1:train.shape[0]+vali.shape[0]+1]
clean_test = norm_cleaned.iloc[train.shape[0]+vali.shape[0]+1:]

### Getting Filter Keys

In [None]:
clean_train_val = norm_cleaned.iloc[0:723412+235259]
qid_counts = dict(Counter(list(clean_train_val['query_id'])))
filtered_qid_dict = {int(k): v for k, v in qid_counts.items() if v <= 200 and v >= 70}
filtered_qids = filtered_qid_dict.keys()

clean_train_filtered = clean_train[clean_train['query_id'].isin(filtered_qids)]
clean_val_filtered = clean_val[clean_val['query_id'].isin(filtered_qids)]

### Saving splits 

In [None]:
clean_train_filtered.to_csv("Data/Full_Deep_Youtube_Data/normalised_mslr_train_filtered_fld1.csv", index=False)
clean_val_filtered.to_csv("Data/Full_Deep_Youtube_Data/normalised_mslr_vali_filtered_fld1.csv", index=False)

clean_train.to_csv("Data/Full_Deep_Youtube_Data/normalised_mslr_train_fld1.csv", index=False)
clean_val.to_csv("Data/Full_Deep_Youtube_Data/normalised_mslr_vali_fld1.csv", index=False)
clean_test.to_csv("Data/Full_Deep_Youtube_Data/normalised_mslr_test_fld1.csv", index=False)

# DNN 

In [59]:
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)

In [None]:
def init_weights(shape):
    """ Weight initialization """
    weights = tf.random_normal(shape,stddev=0.8)
    return tf.Variable(weights)

def unison_shuffled_copies(a, b, c):
    assert len(a) == len(b)
    assert len(b) == len(c)
    p = np.array(np.random.permutation(len(a)))
    return a[p], b[p], c[p]

def forwardprop(X, w_1, w_2, w_3, w_4, w_5, biases_h, biases_h1, biases_h2, biases_h3, biases_y):
    """
    Forward-propagation.
    IMPORTANT: yhat is not softmax since TensorFlow's softmax_cross_entropy_with_logits() does that internally.
    """
    h   = tf.nn.dropout(tf.nn.relu(tf.matmul(X, w_1) + biases_h), keep_prob)  # The \sigma function
    h1 = tf.nn.dropout(tf.nn.relu(tf.matmul(h,w_2) + biases_h1), keep_prob)
    h2 = tf.nn.dropout(tf.nn.relu(tf.matmul(h1,w_3) + biases_h2), keep_prob)
    h3 = tf.nn.dropout(tf.nn.relu(tf.matmul(h2,w_4) + biases_h3), keep_prob)
    #h4 = tf.nn.dropout(tf.nn.relu(tf.matmul(h3,w_5) + biases_h4), keep_prob)
    
    #yhat = tf.matmul(h2, w_4) + biases_y
    yhat = tf.matmul(h3, w_5) + biases_y  # The \varphi function
    return yhat

def get_Doc_Data():
    train_data = pd.read_csv("Data/Full_Deep_Youtube_Data/normalised_mslr_train_fld1.csv")
    train_data = train_data.dropna()
    
    validation_data = pd.read_csv("Data/Full_Deep_Youtube_Data/normalised_mslr_vali_fld1.csv")
    validation_data = validation_data.dropna()
    
    data = pd.concat([train_data,validation_data])
    
    zero_label = data[data["label"]==0].iloc[0:5255]
    one_label = data[data["label"]==1].iloc[0:5255]
    two_label = data[data["label"]==2].iloc[0:5255]
    three_label = data[data["label"]==3].iloc[0:5255]
    four_label = data[data["label"]==4].iloc[0:5255]
    data = pd.concat([zero_label,one_label,two_label,three_label,four_label]).sample(frac=1, random_state=0)
    all_X = data[data.columns.difference(["label","query_id"])]
    all_Y = data[["label","query_id"]]
    num_classes = len([int(i) for i in list(all_Y["label"].unique())])

    #vec_Y = all_Y.apply(lambda x: [0 if x != label else 1 for label in range(num_classes)])
    vec_Y = []
    for index,label in all_Y["label"].iteritems():
        vec_Y.append([0 if int(label) != element else 1 for element in range(num_classes)])
    vec_qid = np.array(all_Y["query_id"])
    vec_X = np.array(all_X)
    return vec_X, vec_Y, vec_qid

def get_test_Data():
    test_full_data = pd.read_csv("Data/Full_Deep_Youtube_Data/normalised_mslr_test_fld1.csv")
    data = test_full_data.dropna()
    
    all_X = data[data.columns.difference(["label","query_id"])]
    all_Y = data[["label","query_id"]]
    num_classes = len([int(i) for i in list(all_Y["label"].unique())])

    #vec_Y = all_Y.apply(lambda x: [0 if x != label else 1 for label in range(num_classes)])
    vec_Y = []
    for index,label in all_Y["label"].iteritems():
        vec_Y.append([0 if int(label) != element else 1 for element in range(num_classes)])
    vec_qid = np.array(all_Y["query_id"])
    vec_X = np.array(all_X)
    
    return vec_X, vec_Y, vec_qid

def adaptiveLearningRate(n, err_tmin1, err_t, rep):
    # Set parameters for adapting learning rate. Sets threshholds
    n_inc_ratio = 1
    n_inc = 1.005
    n_max = 1e3
    n_dec_ratio = 1.05
    n_dec = 0.3
    n_min = 1e-6
    rep_max = 10
    
    # Finds of previous and current error
    R = err_tmin1 / err_t
    print('R: ', R)

    # update weights
    # new error if lower, learning rate increased
    # new error is greater, learning rate decreased
    if R < n_inc_ratio:
        if n < n_max:
            n = min(n_max, n*n_inc)
    elif R > n_dec_ratio:
        if n > n_min:
            if rep < rep_max:
                # RESTORE OLD WEIGHTS
                rep += 1
                n = max(n_min, n*n_dec)
                
    return n, rep 

In [None]:
vec_X,vec_Y,vec_qid  = get_Doc_Data()

In [None]:
test_x, test_y, test_qid = get_test_Data()

In [None]:
# Layer's sizes
x_size = vec_X.shape[1]   # Number of input nodes: 4 features and 1 bias
h_size = 600               # Number of hidden nodes in first hidden layer
h1_size = 400                # Number of hidden nodes in first hidden layer
h2_size = 200                # Number of hidden nodes in third hidden laye
h3_size  = 100
#h4_size = 50
beta= 0.01
y_size = len(vec_Y[0])   # Number of outcomes (3 iris flowers)

# Variable
biases_h = tf.Variable(tf.zeros([h_size]))
biases_h1 = tf.Variable(tf.zeros([h1_size]))
biases_h2 = tf.Variable(tf.zeros([h2_size]))
biases_h3 = tf.Variable(tf.random_normal([h3_size]))
#biases_h4 = tf.Variable(tf.random_normal([h4_size]))
biases_y = tf.Variable(tf.zeros([y_size]))


# Symbols
X = tf.placeholder("float", shape=[None, x_size])
y = tf.placeholder("float", shape=[None, y_size])
keep_prob = tf.placeholder(tf.float32)
learning_rate = tf.placeholder(tf.float32, shape=[])

# Weight initializations
w_1 = init_weights((x_size, h_size))
w_2 = init_weights((h_size, h1_size))
w_3 = init_weights((h1_size, h2_size))
w_4 = init_weights((h2_size, h3_size))
w_5 = init_weights((h3_size, y_size))
#w_6 = init_weights((h4_size, y_size))

# Forward propagation
yhat    = forwardprop(X, w_1, w_2, w_3, w_4, w_5, biases_h, biases_h1, biases_h2, biases_h3, biases_y)
confidence = tf.nn.softmax(yhat)
predict = tf.argmax(yhat, axis=1)

# Backward propagation
cost    = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=confidence) +
    beta*tf.nn.l2_loss(w_1) + beta*tf.nn.l2_loss(biases_h) +
    beta*tf.nn.l2_loss(w_2) + beta*tf.nn.l2_loss(biases_h1) + 
    beta*tf.nn.l2_loss(w_3) + beta*tf.nn.l2_loss(biases_h2) +
    beta*tf.nn.l2_loss(w_4) + beta*tf.nn.l2_loss(biases_y))
    

#updates = tf.train.AdamOptimizer(0.01).minimize(cost)
updates = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

In [60]:
batch_size = 200

### Running of Model

In [None]:
#c = np.c_[temp_vec_x.reshape(len(temp_vec_x), -1), temp_vec_y.reshape(len(temp_vec_y), -1)]
# Run SGD
sess = tf.Session()
init = tf.global_variables_initializer()

#Initlise values to Adaptive Learning Rate and keep_probability
# Set adaptive_lr to 1 if you want to use adaptive learning rate 
kp = 0.2

adaptive_lr = 0
lr = 0.0001
rep = 0
err_min1 = 0

# Creating saver
saver = tf.train.Saver()
sess.run(init)

for epoch in range(10):
    
    # New Permutation
    temp_vec_X,temp_vec_Y, tempvec_qid = unison_shuffled_copies(np.array(vec_X),np.array(vec_Y), np.array(vec_qid))

    # Train with each example
    for i in range(len(vec_X)):
        sess.run(updates, feed_dict={X: vec_X[i: i + batch_size], y: vec_Y[i: i + batch_size], learning_rate: lr, keep_prob: kp})
    
    train_predictions = sess.run(predict, feed_dict={X: vec_X, y: vec_Y, keep_prob: 1})
    train_accuracy = np.mean(np.argmax(vec_Y, axis=1) ==
                             train_predictions)
    train_pred_composition = collections.Counter(train_predictions)
    
    test_predictions = sess.run(predict, feed_dict={X: test_x, y: test_y, keep_prob: 1})
    test_accuracy  = np.mean(np.argmax(test_y, axis=1) ==
                             test_predictions)
    test_pred_composition = collections.Counter(test_predictions)


    comp = zip(list(np.argmax(test_y, axis=1)),list(test_predictions))
    print(comp[0:100])
    right = sum([1 if x[0] == 4 and x[1] == 4 else 0 for x in comp])
    wrong = sum([1 if (x[0] == 4 or x[1] == 4) and x[0] != x[1] else 0 for x in comp])
    
    print("Epoch = %d, train accuracy = %.2f%%, test accuracy = %.10f%%, 4 Label Accuracy = % .2f%%"
          % (epoch + 1, 100. * train_accuracy, 100. * test_accuracy, 100. * (right/(right+wrong))))

    print("Training Pred Composition")
    print([(x,train_pred_composition[x]) for x in range(5)])
    print("Test pred Composition")
    print([(x,test_pred_composition[x]) for x in range(5)])
    
    # Update Learning Rate
    if adaptive_lr == 1:
        err = (1 - train_accuracy)
        lr, rep = adaptiveLearningRate(lr, err_min1, err, rep)
        err_min1 = err
        print('error rate: ', err, 'lr: ', lr)
    else:
        lr = lr
    
# Save the variables to disk.
save_path = saver.save(sess, os.path.join(os.getcwd(), "model/Deep_MLP_NN.ckpt"))
print("Model saved to file")

sess.close()

### Post-procession
* Converting confidence values into predicted labels for evaluation

In [None]:
# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Later, launch the model, use the saver to restore variables from disk, and
# do some work with the model.
with tf.Session() as sess:
    
    # Restore variables from disk.
    saver.restore(sess, "model/Deep_MLP_NN.ckpt")
    print("Model restored.")
    print(predictions)      

In [None]:
# Getting confidence levels
#confi = list(np.max(raw_output,axis=1))
confi = list(np.max(predictions,axis=1))
# Getting actually label predictions
predictions = [list(x) for x in predictions]
pred_labels = [pred.index(max(pred)) for pred in predictions]

# Getting target labels
true_labels = list(np.argmax(test_y, axis=1))

# Get query ID's
test_qid = [int(x) for x in test_qid]

# Creating dataframe out model outputs
model_result = pd.DataFrame({"Query_ID": test_qid, "True_Label": true_labels,"Pred_Label": pred_labels, "Confidence": confi})

### Evaluate Model

In [None]:
# Sorting into ranked lists using predicted label and confidence
# sorted_predictions = model_result.sort_values(["Query_ID", "Pred_Label","Confidence"], ascending=False)
# sorted_predictions.index = range(sorted_predictions.shape[0])

# Saving the results to file
# sorted_predictions.to_csv("Data/Full_Deep_Youtube_Data/Model_Predictions.csv", index=False)

In [None]:
def rank_query(data, qid):
    """
    Description: Generates a list of the true labels, ranked by their expected relevance
    for the specified query
    
    Inputs:
        data: Pandas Dataframe with columns {qid, label_true, ERel}, grouped by qid
        qid: query id of query to return
    
    """
    
    data = data.get_group(qid)
    return data.sort_values('ERel', ascending=False).ix[:,'label_true']