In [1]:
%reload_ext autoreload
import sys
import os
sys.path.append(os.path.abspath("../../ucl_irdm2017_project2_group1"))

from ltr.data_load import make_rank_data_csv
import ltr.dnn_utils
import ltr.evals

import pandas as pd
import numpy as np

from itertools import combinations
from collections import Counter
import tensorflow as tf 
# clear_output is imported from IPython's public display module.
from IPython.display import clear_output
import time

# Data Preprocessing

##### - Load in the data and write to csvs
##### - Take in the features and normalise them per query, scaling each feature to the range [-1, 1]
##### - Remove outlier queries with a very high or very low number of associated documents

In [6]:
# Configuration: which MSLR-WEB10K fold to import, the input path handed to
# the loader, and the names of the three splits.
dataset = ['train', 'vali', 'test']
fpath, fold_no = '../../input/', 1

In [4]:
# Run the project loader once per split of the selected fold.
train, vali, test = [
    make_rank_data_csv(fpath, fold_no, split_name)
    for split_name in ('train', 'vali', 'test')
]

In [8]:
# Stack train, vali and test (in that order) into a single frame used for
# normalisation; ignore_index renumbers the rows 0..n-1.
full_data = pd.concat((train, vali, test), ignore_index=True)

In [9]:
# Distinct query ids, in order of first appearance in full_data
unique_qry = pd.unique(full_data["query_id"])

In [None]:
# For each query id in unique_qry, count the number of documents (rows of
# full_data) that belong to it. The rows are tallied in a single pass with
# value_counts and then put back into unique_qry order, instead of filtering
# the whole frame once per query id.
docs_per_query = full_data['query_id'].value_counts()
num_docs = docs_per_query.reindex(unique_qry, fill_value=0).tolist()

In [None]:
# Per-query feature statistics, needed to normalise features within each query.
# Every collected row has the form [query_id, stat(col_1), ..., stat(col_n)],
# where the columns are everything in full_data except 'label' and 'query_id'.
mean_params = []
max_params = []
min_params = []

# groupby(sort=False) yields the queries in order of first appearance (the same
# order as full_data["query_id"].unique()) while partitioning the frame once,
# rather than building a boolean mask over every row for every query id.
for q_id, query_rows in full_data.groupby('query_id', sort=False):
    query = query_rows.drop(['label', 'query_id'], axis=1)
    average = list(query.mean())
    max_values = list(query.max())
    min_values = list(query.min())
    mean_params.append([q_id] + average)
    max_params.append([q_id] + max_values)
    # The minima are required by the normalisation step, so collect them too.
    min_params.append([q_id] + min_values)

In [None]:
# Spot-check the statistics: show the values computed for the last query that
# was processed and how many per-query rows were collected. mean_params and
# max_params hold one row per query, so printing them whole floods the output.
print("last query mean:", average)
print("last query max: ", max_values)
print("last query min: ", min_values)
print("per-query rows collected (mean, max):", len(mean_params), len(max_params))

In [None]:
def _label_stats(rows, prefix):
    """Build a DataFrame from per-query stat rows and name its columns.

    The first column is called "query_id"; the remaining columns are named
    after the columns of `cleaned` from position 3 onwards, each with
    `prefix` prepended.
    """
    frame = pd.DataFrame(rows)
    frame.columns = ["query_id"] + [prefix + col for col in cleaned.columns[3:]]
    return frame


mean_cleaned = _label_stats(mean_params, "mean_")

max_cleaned = _label_stats(max_params, "max_")

# NOTE(review): the minimum columns use the prefix "m_" rather than "min_";
# confirm whether that is intentional before relying on these column names.
min_cleaned = _label_stats(min_params, "m_")

## Normalising the features by query partitions

In [None]:
# Normalise every feature within its query partition:
#     x_norm = 2 * (x - min) / (max - min) - 1
# with min / max taken from the per-query rows of min_cleaned / max_cleaned.
# The whole table is handled at once with NumPy broadcasting instead of one
# Python-level iteration (and several DataFrame look-ups) per document, and the
# number of features is taken from the data rather than hard-coded.

# Feature columns start at position 3 of `cleaned`.
feature_values = cleaned.iloc[:, 3:].values.astype(float)
row_query_ids = cleaned['query_id'].values

min_by_query = min_cleaned.set_index('query_id')
max_by_query = max_cleaned.set_index('query_id')

# Position of each document's query inside the statistics tables (-1 = absent).
min_pos = min_by_query.index.get_indexer(row_query_ids)
max_pos = max_by_query.index.get_indexer(row_query_ids)
if (min_pos < 0).any() or (max_pos < 0).any():
    raise IndexError("some query ids in `cleaned` have no min/max statistics")

row_min = min_by_query.values[min_pos]
row_max = max_by_query.values[max_pos]

# 0/0 is expected wherever max == min; silence the warning, it is handled below.
with np.errstate(divide='ignore', invalid='ignore'):
    norm_values = 2 * (feature_values - row_min) / (row_max - row_min) - 1

# NaNs indicate division by zero, which means max == min, so set them to zero
# (naive fix).
norm_values[np.isnan(norm_values)] = 0.0

# One row per document: [query_id, normalised feature values...]. Stacking the
# ids with the float features stores the query id as a float as well.
norm_features = np.column_stack((row_query_ids, norm_values)).tolist()

# Release the large intermediate arrays; only norm_features is used afterwards.
del feature_values, row_min, row_max, norm_values

## Create pandas dataframe from normalised data, adding label column

In [None]:
# Tabulate the normalised rows; columns are numbered 0..k at this point.
norm_cleaned = pd.DataFrame(data=norm_features)

In [None]:
# Put the 'label' column of `cleaned` in front of the normalised data and give
# the frame the column names of `cleaned`, leaving out its 'Unnamed: 0' column
# (assumed to be the first column of `cleaned` -- TODO confirm).
# The guard makes the cell safe to re-run: inserting 'label' a second time
# would raise ValueError.
if 'label' not in norm_cleaned.columns:
    norm_cleaned.insert(0, 'label', cleaned['label'])
    norm_cleaned.columns = cleaned.columns.drop('Unnamed: 0')

In [None]:
# Scratch check of slice semantics: the end index of a slice is exclusive.
a = list(range(8))
a[:3]

## Splitting data back into train / validation / test sets 

In [None]:
# Cut the normalised frame back into the three splits. The boundaries come
# from the sizes of the loaded splits, so they follow whichever fold was
# selected instead of being fixed to the row counts of one fold.
# Assumes norm_cleaned keeps the train, vali, test row order of the
# concatenated data -- TODO confirm against how `cleaned` was produced.
n_train = len(train)
n_vali = len(vali)

# A row-count mismatch means positional slicing would put documents in the
# wrong split, so stop instead of splitting silently at the wrong offsets.
if len(norm_cleaned) != n_train + n_vali + len(test):
    raise ValueError(
        "norm_cleaned has {} rows but train + vali + test have {}".format(
            len(norm_cleaned), n_train + n_vali + len(test)))

clean_train = norm_cleaned.iloc[:n_train]
clean_val = norm_cleaned.iloc[n_train:n_train + n_vali]
clean_test = norm_cleaned.iloc[n_train + n_vali:]

### Getting Filter Keys

### Saving splits 

In [None]:
# Write every split to CSV without the index column. The fold number is taken
# from fold_no, so the file names match the fold that was actually processed
# (for fold 1 the paths are unchanged), and the output directory is created
# when it does not exist yet.
# NOTE(review): clean_train_filtered / clean_val_filtered are not defined in
# the cells shown here; confirm the filtering step runs before this cell.
out_dir = "Data/Full_Deep_Youtube_Data"
os.makedirs(out_dir, exist_ok=True)

splits_to_save = [
    ("train_filtered", clean_train_filtered),
    ("vali_filtered", clean_val_filtered),
    ("train", clean_train),
    ("vali", clean_val),
    ("test", clean_test),
]

for split_name, split_frame in splits_to_save:
    split_frame.to_csv(
        "{}/normalised_mslr_{}_fld{}.csv".format(out_dir, split_name, fold_no),
        index=False)

### 