In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv('original_data/train.txt', sep=' ' , header=None)
validation = pd.read_csv('original_data/vali.txt', sep=' ' , header=None)
test = pd.read_csv('original_data/test.txt', sep=' ' , header=None)

# last column is nan
train.drop(train.columns[len(train.columns)-1], axis=1, inplace=True)
validation.drop(validation.columns[len(validation.columns)-1], axis=1, inplace=True)
test.drop(test.columns[len(test.columns)-1], axis=1, inplace=True)

train.iloc[:, 1:] = train.iloc[:, 1:].applymap(lambda s: float(s.split(':')[1]))
# train.to_csv('data/train_preprocessed.csv')

test.iloc[:, 1:] = test.iloc[:, 1:].applymap(lambda s: float(s.split(':')[1]))
# test.to_csv('data/test_preprocessed.csv')

validation.iloc[:, 1:] = validation.iloc[:, 1:].applymap(lambda s: float(s.split(':')[1]))
# validation.to_csv('data/validation_preprocessed.csv')

In [5]:
def preprocess(df, name,SS = None, cut_off = None):
    # change relevance to binary
#     df[0].replace(to_replace=[1,2,3], value = 0, inplace=True)
#     df[0].replace(to_replace=[4,5], value = 1, inplace=True)    
    
    # 0 = relevance, 1 = query_id
    avoid = [0,1]
    # these features are binary so don't standardize
    binary_feats = [97, 98, 99, 100, 101]
    # these are the remaining features that we will standardize
    cont_feats = [i for i in np.arange(len(list(train))) if i not in binary_feats and i not in avoid]
    
    # these are the ids for queries

    query_ids = set(df[1])
    keep_idx = []
    # for each query subsample 20 docs with at most 3 relevant
    for query_id in query_ids:
        # get dataframe indicies of all docs for one query
        query_idx = df.index[df[1] == query_id].tolist()
        #print(df.loc[query_idx][1], query_id)
        # query must have at least 20 docs and must have something relevant in it
        if np.sum(df.loc[query_idx][0]) > 0 and len(query_idx)>= 20 and (4 in list(df.loc[query_idx][0]) or 5 in list(df.loc[query_idx][0])):
            chosen_idx = np.random.choice(query_idx, 20, replace = False)
            # we are only chosing at most 3 relevant docs and there should be at least one relevant doc
            while np.sum(df.loc[chosen_idx][0]) == 0 or (4 not in list(df.loc[chosen_idx][0]) and 5 in list(df.loc[chosen_idx][0])):# or np.sum(df.loc[chosen_idx][0]) > 3:
                print('sampling', query_id, 'again which has', np.sum(np.sum(df.loc[query_idx][0])), 'relevant docs.')
                chosen_idx = np.random.choice(query_idx, 20, replace = False)
            keep_idx.extend(list(chosen_idx))
#         else:
#             if np.sum(np.sum(df.loc[query_idx][0])) == 0:
#                 print('qid', query_id, 'has no relevant docs!')
#             if len(query_idx) == 20:
#                 print('qid', query_id, 'did not have enough docs!')
    sampled_df = df.loc[keep_idx]
    
    if SS is None:
        SS = StandardScaler().fit(sampled_df[cont_feats])
    else:
        print('SS supplied')
    sampled_df[cont_feats] = SS.transform(sampled_df[cont_feats])
    
    # based on https://arxiv.org/pdf/1911.08054.pdf, we create binary groups based on this quality score
    # info about the quality score found here: https://www.microsoft.com/en-us/research/project/mslr/
    if cut_off is None:
        cut_off = np.percentile(sampled_df[134], 40, interpolation='lower')
    sampled_df['binary_group'] = sampled_df[134] >= cut_off
    # we do not train on 0 = relevance, 1 = query_id, and we throw out 133 = first quality score because we use second quality score
    valid_columns = [i for i in np.arange(138) if i not in [0,1,133]]
    
    X = np.array(sampled_df[valid_columns])
    y = np.array(sampled_df[0])
    group = np.array(sampled_df['binary_group'])

    np.save('data/X_{}.npy'.format(name), X)
    np.save('data/y_{}.npy'.format(name), y)
    np.save('data/group_{}.npy'.format(name), group)
    return SS, cut_off
    

In [6]:
SS, cut_off = preprocess(train.copy(), 'train', SS = None)

sampling 3082.0 again which has 9 relevant docs.
sampling 10879.0 again which has 15 relevant docs.
sampling 20641.0 again which has 7 relevant docs.
sampling 20641.0 again which has 7 relevant docs.
sampling 20641.0 again which has 7 relevant docs.
sampling 20641.0 again which has 7 relevant docs.
sampling 20641.0 again which has 7 relevant docs.
sampling 20641.0 again which has 7 relevant docs.
sampling 28102.0 again which has 11 relevant docs.


In [8]:
SS, cut_off = preprocess(test.copy(), 'test', SS = None)

sampling 24643.0 again which has 7 relevant docs.
sampling 1708.0 again which has 32 relevant docs.


In [None]:
SS, cut_off = preprocess(valid.copy(valid), 'test', SS = None)