# Construct Dataset with Negative Samples

The ratio of negative candidates to positive samples is set with the variable **NEGATIVE_SCALE**; the sampled candidates are written to the .csv files in ./data/csv/.

The final dataset-generation step has another variable, **NEGATIVE_EXPORT_SCALE**, which determines how many of those negative candidates are exported per positive sample.


In [1]:
'''
    Initialize data frames with .csv files
'''
import pandas as pd
import tqdm

# All input tables live in one folder; build each path from it.
csv_folder = './data/csv/'
document_csv_file = f'{csv_folder}documents.csv'
training_csv_file = f'{csv_folder}training.csv'
validation_csv_file = f'{csv_folder}validation.csv'

# Load the three tables in the same order as their paths are listed.
document_pd, training_pd, validation_pd = (
    pd.read_csv(csv_path)
    for csv_path in (document_csv_file, training_csv_file, validation_csv_file)
)

# Ids of every known document, used later as the pool for negative sampling
doc_id_list = list(document_pd['doc_id'])

In [2]:
'''
    Construct dataset with negative candidates, save to data frames
'''
# Number of negative candidates sampled per positive sample
# (each query gets num_positive * NEGATIVE_SCALE candidates)
NEGATIVE_SCALE = 200

'''
    Return list of negative samples
'''
import random
# Largest valid index into doc_id_list; random.randint includes both of its
# bounds, hence the "- 1"
idx_range = len(doc_id_list) - 1

def get_negative_ids(ids, num_new, candidates=None):
    """Randomly draw distinct document ids that are not already taken.

    Args:
        ids: Ids that must not be returned (e.g. the positive ids of a
            query). The caller's collection is never modified.
        num_new: Number of distinct ids to draw.
        candidates: Sequence of ids to draw from. Defaults to the
            module-level ``doc_id_list``.

    Returns:
        A list of ``num_new`` distinct ids, none of which occur in ``ids``.
        An empty list when ``num_new`` is zero or negative.

    Raises:
        ValueError: If ``candidates`` does not contain ``num_new`` distinct
            ids outside ``ids`` (instead of looping forever).
    """
    pool = doc_id_list if candidates is None else candidates
    if num_new <= 0:
        return []
    if len(pool) == 0:
        raise ValueError('cannot draw negative ids from an empty candidate pool')

    # A set gives O(1) membership tests; a list would be rescanned on every
    # draw and grows with each accepted id.
    taken = set(ids)
    new_ids = []
    last_index = len(pool) - 1
    # Consecutive rejected draws tolerated before switching to an exact scan.
    max_misses = 1000
    misses = 0

    while len(new_ids) < num_new:
        candidate = pool[random.randint(0, last_index)]
        if candidate not in taken:
            taken.add(candidate)
            new_ids.append(candidate)
            misses = 0
            continue

        misses += 1
        if misses < max_misses:
            continue

        # Rejection sampling has stalled: the pool is (nearly) exhausted.
        # Enumerate what is really left so we either finish or fail loudly.
        remaining = [c for c in dict.fromkeys(pool) if c not in taken]
        missing = num_new - len(new_ids)
        if missing > len(remaining):
            raise ValueError(
                'requested %d negative ids but only %d distinct candidates '
                'are available' % (num_new, len(new_ids) + len(remaining))
            )
        new_ids.extend(random.sample(remaining, missing))
        break

    return new_ids

def get_df_with_negative(df):
    """Add a ``negative_label`` column of sampled negative document ids.

    For every row, the space-separated ids in ``query_label`` are parsed and
    ``NEGATIVE_SCALE`` times as many negative ids are drawn with
    ``get_negative_ids``. The drawn ids are stored as one space-separated
    string per row.

    Args:
        df: Data frame with a ``query_label`` column of space-separated
            integer ids. It is modified in place.

    Returns:
        The same ``df`` object, now carrying the ``negative_label`` column.
    """
    negative_labels = []
    for label in df['query_label']:
        positive_ids = [int(token) for token in label.split()]
        negative_ids = get_negative_ids(
            positive_ids, len(positive_ids) * NEGATIVE_SCALE)
        negative_labels.append(' '.join(str(doc_id) for doc_id in negative_ids))

    # Assign the whole column at once: this lines values up by row position,
    # so it stays correct when the frame's index is not 0..n-1 (writing
    # df.loc[counter, ...] row by row would hit the wrong rows or add new ones).
    df['negative_label'] = negative_labels
    return df

# Attach the sampled negative candidates and write both tables back in place.
# index=False keeps the row index out of the files; otherwise every later
# pd.read_csv picks it up as an extra "Unnamed: 0" column, one more per re-run.
training_pd = get_df_with_negative(training_pd)
training_pd.to_csv(training_csv_file, index=False)
validation_pd = get_df_with_negative(validation_pd)
validation_pd.to_csv(validation_csv_file, index=False)

In [4]:
'''
    Export dataset with negative samples times by NEGATIVE_EXPORT_SCALE
    Used for feature engineering
'''
def export_csv(output_file, df, negative_export_scale=None):
    """Write one (relevance, query_id, doc_id) row per query/document pair.

    Positive documents of a query get relevance ``num_positive`` down to 1,
    in the order they are listed in ``query_label``. Negative documents get
    relevance 0; the first ``scale * num_positive`` ids of ``negative_label``
    are exported for each query.

    Args:
        output_file: Path of the CSV file to write.
        df: Data frame with ``query_id`` and ``query_label`` columns, plus a
            ``negative_label`` column when the scale is non-zero. Label
            columns hold space-separated document ids.
        negative_export_scale: Negatives exported per positive sample.
            Defaults to the module-level ``NEGATIVE_EXPORT_SCALE``. With 0
            only positive pairs are written and ``negative_label`` is not
            read.

    Raises:
        IndexError: If a row stores fewer negative candidates than the scale
            asks for.
    """
    scale = (NEGATIVE_EXPORT_SCALE if negative_export_scale is None
             else negative_export_scale)

    # Compare by value: "is not 0" tests object identity, which is not a
    # reliable way to compare integers.
    if scale != 0:
        negative_labels = df['negative_label']
    else:
        # The column may not exist yet; empty strings yield no negatives.
        negative_labels = [''] * len(df)

    relevance_col = []
    query_id_col = []
    doc_id_col = []

    rows = zip(df['query_id'], df['query_label'], negative_labels)
    for query_id, query_label, negative_label in rows:
        positive_ids = query_label.split()
        num_positive = len(positive_ids)

        # Positive samples: relevance counts down from num_positive to 1
        for rank, doc_id in enumerate(positive_ids):
            relevance_col.append(num_positive - rank)
            query_id_col.append(query_id)
            doc_id_col.append(doc_id)

        # Negative samples: relevance is 0
        num_negative = max(scale * num_positive, 0)
        negative_ids = negative_label.split()
        if num_negative > len(negative_ids):
            raise IndexError(
                'query %s needs %d negative ids but only %d candidates are '
                'stored' % (query_id, num_negative, len(negative_ids))
            )
        relevance_col.extend([0] * num_negative)
        query_id_col.extend([query_id] * num_negative)
        doc_id_col.extend(negative_ids[:num_negative])

    export_pd = pd.DataFrame({
        'relevance': relevance_col,
        'query_id': query_id_col,
        'doc_id': doc_id_col,
    })
    export_pd.to_csv(output_file, index=False)

In [8]:
# Export training query/doc pairs.
# export_csv reads NEGATIVE_EXPORT_SCALE from module scope, so set it first.
NEGATIVE_EXPORT_SCALE = 30
csv_folder = './data/csv/'
training_csv_file = f'{csv_folder}training.csv'
# The scale is part of the output name, so exports at different scales coexist.
export_csv_file = f'{csv_folder}export_training_{NEGATIVE_EXPORT_SCALE}.csv'
training_pd = pd.read_csv(training_csv_file)
export_csv(output_file=export_csv_file, df=training_pd)

In [7]:
# Export validation query/doc pairs.
# export_csv reads NEGATIVE_EXPORT_SCALE from module scope, so set it first.
NEGATIVE_EXPORT_SCALE = 30
csv_folder = './data/csv/'
validation_csv_file = f'{csv_folder}validation.csv'
# The scale is part of the output name, so exports at different scales coexist.
export_csv_file = f'{csv_folder}export_validation_{NEGATIVE_EXPORT_SCALE}.csv'
validation_pd = pd.read_csv(validation_csv_file)
export_csv(output_file=export_csv_file, df=validation_pd)

In [16]:
# Display the exported pairs stored at whichever path export_csv_file currently holds
pd.read_csv(export_csv_file)

Unnamed: 0,relevance,query_id,doc_id
0,1,1185869,0
1,0,1185869,2981666
2,0,1185869,1016027
3,0,1185869,4024789
4,0,1185869,4262969
...,...,...,...
993173,0,696607,5367825
993174,0,696607,641284
993175,0,696607,3511449
993176,0,696607,2556105
