### This file produces raw data for Table 2.

In [1]:
import os
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None

In [2]:
# Directory the notebook is run from; the data folder is resolved against it.
current_dir = os.getcwd()

# One random seed per train/test split.
split_rs = [
    290, 150, 266, 78, 148,
    133, 155, 135, 178, 241,
]

# Data source and hyperparameters.
path = f'{current_dir}/Data14/'
template_length = query_length = 14

# Accumulators filled by the per-seed loop: the first four receive one entry
# per seed, the last one collects every test-well filename across all seeds.
number_of_pads = []
number_of_wells = []
number_of_stages = []
number_of_queries = []
all_test_well_names = []

# For every seed: shuffle the pads, assign whole pads to training until the
# training set holds at least TRAIN_WELL_TARGET wells, then tally the pads,
# wells, stages and queries of the remaining (test) pads.

# Minimum number of training wells. The pad that crosses this threshold is
# still assigned to training.
TRAIN_WELL_TARGET = 300
# Number of trailing filename characters stripped to obtain the pad name.
# NOTE(review): presumably a well index plus the file extension -- confirm
# against the naming scheme used in Data14.
PAD_SUFFIX_LENGTH = 7

# The directory content does not depend on the seed, so list it and build the
# pad table once instead of once per seed.
files = sorted(os.listdir(path))
pads = [elt[:-PAD_SUFFIX_LENGTH] for elt in files]
files_df = pd.DataFrame({'filename':files, 'pad':pads})

num_wells_in_pad_df = pd.DataFrame(files_df['pad'].value_counts()).reset_index()
num_wells_in_pad_df.columns = ['pad', 'count']
unique_pads = np.unique(pads)
unique_pads_df = pd.DataFrame({'pad':unique_pads})
unique_pads_df = pd.merge(unique_pads_df, num_wells_in_pad_df, on='pad')

# Files grouped by pad (each group keeps the sorted file order), so a pad's
# wells can be looked up directly instead of rescanning every file per pad.
files_by_pad = {}
for file in files:
    files_by_pad.setdefault(file[:-PAD_SUFFIX_LENGTH], []).append(file)

# filename -> (number of 'reopening' rows, number of queries). A well can be
# in the test set of several seeds; its workbook is read only once.
well_stats_cache = {}

for rs in split_rs:
    print('Calculating seed', rs)

    ## Splitting data
    # DataFrame.sample draws from NumPy's global RNG when no random_state is
    # passed, so seeding it here makes the shuffle reproducible per seed.
    np.random.seed(rs)
    unique_pads_df_shuffled = unique_pads_df.sample(frac=1).reset_index(drop=True)

    # Index of the first shuffled pad at which the running well count reaches
    # the target. If the target is never reached, every pad goes to training
    # and the test set is empty (with no pads at all this is -1 and both sets
    # are empty).
    cumulative_wells = unique_pads_df_shuffled['count'].cumsum().to_numpy()
    reached_target = np.flatnonzero(cumulative_wells >= TRAIN_WELL_TARGET)
    if reached_target.size:
        end_of_training = int(reached_target[0])
    else:
        end_of_training = len(unique_pads_df_shuffled) - 1

    # Pads up to and including end_of_training are training pads, the rest are
    # test pads; whole pads are never split between the two sets.
    train_files_shuffled, test_files_shuffled = [], []
    for idx, pad_name in enumerate(unique_pads_df_shuffled['pad'].tolist()):
        destination = train_files_shuffled if idx <= end_of_training else test_files_shuffled
        destination.extend(files_by_pad[pad_name])

    ## Calculating template and test pads.
    number_of_pads.append(
        len({well_name[:-PAD_SUFFIX_LENGTH] for well_name in test_files_shuffled})
    )

    number_of_stages_this_rs, number_of_queries_this_rs = 0, 0
    for well_name in test_files_shuffled:
        if well_name not in well_stats_cache:
            temp_df = pd.read_excel(path+well_name, header = 0, sheet_name = 0)
            # Count matching rows directly: value_counts()['reopening'] raises
            # KeyError for a well that has no 'reopening' row at all.
            number_of_reopenings = int((temp_df['Mark'] == 'reopening').sum())
            # Every 'reopening' row accounts for template_length rows that are
            # not counted as queries.
            well_stats_cache[well_name] = (
                number_of_reopenings,
                len(temp_df) - template_length*number_of_reopenings,
            )
        stages_in_well, queries_in_well = well_stats_cache[well_name]
        number_of_stages_this_rs += stages_in_well
        number_of_queries_this_rs += queries_in_well

    all_test_well_names.extend(test_files_shuffled)
    number_of_wells.append(len(test_files_shuffled))
    number_of_stages.append(number_of_stages_this_rs)
    number_of_queries.append(number_of_queries_this_rs)

Calculating seed 290
Calculating seed 150
Calculating seed 266
Calculating seed 78
Calculating seed 148
Calculating seed 133
Calculating seed 155
Calculating seed 135
Calculating seed 178
Calculating seed 241


In [3]:
# Number of distinct well files that ended up in the test set of at least one seed.
len(set(all_test_well_names))

180

In [4]:
# Collect the per-seed test-set statistics into one table, one row per seed.
stat_columns = {
    'split_rs': split_rs,
    'number_of_pads': number_of_pads,
    'number_of_wells': number_of_wells,
    'number_of_stages': number_of_stages,
    'number_of_queries': number_of_queries,
}
stat = pd.DataFrame(stat_columns)
stat

Unnamed: 0,split_rs,number_of_pads,number_of_wells,number_of_stages,number_of_queries
0,290,6,29,92,8165
1,150,7,34,116,9762
2,266,7,34,134,11316
3,78,6,31,156,10969
4,148,7,33,168,14382
5,133,7,33,243,14569
6,155,7,34,215,15431
7,135,8,32,205,16077
8,178,6,34,263,21268
9,241,6,32,284,28573
