<div align="center"><a href="https://colab.research.google.com/github/deepkapha/EarthScanWebinar/blob/main/notebook/W2W_dataset_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a></div>

In [None]:
import sys 
# Add the parent directory to the import path; the `utils` imports in the next cell
# presumably resolve from there (the package location is not visible in this notebook).
sys.path.append('..')

# 1. Import the necessary libraries

In [None]:
import os
import csv
import gdown
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import matplotlib.colors as mcolors
# from PIL import Image
from os.path import join as pjoin
from utils.well_log_plots import log_plots
from sklearn.preprocessing import LabelEncoder
from utils import preprocessing, plot_utils

In [None]:
# Let pandas show up to 500 columns so wide frames are not elided in cell output.
pd.set_option('display.max_columns', 500)

# CSS4 colour names that also exist, with an `xkcd:` prefix, in matplotlib's XKCD table.
overlap = set(
    filter(lambda css_name: f'xkcd:{css_name}' in mcolors.XKCD_COLORS,
           mcolors.CSS4_COLORS)
)

# 2. Download the dataset from the publicly available Google Drive link

In [None]:
# Google Drive share links for the raw training / test CSV files.
train_data_url = 'https://drive.google.com/file/d/1TkRT5TeX7slPDL20OqYqFLhGX5iK4RHc/view?usp=share_link'
test_data_url = 'https://drive.google.com/file/d/1ooDiCKweUrduyy0Q1P7KIXnl04BQqH0_/view?usp=share_link'

# Local folder that holds the downloaded CSVs; created if it does not exist yet.
data_root = '../raw_data'
os.makedirs(data_root, exist_ok = True)

# Destination file for each split inside `data_root`.
train_path, test_path = (
    pjoin(data_root, csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Download each CSV only when it is not already present locally, so re-running
# the notebook does not hit the network again.
if not os.path.isfile(train_path):
    gdown.download(url=train_data_url, output=train_path, quiet=False, fuzzy=True)

# The test file gets its own existence check: keying this on train_path would skip
# the test download whenever train.csv is already on disk.
if not os.path.isfile(test_path):
    gdown.download(url=test_data_url, output=test_path, quiet=False, fuzzy=True)

# 3. Load and explore the well data

In [None]:
# Positions in `well_train_names` covered by the image-generation loops at the end of
# the notebook: wells start_well .. end_well - 1 (the loops use range(start_well, end_well)).
start_well, end_well = 30, 98

In [None]:
# Load both CSVs through the project loader. The return type is not visible here;
# `.shape`, `.head()` and column access are used on the results below, so these are
# presumably pandas DataFrames — confirm in preprocessing.load_data.
well_train = preprocessing.load_data(train_path)
well_test = preprocessing.load_data(test_path)

In [None]:
# Report the `.shape` of each split.
shape_report = "Shape of Well data:\n\tTraining Well: {}\n\tTesting Well: {}"
print(shape_report.format(well_train.shape, well_test.shape))

# Description of dataset

The provided dataset contains well logs, interpreted lithofacies and lithostratigraphy for 90+ released wells from offshore Norway. Each record holds the well name (WELL), the measured depth, the x, y, z location of the wireline measurement, and the well logs CALI, RDEP, RHOB, DRHO, SGR, GR, RMED, RMIC, NPHI, PEF, RSHA, DTC, SP, BS, ROP, DTS, DCAL, MUDWEIGHT. An explanation of the abbreviations is shown in the figure below.

<center>
<img src="../asset/well_log_abbreviations.png">
</center>

Click [here](https://drive.google.com/drive/folders/1GIkjq4fwgwbiqVQxYwoJnOJWVobZ91pL) to check the results.

The table below lists the display scale (minimum and maximum axis value) used for a few important logs from the raw data.


| Log Name | min_scale | max_scale | description |
|----------|-----------|-----------|-------------|
| CALI     |  6        |   24      |             |
| GR       |  0        |   150     |             |
| SP       |  -150     |   150     |             |
| SGR      |  0        |   150     |             |
| RSHA     |  0.2      |   200     |             |
| RMED     |  0.2      |   200     |             |
| RDEP     |  0.2      |   200     |             |
| RMIC     |  0.2      | 200 ohm-m |             |
| RXO      |  0.2      | 200 ohm-m |             |                 
| ROPA     |   0       |    50     |             |
| ROP      |   0       |    50     |             |
| DTC      |   40       |  240      |             |
| DTS      |   40       |  240      |             |
| NPHI     |   0.05    |  -0.15    |             |
| RHOB     |   0.95    |   2.95    |             |
| PEF      |  0        |10 barns/electron|       |             
| DCAL     |  6        |    24     |             |
| DRHO     |  -0.2     |   1       |             |
|MUDWEIGHT |    0       |  150         |             |
|BS        |     6      |    24       |             |

In [None]:
# Number of wells in each split, as reported by the project helper.
total_training_wells, total_testing_wells = (
    preprocessing.count_well(split) for split in (well_train, well_test)
)

wells_report = 'Total number of wells present in the dataset:\n\tTraining Well: {}\n\tTesting Well: {}'
print(wells_report.format(total_training_wells, total_testing_wells))

In [None]:
# List the distinct values of the WELL column in the training split.
print(f"Unique wells present in the dataset are:\n\t{well_train.WELL.unique()}")

In [None]:
# Well names per split. `well_train_names` is later indexed by integer position
# (well_train_names[j]) in the image-generation loops at the end of the notebook.
well_train_names = preprocessing.get_well_names(well_train)
well_test_names = preprocessing.get_well_names(well_test)

In [None]:
# Wells that occur in both splits — presumably returned as (count, names), judging by
# the target names; confirm against preprocessing.get_overlapping_well.
overlapped_well_count, overlapped_well_name = preprocessing.get_overlapping_well(well_train, well_test)

In [None]:
# Preview the first rows of the training split.
well_train.head()

In [None]:
# Range of the MUDWEIGHT log in the training split; each label matches the
# statistic printed next to it.
print("Max mud weight value is",np.max(well_train["MUDWEIGHT"]))
print("Min mud weight value is",np.min(well_train["MUDWEIGHT"]))

In [None]:
# Preview the first rows of the test split.
well_test.head()

In [None]:
# Peek at one randomly chosen training well. The second argument (2020) is presumably
# a random seed — confirm in preprocessing.get_random_well.
preprocessing.get_random_well(well_train, 2020).head()

In [None]:
# Per-column missing-data summary for each split (presumably a percentage, going by
# the helper's name — confirm in preprocessing.percet_missing_data).
percet_missing_train_data = preprocessing.percet_missing_data(well_train)
percet_missing_test_data = preprocessing.percet_missing_data(well_test)

# Side-by-side bar charts; the titles say which split each panel shows so the
# figure can be read on its own.
_, ax = plt.subplots(1, 2, figsize = (20, 4))
percet_missing_train_data.plot(kind='bar', ax=ax[0], title='Missing data per column: training wells')
percet_missing_test_data.plot(kind='bar', ax=ax[1], title='Missing data per column: test wells')
plt.show()

In [None]:
#preprocessing.remove_column_with_half_of_nan_value(well_train)
#preprocessing.remove_column_with_half_of_nan_value(well_test)

In [None]:
# Collect the GROUP names found in each split. The bare print() between the calls
# only emits a blank line — presumably to separate output printed by the helper
# itself (TODO confirm that group_identification prints).
training_group_names = preprocessing.group_identification(well_train)
print()
testing_group_names = preprocessing.group_identification(well_test)

In [None]:
# Groups not shared between the two splits — presumably returned as (count, names),
# judging by the target names; confirm against preprocessing.get_nonoverlapping_groups.
nonoverlapped_groups_count, nonoverlapped_groups_name = preprocessing.get_nonoverlapping_groups(training_group_names, testing_group_names)

In [None]:
# Report missing GROUP information for each split. Return values are not captured;
# only the last call's value (if any) is shown as the cell output.
preprocessing.missing_group_info(well_train)
preprocessing.missing_group_info(well_test)

In [None]:
# Remove the formation column from both frames. The return values are ignored, so the
# helper has to modify its argument for this cell to have any effect — presumably it
# drops the column in place; confirm in preprocessing.remove_formation_column.
preprocessing.remove_formation_column(well_train)
preprocessing.remove_formation_column(well_test)

In [None]:
# Preview the training split again, after the remove_formation_column call above.
well_train.head()

In [None]:
# Preview the test split again, after the remove_formation_column call above.
well_test.head()

In [None]:
# Peek at another randomly chosen training well; the second argument (2023) is
# presumably a random seed — confirm in preprocessing.get_random_well.
preprocessing.get_random_well(well_train, 2023).head()

In [None]:
# undeviated_wells = preprocessing.get_undeviated_well_info(well_train)

In [None]:
# Pick a random training well (no second argument is passed here, unlike the earlier
# calls) and hand it to the 2D and 3D location plotters from plot_utils.
rand_well = preprocessing.get_random_well(well_train)
plot_utils.plot2Dlocation(rand_well)
plot_utils.plot3Dlocation(rand_well)

In [None]:
# Wells lacking GROUP information, per split. The training result is indexed by
# position ([0]..[3]) in the next cell, so it is presumably a list/array of well
# names — confirm in preprocessing.get_well_with_no_group_info.
well_train_with_missing_group_info = preprocessing.get_well_with_no_group_info(well_train, total_training_wells, well_train_names)
well_test_with_missing_group_info = preprocessing.get_well_with_no_group_info(well_test, total_testing_wells, well_test_names)

In [None]:
# Fill the GROUP column for every training well reported as having no group info.
# Iterating over the list handles any number of such wells (not exactly four), and a
# single `.loc[row_mask, column]` assignment writes into `well_train` itself instead
# of into a temporary obtained through chained indexing.
for well_name in well_train_with_missing_group_info:
    rows_of_well = well_train['WELL'] == well_name
    well_train.loc[rows_of_well, 'GROUP'] = preprocessing.fill_group_na_value(well_train, well_name)

In [None]:
# Re-run the missing-data summary on the training split after the GROUP fill above.
preprocessing.percet_missing_data(well_train)

In [None]:
# Plot the logs of well '15/9-13'.
# NOTE(review): `log_plot` is neither defined nor imported in the cells visible here
# (the import cell only brings in `log_plots`, which is indexed like a collection
# further down). Unless it is defined elsewhere, this cell raises NameError on a
# fresh kernel — confirm which helper was intended.
log_plot(well_train[well_train['WELL'] == '15/9-13'])

In [None]:
def log_plot_image(logs,plotname,txtname,i,patch_height):
    """Render one window of well logs as a 19-track image and save its GROUP labels.

    Rows ``i .. i + patch_height - 1`` (by position) of each log column are drawn
    side by side, one track per column, with the shared y-axis inverted. The figure
    is written to ``plotname`` and the window's GROUP values to ``txtname`` as a
    single CSV row.

    Parameters
    ----------
    logs : pandas.DataFrame
        Rows of one well. Must contain ``GROUP`` and every column named in
        ``tracks`` below.
    plotname : str
        Image path handed to ``savefig``; its directory must already exist.
    txtname : str
        Path of the CSV file that receives the GROUP labels.
    i : int
        Position of the first row of the window; also the first y value plotted.
    patch_height : int
        Number of rows in the window.

    Returns
    -------
    None

    Notes
    -----
    The figure is closed after saving, so it is not displayed inline; this keeps
    memory flat when the function is called once per patch in a long loop.
    """
    # One entry per track, left to right: (column, log-scaled x?, x-limits, flip x?).
    # NOTE(review): the resistivity tracks use a lower limit of 2, while the scale
    # table in the markdown above lists 0.2 — confirm which one is intended.
    tracks = (
        ('CALI',      False, (6, 24),       False),
        ('GR',        False, (0, 150),      False),
        ('SP',        False, (-150, 150),   False),
        ('SGR',       False, (0, 150),      False),
        ('RSHA',      True,  (2, 200),      False),
        ('RMED',      True,  (2, 200),      False),
        ('RDEP',      True,  (2, 200),      False),
        ('RXO',       True,  (2, 200),      False),
        ('RMIC',      True,  (2, 200),      False),
        ('NPHI',      False, (-0.15, 1.05), True),
        ('RHOB',      False, (0.95, 2.95),  True),
        ('PEF',       False, (0, 10),       False),
        ('ROP',       False, (0, 50),       False),
        ('ROPA',      False, (0, 50),       False),
        ('DRHO',      False, (-0.2, 1),     False),
        ('DTC',       False, (40, 240),     True),
        ('DTS',       False, (40, 240),     True),
        ('MUDWEIGHT', False, (0, 150),      False),
        ('BS',        False, (6, 24),       False),
    )

    fig, axes = plt.subplots(1, len(tracks), figsize = (20, 10), sharey = True,
                             gridspec_kw = {'wspace':0, 'hspace':0})

    # The y-axis is shared by all tracks, so one call flips every panel. (Calling it
    # on each of the 19 axes toggles the shared axis 19 times, with the same result.)
    axes[0].invert_yaxis()

    row_positions = list(range(i, i + patch_height))
    for track_ax, (column, log_x, x_limits, flip_x) in zip(axes, tracks):
        draw = track_ax.semilogx if log_x else track_ax.plot
        draw(logs[column].iloc[i:i + patch_height], row_positions, 'b')
        track_ax.set_xlim(*x_limits)
        if flip_x:
            track_ax.invert_xaxis()
        track_ax.axis('off')

    fig.savefig(plotname, bbox_inches ="tight", transparent = False)
    # Release the figure; otherwise every call leaves one more open figure behind.
    plt.close(fig)

    # newline='' is what the csv module expects; without it rows pick up an extra
    # carriage return on Windows.
    with open(txtname, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        # NOTE(review): the last label of the window is dropped here ([:-1]), so the
        # row has patch_height - 1 values, whereas log_plot_image_random and
        # log_plot_image_invert write all patch_height labels — confirm this is intended.
        writer.writerow(logs.GROUP.iloc[i:i + patch_height].iloc[:-1])

In [None]:

def randon_list_generator():
    """Return the integers 0..18 as a list in random order.

    Returns
    -------
    list of int
        A random permutation of ``range(19)``: each value appears exactly once.
    """
    # random.sample draws without replacement, so one call yields a complete
    # permutation — no retry loop or recursion needed.
    return random.sample(range(19), 19)


randomlist = randon_list_generator()
print(randomlist)


In [None]:
def log_plot_image_random(logs,plotname,txtname,i,patch_height,randomlist):
    """Render one window of well logs with the tracks in a caller-supplied order.

    Track ``k`` of the figure is drawn by ``log_plots[randomlist[k]]``. The figure is
    written to ``plotname`` and the GROUP values of rows ``i .. i + patch_height - 1``
    (by position) to ``txtname`` as a single CSV row.

    Parameters
    ----------
    logs : pandas.DataFrame
        Rows of one well; must contain ``GROUP`` plus whatever the ``log_plots``
        entries read.
    plotname : str
        Image path handed to ``savefig``; its directory must already exist.
    txtname : str
        Path of the CSV file that receives the GROUP labels.
    i : int
        Position of the first row of the window.
    patch_height : int
        Number of rows in the window.
    randomlist : sequence of int
        Keys into ``log_plots``, one per track, in left-to-right order.

    Returns
    -------
    None

    Notes
    -----
    The figure is closed after saving, so it is not displayed inline.
    """
    fig, axes = plt.subplots(1,19, figsize = (20, 10), sharey = True, gridspec_kw = {'wspace':0, 'hspace':0})

    # The loop variables are deliberately not called `i`: the window start must stay
    # intact for the label slice below.
    for column, track_id in enumerate(randomlist):
        draw_track = log_plots[track_id]
        # Draw the same well and window that the labels are taken from.
        # NOTE(review): assumes each log_plots entry takes (logs, ax, start, patch_height)
        # — the call previously passed the literals 0 and 700 here; confirm against
        # utils.well_log_plots.
        draw_track(logs, axes[column], i, patch_height)

    fig.savefig(plotname, bbox_inches ="tight", transparent = False)
    # Release the figure; otherwise every call leaves one more open figure behind.
    plt.close(fig)

    # newline='' is what the csv module expects; without it rows pick up an extra
    # carriage return on Windows.
    with open(txtname, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        # one CSV row holding the GROUP label of every row in the window
        writer.writerow(logs.GROUP.iloc[i:i + patch_height])

In [None]:
def log_plot_image_invert(logs,plotname,txtname,i,patch_height):
    """Render one window of well logs as a 19-track image without flipping the y-axis.

    Uses the same tracks and x-limits as ``log_plot_image`` but leaves the shared
    y-axis in matplotlib's default orientation. The figure is written to
    ``plotname`` and the GROUP values of rows ``i .. i + patch_height - 1`` (by
    position) to ``txtname`` as a single CSV row.

    Parameters
    ----------
    logs : pandas.DataFrame
        Rows of one well. Must contain ``GROUP`` and every column named in
        ``tracks`` below.
    plotname : str
        Image path handed to ``savefig``; its directory must already exist.
    txtname : str
        Path of the CSV file that receives the GROUP labels.
    i : int
        Position of the first row of the window; also the first y value plotted.
    patch_height : int
        Number of rows in the window.

    Returns
    -------
    None

    Notes
    -----
    The figure is closed after saving, so it is not displayed inline; this keeps
    memory flat when the function is called once per patch in a long loop.
    """
    # One entry per track, left to right: (column, log-scaled x?, x-limits, flip x?).
    # NOTE(review): the resistivity tracks use a lower limit of 2, while the scale
    # table in the markdown above lists 0.2 — confirm which one is intended.
    tracks = (
        ('CALI',      False, (6, 24),       False),
        ('GR',        False, (0, 150),      False),
        ('SP',        False, (-150, 150),   False),
        ('SGR',       False, (0, 150),      False),
        ('RSHA',      True,  (2, 200),      False),
        ('RMED',      True,  (2, 200),      False),
        ('RDEP',      True,  (2, 200),      False),
        ('RXO',       True,  (2, 200),      False),
        ('RMIC',      True,  (2, 200),      False),
        ('NPHI',      False, (-0.15, 1.05), True),
        ('RHOB',      False, (0.95, 2.95),  True),
        ('PEF',       False, (0, 10),       False),
        ('ROP',       False, (0, 50),       False),
        ('ROPA',      False, (0, 50),       False),
        ('DRHO',      False, (-0.2, 1),     False),
        ('DTC',       False, (40, 240),     True),
        ('DTS',       False, (40, 240),     True),
        ('MUDWEIGHT', False, (0, 150),      False),
        ('BS',        False, (6, 24),       False),
    )

    fig, axes = plt.subplots(1, len(tracks), figsize = (20, 10), sharey = True,
                             gridspec_kw = {'wspace':0, 'hspace':0})

    row_positions = list(range(i, i + patch_height))
    for track_ax, (column, log_x, x_limits, flip_x) in zip(axes, tracks):
        draw = track_ax.semilogx if log_x else track_ax.plot
        draw(logs[column].iloc[i:i + patch_height], row_positions, 'b')
        track_ax.set_xlim(*x_limits)
        if flip_x:
            track_ax.invert_xaxis()
        track_ax.axis('off')

    fig.savefig(plotname, bbox_inches ="tight", transparent = False)
    # Release the figure; otherwise every call leaves one more open figure behind.
    plt.close(fig)

    # newline='' is what the csv module expects; without it rows pick up an extra
    # carriage return on Windows.
    with open(txtname, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        # one CSV row holding the GROUP label of every row in the window
        writer.writerow(logs.GROUP.iloc[i:i + patch_height])

In [None]:
# Generate the baseline patches: for each selected well, slide a 700-row window down
# the well in steps of 350 rows and save one image + one label CSV per step.

# savefig/open do not create folders, so make sure the output folder exists.
os.makedirs("well-images", exist_ok=True)

count = 0
# Never index past the last available well name.
for j in range(start_well, min(end_well, len(well_train_names))):
    # Rows of well j itself: file names and window positions are derived from this
    # well, so the plotted data must come from it too. Selected once per well.
    well_logs = well_train[well_train['WELL'] == well_train_names[j]]
    well_shape = len(well_logs)
    # A well shorter than 700 rows yields one shorter patch instead of a negative start.
    window = min(700, well_shape)

    for i in tqdm(range(0, well_shape, 350)):
        plotname = "well-images/well_"+str(j)+"_"+str(i)+".jpg"
        txtname = "well-images/well_"+str(j)+"_"+str(i)+".csv"

        # A window that would run past the end of the well is shifted up so that it
        # ends exactly on the last row.
        window_start = min(i, well_shape - window)
        log_plot_image(well_logs, plotname, txtname, window_start, window)

        count = count+1
print(count)

In [None]:
# Generate the shuffled-track patches: same 700-row window / 350-row stride as the
# baseline loop, but every patch gets a fresh random track order.

# savefig/open do not create folders, so make sure the output folder exists.
os.makedirs("well-images", exist_ok=True)

count = 0
# Never index past the last available well name.
for j in range(start_well, min(end_well, len(well_train_names))):
    # Rows of well j itself: file names and window positions are derived from this
    # well, so the plotted data must come from it too. Selected once per well.
    well_logs = well_train[well_train['WELL'] == well_train_names[j]]
    well_shape = len(well_logs)
    # A well shorter than 700 rows yields one shorter patch instead of a negative start.
    window = min(700, well_shape)

    for i in tqdm(range(0, well_shape, 350)):
        plotname = "well-images/random_well_"+str(j)+"_"+str(i)+".jpg"
        txtname = "well-images/random_well_"+str(j)+"_"+str(i)+".csv"
        randomlist = randon_list_generator()

        # A window that would run past the end of the well is shifted up so that it
        # ends exactly on the last row.
        window_start = min(i, well_shape - window)
        log_plot_image_random(well_logs, plotname, txtname, window_start, window, randomlist)

        count = count+1
print(count)

In [None]:
# Generate the "invert" patches (drawn by log_plot_image_invert): same 700-row
# window / 350-row stride as the baseline loop.

# savefig/open do not create folders, so make sure the output folder exists.
os.makedirs("well-images", exist_ok=True)

count = 0
# Never index past the last available well name.
for j in range(start_well, min(end_well, len(well_train_names))):
    # Rows of well j itself: file names and window positions are derived from this
    # well, so the plotted data must come from it too. Selected once per well.
    well_logs = well_train[well_train['WELL'] == well_train_names[j]]
    well_shape = len(well_logs)
    # A well shorter than 700 rows yields one shorter patch instead of a negative start.
    window = min(700, well_shape)

    for i in tqdm(range(0, well_shape, 350)):
        plotname = "well-images/invert_well_"+str(j)+"_"+str(i)+".jpg"
        txtname = "well-images/invert_well_"+str(j)+"_"+str(i)+".csv"

        # A window that would run past the end of the well is shifted up so that it
        # ends exactly on the last row.
        window_start = min(i, well_shape - window)
        log_plot_image_invert(well_logs, plotname, txtname, window_start, window)

        count = count+1
print(count)