In [0]:
#File: prepare_EDR.py
#Name: Eric Schumacker
#Description: class to prepare EDR Data. 
#
#Use this class to abstract away all the data processing tasks. Moving into the modeling phase, 
#we want to take the training data as a given; and hide all the data processing through
#good encapsulation.

#This will eventually be placed in prepare_EDR.py; currently developing in ipynb for ease of
#data visualization.

import numpy as np
import pandas as pd
import csv
from sklearn.preprocessing import StandardScaler  #For clustering
from sklearn.preprocessing import MinMaxScaler    #For NLP
from sklearn.model_selection import train_test_split

class PrepareEDR:
  """Load a raw EDR CSV file and expose cleaned arrays for clustering and ROP modeling.

  Column positions are hard-coded throughout, so the CSV is assumed to use the
  column order seen during development: Hole Depth, Bit Depth, Rotary RPM,
  Weight on Bit, Total Pump Output, Block Height, Differential Pressure,
  Hook Load, On Bottom ROP, On Bottom Hours, Standpipe Pressure,
  Convertible Torque -- TODO confirm before using a differently laid out file.

  Attributes created by __init__:
    X, headers        -- cleaned rows with the depth columns kept, the depth
                         ratio prepended and a "Time Sequence" column appended.
    X_dr, headers_dr  -- the modeling matrix: X without Hole Depth / Bit Depth
                         (and without Time Sequence), with Block Height
                         replaced by a -1 / 0 / +1 block-movement indicator.
  X and X_dr always hold the same rows in the same order.
  """

  #Rows whose Weight on Bit (X_dr column 2) exceeds this are dropped as outliers.
  WEIGHT_ON_BIT_MAX = 50
  #Rows whose Differential Pressure (X_dr column 5) exceeds this are dropped as outliers.
  DIFF_PRESSURE_MAX = 2400
  #Smallest absolute row-to-row change in Block Height that counts as movement.
  BLOCK_MOVE_THRESHOLD = 1
  #Position of the On Bottom ROP column (the prediction label) inside X_dr.
  ROP_COLUMN = 7

  def __init__(self, path=None):
    """Read the CSV at `path` and build X / X_dr.

    Args:
      path: location of the raw EDR CSV file (first row holds the headers,
        every other cell must parse as a float).

    Raises:
      ValueError: if `path` is None. (The data cannot be built without a
        file; previously this surfaced as an unrelated NameError.)
    """
    if path is None:
      raise ValueError("PrepareEDR requires the path to a raw EDR CSV file")

    #input raw data.
    with open(path, 'r', newline='') as f:
      reader = csv.reader(f, delimiter=',')
      self.headers = next(reader)
      data = np.array(list(reader)).astype(float)

    #Add a feature (Bit depth) / (Hole Depth) that serves as a ratio or a
    #'locator' of where we are during the trip. It becomes column 0.
    depth_ratio = data[:, 1] / data[:, 0]
    self.X = np.column_stack((depth_ratio, data))
    self.headers.insert(0, "Bit Depth / Hole Depth")

    #Clamp negative values of feature 7 (Differential Pressure) to zero.
    self.X[self.X[:, 7] < 0, 7] = 0

    #Delete feature 10 (On Bottom Hours), as we don't want to keep it at all.
    self.X = np.delete(self.X, [10], 1)
    self.headers = [x for x in self.headers if x not in ["On Bottom Hours"]]

    #From here on the original data (X) is kept separately from the training
    #data (X_dr), so results can later be mapped back onto the original rows.
    self.X_dr = np.delete(self.X, [1, 2], 1)
    self.headers_dr = [x for x in self.headers if x not in ["Hole Depth", "Bit Depth"]]

    #Transform Block Height (X column 6) into Block Movement: the sign of the
    #row-to-row change, treating changes smaller than the threshold as 0.
    #The first row has no predecessor and is labelled 0. A change of exactly
    #+/- threshold counts as movement in that direction.
    dh = np.zeros(self.X.shape[0])
    dh[1:] = np.diff(self.X[:, 6])
    block_movement = np.zeros(self.X.shape[0])
    block_movement[dh >= self.BLOCK_MOVE_THRESHOLD] = 1
    block_movement[dh <= -self.BLOCK_MOVE_THRESHOLD] = -1
    self.X_dr[:, 4] = block_movement
    self.headers_dr[4] = "Block Movement: + / - / 0"

    #Remove outliers. The two criteria stay separate for readability, but the
    #rows are dropped with ONE boolean mask: deleting by position twice would
    #apply the second set of positions to an array whose rows already shifted.
    high_weight = self.X_dr[:, 2] > self.WEIGHT_ON_BIT_MAX
    diff_spike = self.X_dr[:, 5] > self.DIFF_PRESSURE_MAX
    keep = ~(high_weight | diff_spike)
    self.X = self.X[keep]
    self.X_dr = self.X_dr[keep]

    #Append a 1-based running row counter to the original data only.
    time_estimate = np.arange(1, len(self.X) + 1)
    self.X = np.column_stack((self.X, time_estimate))
    self.headers.append("Time Sequence")

  def getOriginalData(self):
    """Return (X_dr, headers_dr): the unscaled modeling matrix and its column names."""
    return (self.X_dr, self.headers_dr)

  def getDrDataFrame(self):
    """Return the unscaled modeling matrix X_dr as a pandas DataFrame."""
    return pd.DataFrame(data = self.X_dr, columns = self.headers_dr)

  def getOriginalDF(self):
    """Return X (depth columns and Time Sequence included) as a pandas DataFrame."""
    print("Executed")
    return pd.DataFrame(data = self.X, columns = self.headers)

  def getClusteringTrainingData(self):
    """Standardize X_dr for clustering.

    Returns:
      (X_train_dr, headers_dr, scaler) where scaler is the StandardScaler
      fitted on all of X_dr, returned so callers can inverse-transform.
    """
    X_dr_scaler = StandardScaler()
    X_train_dr = X_dr_scaler.fit_transform(self.X_dr)
    return (X_train_dr, self.headers_dr, X_dr_scaler)

  def _lateralLabelsLast(self, start, end):
    """Return the X_dr rows whose Bit Depth (X column 2) lies in [start, end],
    with the ROP column moved to the last position."""
    bit_depth = self.X[:, 2]
    out_of_range = (bit_depth < start) | (bit_depth > end)
    lateralData = self.X_dr[~out_of_range]

    #Move ROP to the end so that inverse transforms are easier later.
    features = np.delete(lateralData, [self.ROP_COLUMN], axis=1)
    return np.column_stack((features, lateralData[:, self.ROP_COLUMN]))

  def _scaleAndSplit(self, train_set, test_set):
    """Min-max scale both sets with a scaler fitted on train_set only, then
    split off the last column (ROP) as the label.

    Returns (X_train, y_train, X_test, y_test, scaler).
    """
    lateral_data_scaler = MinMaxScaler()  #Default is a range of [0,1]
    train_set_std = lateral_data_scaler.fit_transform(train_set)
    test_set_std = lateral_data_scaler.transform(test_set)

    y_train = train_set_std[:, -1]
    y_test = test_set_std[:, -1]
    X_train = np.delete(train_set_std, [-1], 1)
    X_test = np.delete(test_set_std, [-1], 1)
    return (X_train, y_train, X_test, y_test, lateral_data_scaler)

  #Method: getLateralData:
  #This function returns drilling data to/from a target depth.
  #In practice, this would be replaced by the real-time acquisition of data.
  #We will train the model on a given interval of data (for instance, the first 1,000 feet of the
  #lateral) as a recommendation system for target drilling parameters to maintain.
  def getLateralData(self, start, end):
    """Build a shuffled, min-max scaled train/test split for a depth interval.

    Args:
      start, end: inclusive Bit Depth bounds of the interval to keep.

    Returns:
      (X_train, y_train, X_test, y_test, X_headers, y_header, scaler).
      The split is 80/20 with a fixed random_state of 42. The scaler is fitted
      on the training rows only, over the feature columns followed by ROP, and
      is returned so callers can inverse-transform.
    """
    lateralData_labelsLast = self._lateralLabelsLast(start, end)

    #Split first, then scale, so the scaler never sees the test rows.
    train_set, test_set = train_test_split(lateralData_labelsLast, test_size=0.2, random_state = 42)
    X_train, y_train, X_test, y_test, lateral_data_scaler = self._scaleAndSplit(train_set, test_set)

    #No further dimensionality reduction was performed in order to maintain uniformity
    #with the initial clustering data.
    #Headers come from this instance, not from any notebook-level variable.
    return (X_train, y_train, X_test, y_test, np.delete(self.headers_dr, [self.ROP_COLUMN]),
            ['ROP'], lateral_data_scaler)

  #helper function to create a lookback for time series prediction.
  def create_dataset(self, X, y, time_steps=10):
    """Turn row-wise data into overlapping look-back windows.

    Window i holds X[i : i + time_steps] and is paired with y[i + time_steps].

    Returns:
      (Xs, ys) as numpy arrays; Xs has one entry per window. Both are empty
      when X has no more than `time_steps` rows.
    """
    n_windows = len(X) - time_steps
    Xs = [X[i:(i + time_steps)] for i in range(n_windows)]
    ys = [y[i + time_steps] for i in range(n_windows)]
    return np.array(Xs), np.array(ys)

  #Also get data over a depth interval, but prepare it for use with
  #Keras time series layers.
  def getLateralTimeSeriesData(self, start, end, train_fraction=0.8, time_steps=20):
    """Build a chronological, min-max scaled, windowed split for a depth interval.

    The leading `train_fraction` of the rows is used for training and the
    remainder for testing (no shuffling). The default is the documented 80/20
    split; the code previously used the first 20% for training.

    Args:
      start, end: inclusive Bit Depth bounds of the interval to keep.
      train_fraction: share of rows, taken from the start, used for training.
      time_steps: look-back length passed to create_dataset.

    Returns:
      (X_train_windows, y_train_windows, X_test_windows, y_test_windows,
       X_test, X_headers, y_header, scaler) -- eight values. The window
      arrays are shaped [samples, time_steps, n_features]. X_test is the
      scaled, un-windowed test feature matrix. The scaler is fitted on the
      training rows only.
    """
    lateralData_labelsLast = self._lateralLabelsLast(start, end)

    #Chronological split: training rows first, test rows after.
    split_at = int(len(lateralData_labelsLast) * train_fraction)
    train_set = lateralData_labelsLast[:split_at]
    test_set = lateralData_labelsLast[split_at:]

    X_train, y_train, X_test, y_test, lateral_data_scaler = self._scaleAndSplit(train_set, test_set)

    #Reshape the X and y arrays to have their "lookbacks"; this is the
    #[samples, time_steps back, n_features] layout used for LSTM input.
    X_train_reshape, y_train_reshape = self.create_dataset(X_train, y_train, time_steps)
    X_test_reshape, y_test_reshape = self.create_dataset(X_test, y_test, time_steps)

    #No further dimensionality reduction was performed in order to maintain uniformity
    #with the initial clustering data.
    #The un-windowed test features are passed along as well so the client can
    #compare results without rebuilding them from the windows.
    return (X_train_reshape, y_train_reshape, X_test_reshape, y_test_reshape,
            X_test,
            np.delete(self.headers_dr, [self.ROP_COLUMN]), ['ROP'], lateral_data_scaler)



      
      




In [0]:
####Below this is testing of the class methods, before the actual implementation in
#clustering.ipynb and MLP.ipynb.

#Colab-only: mount Google Drive so the well CSV can be read from it.
#The mount must complete before PrepareEDR opens the file.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
test_data = PrepareEDR("/content/gdrive/My Drive/COMP 642/proj/well_1.csv")

#Despite its name, getOriginalData returns the reduced modeling matrix (X_dr)
#and its headers, not the full data with the depth columns.
orig_data, orig_headers = test_data.getOriginalData()

Mounted at /content/gdrive/




In [0]:
#Sanity check: modeling matrix, its headers, and the row/column counts of
#both the modeling matrix and the full data (one item per line).
print(orig_data, orig_headers, test_data.X_dr.shape, test_data.X.shape, sep="\n")

[[7.60396040e-02 5.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [7.60396040e-02 5.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [7.60396040e-02 5.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [9.99026204e-01 8.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  8.54400000e+02 0.00000000e+00]
 [9.99026204e-01 8.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  5.05610000e+02 0.00000000e+00]
 [9.99026204e-01 8.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  1.29590000e+02 0.00000000e+00]]
['Bit Depth / Hole Depth', 'Rotary RPM', 'Weight on Bit', 'Total Pump Output', 'Block Movement: + / - / 0', 'Differential Pressure', 'Hook Load', 'On Bottom ROP', 'Standpipe Pressure', 'Convertible Torque']
(28583, 10)
(28583, 13)


In [0]:
#Display the full cleaned data (depth columns and Time Sequence included) as a DataFrame.
test_data.getOriginalDF()

Executed


Unnamed: 0,Bit Depth / Hole Depth,Hole Depth,Bit Depth,Rotary RPM,Weight on Bit,Total Pump Output,Block Height,Differential Pressure,Hook Load,On Bottom ROP,Standpipe Pressure,Convertible Torque,Time Sequence
0,0.076040,2525.0,192.0,0.05,0.0,0.00,12.0,0.00,46.8,0.0,0.00,0.0,1.0
1,0.076040,2525.0,192.0,0.05,0.0,0.00,33.8,0.00,48.4,0.0,0.00,0.0,2.0
2,0.076040,2525.0,192.0,0.05,0.0,0.00,39.4,0.00,48.9,0.0,0.00,0.0,3.0
3,0.076040,2525.0,192.0,0.05,0.0,0.00,39.1,0.00,55.5,0.0,0.00,0.0,4.0
4,0.076040,2525.0,192.0,0.05,0.0,0.00,30.4,0.00,53.1,0.0,0.00,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28578,0.999026,21565.1,21544.1,0.08,0.0,328.19,47.0,233.41,47.8,0.0,3432.08,0.0,28579.0
28579,0.999026,21565.1,21544.1,0.08,0.0,107.91,47.0,0.00,47.8,0.0,2693.37,0.0,28580.0
28580,0.999026,21565.1,21544.1,0.08,0.0,0.00,47.0,0.00,47.8,0.0,854.40,0.0,28581.0
28581,0.999026,21565.1,21544.1,0.08,0.0,0.00,47.0,0.00,47.8,0.0,505.61,0.0,28582.0


In [0]:
#Shuffled, min-max scaled train/test split for bit depths between 13000 and 15000.
(X_train, y_train, X_test, y_test,
 headers_X, headers_y, data_scaler) = test_data.getLateralData(start=13000, end=15000)

In [0]:
#Inspect the training features returned by getLateralData (already min-max scaled).
pd.DataFrame(X_train, columns=headers_X)

Unnamed: 0,Bit Depth / Hole Depth,Rotary RPM,Weight on Bit,Total Pump Output,Block Movement: + / - / 0,Differential Pressure,Hook Load,Standpipe Pressure,Convertible Torque
0,1.000000,0.447274,0.527233,0.959577,0.5,0.139383,0.597836,0.815550,0.352963
1,1.000000,0.405846,0.366013,0.961277,0.5,0.046636,0.569838,0.811857,0.282137
2,1.000000,0.980269,0.760349,0.957904,0.0,0.777070,0.544066,0.967077,0.778672
3,1.000000,0.500327,0.305011,0.959604,0.0,0.231672,0.660834,0.836271,0.543863
4,0.999773,0.029147,0.000000,0.000000,1.0,0.000000,0.018772,0.000000,0.115532
...,...,...,...,...,...,...,...,...,...
946,1.000000,0.405846,0.442266,0.959604,0.5,0.119701,0.572065,0.818266,0.261024
947,1.000000,0.406091,0.370370,0.959604,0.5,0.161605,0.587973,0.821984,0.287052
948,0.009622,0.000000,0.000000,0.000000,0.5,0.000000,0.050270,0.000000,0.000000
949,1.000000,0.406091,0.370370,0.961277,0.5,0.122038,0.583201,0.821295,0.355210


In [0]:
#Scaled ROP labels for the training rows.
pd.DataFrame(y_train, columns=headers_y)

Unnamed: 0,ROP
0,0.112360
1,0.044134
2,0.799578
3,0.270631
4,0.000000
...,...
946,0.078392
947,0.041861
948,0.000000
949,0.034692


In [0]:
#Scaled test features (scaled with the scaler fitted on the training rows).
pd.DataFrame(X_test, columns=headers_X)

Unnamed: 0,Bit Depth / Hole Depth,Rotary RPM,Weight on Bit,Total Pump Output,Block Movement: + / - / 0,Differential Pressure,Hook Load,Standpipe Pressure,Convertible Torque
0,1.000000,0.447765,0.529412,0.957904,0.5,0.174758,0.594018,0.823493,0.365509
1,0.105886,0.000000,0.000000,0.000000,0.5,0.000000,0.028635,0.000000,0.000000
2,0.999726,0.000246,0.000000,0.000000,1.0,0.000000,0.012409,0.000000,0.000000
3,1.000000,0.446864,0.555556,0.952858,0.5,0.157697,0.597200,0.806804,0.287567
4,1.000000,0.405846,0.200436,0.957904,0.5,0.143561,0.574292,0.811722,0.292154
...,...,...,...,...,...,...,...,...,...
233,1.000000,0.980269,0.607843,0.957904,0.0,0.669794,0.676106,0.942992,0.617077
234,0.999653,0.168823,0.000000,0.000000,1.0,0.000000,0.013999,0.000000,0.116328
235,1.000000,0.501474,0.342048,0.954558,0.5,0.104444,0.668470,0.800241,0.575508
236,0.999865,0.406091,0.050109,0.961277,0.5,0.000000,0.651925,0.781802,0.290048


In [0]:
#Scaled ROP labels for the test rows.
pd.DataFrame(y_test, columns=headers_y)

Unnamed: 0,ROP
0,0.132981
1,0.000000
2,0.000000
3,0.125811
4,0.063453
...,...
233,0.522129
234,0.000000
235,0.086264
236,0.000000


In [0]:
#Now, check the time series data.
#getLateralTimeSeriesData returns eight values (the un-windowed, scaled test
#features come fifth), so all eight must be unpacked; unpacking only seven
#raises "ValueError: too many values to unpack".

(X_train_ts, y_train_ts, X_test_ts, y_test_ts, X_test_flat,
 headers_X, headers_y, data_scaler) = test_data.getLateralTimeSeriesData(13000, 15000)