In [0]:
#File: prepare_EDR.py
#Name: Eric Schumacker
#Description: class to prepare EDR Data. 
#
#Use this class to abstract away all the data processing tasks. Moving into the modeling phase,
#we want to take the training data as a given and hide all the data processing behind
#good encapsulation.

#This will eventually be placed in prepare_EDR.py; currently developing in ipynb for ease of
#data visualization.

import numpy as np
import pandas as pd
import csv
from sklearn.preprocessing import StandardScaler

class PrepareEDR:
  """Load an EDR CSV export and build the feature matrices used for modeling.

  All processing happens in the constructor. Two parallel views are kept:

  * ``X`` / ``headers``       - the full data set (with the added depth-ratio
    column, without "On Bottom Hours"), so results can be mapped back onto
    the original readings.
  * ``X_dr`` / ``headers_dr`` - the reduced feature set used for training:
    no "Hole Depth" / "Bit Depth", and "Block Height" replaced by a
    -1 / 0 / +1 block-movement indicator.

  Both views always contain the same rows in the same order.

  Columns are addressed by position, so the CSV is assumed to be laid out as:
  Hole Depth, Bit Depth, Rotary RPM, Weight on Bit, Total Pump Output,
  Block Height, Differential Pressure, Hook Load, On Bottom ROP,
  On Bottom Hours, Standpipe Pressure, Convertible Torque.
  """

  # Rows whose "Weight on Bit" exceeds this value are dropped as outliers.
  MAX_WEIGHT_ON_BIT = 50
  # Rows whose "Differential Pressure" exceeds this value are dropped as spikes.
  MAX_DIFF_PRESSURE = 2400
  # Smallest absolute row-to-row change in "Block Height" that counts as movement.
  BLOCK_MOVE_THRESHOLD = 1

  def __init__(self, path=None):
    """Read the CSV at ``path`` and run the whole preparation pipeline.

    Args:
      path: location of the comma-separated EDR file; the first row must
        hold the column headers and every other cell must parse as a float.

    Raises:
      ValueError: if ``path`` is None (there is nothing to prepare).
    """
    if path is None:
      # Previously this fell through and failed with a NameError on data_input.
      raise ValueError("PrepareEDR requires the path to an EDR CSV file.")

    #input raw data.
    with open(path, 'r') as f:
      reader = csv.reader(f, delimiter=',')
      self.headers = next(reader)
      data_input = np.array(list(reader))
    data = data_input.astype(float)

    #Select features (from experimentation and final feature decisions in preprocessing.ipynb)

    #Add a feature (Bit Depth) / (Hole Depth) that serves as a ratio or a 'locator' of where we
    #are during the trip. It becomes column 0, shifting every raw column right by one.
    depth_ratio = data[:, 1] / data[:, 0]
    self.X = np.hstack((depth_ratio[:, np.newaxis], data))
    self.headers.insert(0, "Bit Depth / Hole Depth")

    #Clamp all negative values of feature 7 (Differential Pressure) to zero.
    self.X[self.X[:, 7] < 0, 7] = 0

    #Delete feature 10 (On Bottom Hours), as we don't want to keep that feature at all.
    self.X = np.delete(self.X, [10], 1)
    self.headers = [h for h in self.headers if h != "On Bottom Hours"]

    #Note - from here, we want to keep the original data and separately build the training data.
    #That way we can attach the classifications to the original data and make sense of them.

    #Delete features 1 and 2 (Hole Depth, Bit Depth) from the training data.
    self.X_dr = np.delete(self.X, [1, 2], 1)
    self.headers_dr = [h for h in self.headers if h not in ("Hole Depth", "Bit Depth")]

    #Transform the Block Height feature (column 6 of X, column 4 of X_dr) into Block Movement.
    #The first row has no predecessor, so its change is defined as 0.
    n_rows = self.X.shape[0]
    dh = np.zeros(n_rows)
    dh[1:] = np.diff(self.X[:, 6])
    block_movement = np.zeros(n_rows)
    #Both bounds are inclusive: a change of exactly +threshold is upward movement. (The old
    #if/elif chain sent it to the final else branch and labelled it -1.) A NaN change
    #matches neither comparison and therefore stays 0.
    block_movement[dh >= self.BLOCK_MOVE_THRESHOLD] = 1
    block_movement[dh <= -self.BLOCK_MOVE_THRESHOLD] = -1

    self.X_dr[:, 4] = block_movement
    self.headers_dr[4] = "Block Movement: + / - / 0"

    #Remove outliers. Keep the two criteria as separately named masks for readability,
    #but apply them in a single pass: deleting one set first would shift the row numbers
    #and make the second set of indices point at the wrong rows.
    highWeightOutliers = self.X_dr[:, 2] > self.MAX_WEIGHT_ON_BIT
    diffSpikeOutliers = self.X_dr[:, 5] > self.MAX_DIFF_PRESSURE
    keep = ~(highWeightOutliers | diffSpikeOutliers)
    self.X = self.X[keep]
    self.X_dr = self.X_dr[keep]

  def getOriginalData(self):
    """Return ``(X_dr, headers_dr)``: the unscaled reduced feature matrix and its headers.

    NOTE(review): despite the name, this returns the reduced training
    features, not the full matrix ``X``; use getOriginalDF() for the latter.
    Confirm that this is the intended contract.
    """
    return (self.X_dr, self.headers_dr)

  def getOriginalDF(self):
    """Return the full matrix ``X`` as a DataFrame with ``headers`` as column names."""
    return pd.DataFrame(data = self.X, columns = self.headers)

  def getClusteringTrainingData(self):
    """Standardize the reduced features for clustering.

    A new StandardScaler is fitted on every call.

    Returns:
      ``(X_train_dr, headers_dr, scaler)``: the scaled copy of ``X_dr``, its
      column headers, and the fitted scaler (e.g. for inverse transforms).
    """
    X_dr_scaler = StandardScaler()
    X_train_dr = X_dr_scaler.fit_transform(self.X_dr)
    return (X_train_dr, self.headers_dr, X_dr_scaler)

  #Implement a method that provides labeled training data for ROP optimization




      
      




In [55]:
# Colab-only cell: mount Google Drive, then run the preparation pipeline on well 1.
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)

# PrepareEDR does all of its loading and preprocessing inside the constructor.
well_csv_path = "/content/gdrive/My Drive/COMP 642/proj/well_1.csv"
test_data = PrepareEDR(well_csv_path)

# getOriginalData() yields the unscaled reduced feature matrix and its headers.
(orig_data, orig_headers) = test_data.getOriginalData()

Mounted at /content/gdrive/




In [56]:
# Sanity check: the reduced matrix, its headers, then the shapes of both views
# (they must have the same number of rows).
for item in (orig_data, orig_headers, test_data.X_dr.shape, test_data.X.shape):
  print(item)

[[7.60396040e-02 5.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [7.60396040e-02 5.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [7.60396040e-02 5.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [9.99026204e-01 8.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  8.54400000e+02 0.00000000e+00]
 [9.99026204e-01 8.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  5.05610000e+02 0.00000000e+00]
 [9.99026204e-01 8.00000000e-02 0.00000000e+00 ... 0.00000000e+00
  1.29590000e+02 0.00000000e+00]]
['Bit Depth / Hole Depth', 'Rotary RPM', 'Weight on Bit', 'Total Pump Output', 'Block Movement: + / - / 0', 'Differential Pressure', 'Hook Load', 'On Bottom ROP', 'Standpipe Pressure', 'Convertible Torque']
(28583, 10)
(28583, 12)


In [44]:
# Show the full data set as a DataFrame; the bare last expression gives the rich display.
original_df = test_data.getOriginalDF()
original_df

Unnamed: 0,Bit Depth / Hole Depth,Hole Depth,Bit Depth,Rotary RPM,Weight on Bit,Total Pump Output,Block Height,Differential Pressure,Hook Load,On Bottom ROP,Standpipe Pressure,Convertible Torque
0,0.076040,2525.0,192.0,0.05,0.0,0.00,12.0,0.00,46.8,0.0,0.00,0.0
1,0.076040,2525.0,192.0,0.05,0.0,0.00,33.8,0.00,48.4,0.0,0.00,0.0
2,0.076040,2525.0,192.0,0.05,0.0,0.00,39.4,0.00,48.9,0.0,0.00,0.0
3,0.076040,2525.0,192.0,0.05,0.0,0.00,39.1,0.00,55.5,0.0,0.00,0.0
4,0.076040,2525.0,192.0,0.05,0.0,0.00,30.4,0.00,53.1,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
33979,0.999026,21565.1,21544.1,0.08,0.0,328.19,47.0,233.41,47.8,0.0,3432.08,0.0
33980,0.999026,21565.1,21544.1,0.08,0.0,107.91,47.0,0.00,47.8,0.0,2693.37,0.0
33981,0.999026,21565.1,21544.1,0.08,0.0,0.00,47.0,0.00,47.8,0.0,854.40,0.0
33982,0.999026,21565.1,21544.1,0.08,0.0,0.00,47.0,0.00,47.8,0.0,505.61,0.0


(array([   13,    14,    15, ..., 33554, 33555, 33556]),)