# Prediction of ROP (Rate of Penetration)

# **1. Import**

In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
# import sklearn libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# Used for checking Distribution curve
from scipy.stats import skew

# **2. Read Data**

In [2]:
# Load the training set from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='tech_challenge2021_train.csv')

# **3. Glimpse Data**

In [3]:
# Preview 10 randomly chosen rows. The fixed random_state makes the preview
# reproducible, so a "Restart Kernel -> Run All" shows the same rows every time.
data.sample(n=10, random_state=42)

Unnamed: 0,Well Identifier,Measured Depth m,Weight on Bit kkgf,Average Standpipe Pressure kPa,Average Surface Torque kN.m,Rate of Penetration m/h,Average Rotary Speed rpm,Mud Flow In L/min,Mud Density In g/cm3,Diameter mm,Average Hookload kkgf,Hole Depth (TVD) m,USROP Gamma gAPI
117673,USROP_A 3 N-SH-F-15d,3906.372,8.251752,15962.68003,25.597843,24.880824,139.736,2077.184268,11.182843,215.9,140.016254,3071.076713,77.621
129422,USROP_A 4 N-SH_F-15Sd,1888.154,4.74168,21826.59527,10.81,25.106376,119.49,3987.855834,1.45,311.15,132.950605,1797.521424,103.52
97877,USROP_A 3 N-SH-F-15d,2976.125,4.111815,14808.95961,20.653175,18.223992,139.736,2121.378954,11.182843,215.9,126.171707,2703.744859,7.347
180154,USROP_A 5 N-SH-F-5d,3277.237,3.48743,24322.99957,21.77,7.75,303.630005,2093.560059,1.45,215.9,138.477465,2900.533188,28.55
42662,USROP_A 2 N-SH_F-14d,2223.409,5.261735,15824.00055,10.6,13.14,119.639999,3919.709961,1.32,444.5,146.155927,2222.517475,56.132
134503,USROP_A 4 N-SH_F-15Sd,2225.616,3.059149,23269.04744,12.6,20.144232,140.0,3987.855834,1.46,311.15,138.314306,2090.565288,68.89
198752,USROP_A 6 N-SH_F-9d,625.849,5.741003,10200.0,5.68,45.189999,143.0,3434.989991,1.03,444.5,101.339394,625.653511,84.578
20831,USROP_A 2 N-SH_F-14d,1060.939,5.42489,7438.999938,8.22,7.32,70.050003,3083.729979,1.02,444.5,103.888685,1060.449492,22.48
22968,USROP_A 2 N-SH_F-14d,1125.931,5.333116,14369.9997,5.880001,15.600001,120.139999,4233.020018,1.19,444.5,105.142939,1125.441386,109.241
98902,USROP_A 3 N-SH-F-15d,3007.614,5.048029,14670.51978,23.469209,19.203924,139.736,2121.378954,11.182843,215.9,125.989817,2723.45597,14.002


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the raw data.
raw_summary = data.describe()
raw_summary

Unnamed: 0,Measured Depth m,Weight on Bit kkgf,Average Standpipe Pressure kPa,Average Surface Torque kN.m,Rate of Penetration m/h,Average Rotary Speed rpm,Mud Flow In L/min,Mud Density In g/cm3,Diameter mm,Average Hookload kkgf,Hole Depth (TVD) m,USROP Gamma gAPI
count,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0
mean,2411.78137,6.087997,17451.302922,14.430343,24.978426,152.368019,2873.576072,4.008286,297.470693,127.503708,2153.624785,67.197939
std,1066.443448,3.957126,4236.531326,7.239629,15.086731,49.998842,1055.406219,4.496288,100.344171,15.55189,846.519288,50.519888
min,225.171,0.001814,1432.661618,0.008135,0.33,0.0,185.420836,1.02,215.9,84.047945,225.16277,0.0
25%,1548.68675,3.283486,14655.99976,9.639866,12.99,129.25,1993.927917,1.3,215.9,124.925435,1516.59336,25.38
50%,2697.1825,5.190356,16683.99963,12.798921,21.09,139.736,2121.382739,1.35,215.9,130.905851,2528.949222,54.49
75%,3288.9465,8.127138,21153.52906,17.815448,32.470001,179.283,3987.855834,10.849026,444.5,138.719433,2865.091577,98.14
max,4090.001,31.411272,24998.45941,36.489128,99.206304,311.230011,4538.450195,12.017384,444.5,152.926842,3248.389893,260.899


# **4. Exploratory Data Analysis**

In [5]:
# Treat -999 (stored either as a string or as a number) as a missing value,
# then discard every row that has at least one missing entry.
data = data.replace(to_replace=['-999', -999], value=np.nan).dropna(axis=0)
data.describe()

Unnamed: 0,Measured Depth m,Weight on Bit kkgf,Average Standpipe Pressure kPa,Average Surface Torque kN.m,Rate of Penetration m/h,Average Rotary Speed rpm,Mud Flow In L/min,Mud Density In g/cm3,Diameter mm,Average Hookload kkgf,Hole Depth (TVD) m,USROP Gamma gAPI
count,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0,198928.0
mean,2411.78137,6.087997,17451.302922,14.430343,24.978426,152.368019,2873.576072,4.008286,297.470693,127.503708,2153.624785,67.197939
std,1066.443448,3.957126,4236.531326,7.239629,15.086731,49.998842,1055.406219,4.496288,100.344171,15.55189,846.519288,50.519888
min,225.171,0.001814,1432.661618,0.008135,0.33,0.0,185.420836,1.02,215.9,84.047945,225.16277,0.0
25%,1548.68675,3.283486,14655.99976,9.639866,12.99,129.25,1993.927917,1.3,215.9,124.925435,1516.59336,25.38
50%,2697.1825,5.190356,16683.99963,12.798921,21.09,139.736,2121.382739,1.35,215.9,130.905851,2528.949222,54.49
75%,3288.9465,8.127138,21153.52906,17.815448,32.470001,179.283,3987.855834,10.849026,444.5,138.719433,2865.091577,98.14
max,4090.001,31.411272,24998.45941,36.489128,99.206304,311.230011,4538.450195,12.017384,444.5,152.926842,3248.389893,260.899


<b>The table above shows a considerable difference between the mean and the median (50th percentile) for several columns (e.g., Mud Density In: mean ≈ 4.01 vs. median 1.35). This indicates that the data is not symmetrically distributed.</b>

In [6]:
# Per-column flag: True if the column still contains any missing (NaN) value.
data.isnull().any(axis=0)

Well Identifier                   False
Measured Depth m                  False
Weight on Bit kkgf                False
Average Standpipe Pressure kPa    False
Average Surface Torque kN.m       False
Rate of Penetration m/h           False
Average Rotary Speed rpm          False
Mud Flow In L/min                 False
Mud Density In g/cm3              False
Diameter mm                       False
Average Hookload kkgf             False
Hole Depth (TVD) m                False
USROP Gamma gAPI                  False
dtype: bool

In [7]:
# Collect the distinct well identifiers that appear in the data.
wells = data.loc[:, 'Well Identifier'].unique()
wells

array(['USROP_A 0 N-NA_F-9_Ad', 'USROP_A 1 N-S_F-7d',
       'USROP_A 2 N-SH_F-14d', 'USROP_A 3 N-SH-F-15d',
       'USROP_A 4 N-SH_F-15Sd', 'USROP_A 5 N-SH-F-5d',
       'USROP_A 6 N-SH_F-9d'], dtype=object)

In [8]:
# Number of distinct wells found above.
wells.size

7

In [9]:
# Column names of the DataFrame as a plain Python list.
list(data.columns)

['Well Identifier',
 'Measured Depth m',
 'Weight on Bit kkgf',
 'Average Standpipe Pressure kPa',
 'Average Surface Torque kN.m',
 'Rate of Penetration m/h',
 'Average Rotary Speed rpm',
 'Mud Flow In L/min',
 'Mud Density In g/cm3',
 'Diameter mm',
 'Average Hookload kkgf',
 'Hole Depth (TVD) m',
 'USROP Gamma gAPI']

In [10]:
# Column-wise skewness, ignoring NaNs. `data` still contains the string column
# 'Well Identifier'; numeric_only=True restricts the computation to numeric
# columns so the call does not raise on pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
data.skew(axis=0, skipna=True, numeric_only=True)

Measured Depth m                 -0.345659
Weight on Bit kkgf                0.906045
Average Standpipe Pressure kPa   -0.129656
Average Surface Torque kN.m       0.529987
Rate of Penetration m/h           0.998834
Average Rotary Speed rpm          0.457813
Mud Flow In L/min                 0.322860
Mud Density In g/cm3              1.062461
Diameter mm                       0.626293
Average Hookload kkgf            -1.118999
Hole Depth (TVD) m               -0.690687
USROP Gamma gAPI                  0.903386
dtype: float64

<b>The non-zero skewness values show that the columns are not symmetrically distributed (positive values indicate a longer right tail, negative values a longer left tail).</b>

## Data Scaling
<b>This brings us to the possibility of using different scalers to scale our data.</b>
<ol>
    <li>StandardScaler</li>
    <li>MinMaxScaler</li>
    <li>MaxAbsScaler</li>
    <li>RobustScaler</li>
    <li>QuantileTransformer</li>
    <li>PowerTransformer (Yeo-Johnson and Box-Cox)</li>
</ol>

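<b>We rule out RobustScaler and QuantileTransformer because they are designed to reduce the influence of outliers (RobustScaler scales by the median and interquartile range; QuantileTransformer non-linearly maps values onto a target distribution), which may distort our data and the relationships between our columns, since ROP is highly variable.</b>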

<b>We also rule out the Box-Cox PowerTransformer, since our data contains zero values and Box-Cox requires strictly positive input.</b>

This leaves us with possible transforms -
<ol>
    <li>StandardScaler</li>
    <li>MinMaxScaler</li>
    <li>MaxAbsScaler</li>
    <li>PowerTransformer (Yeo-Johnson)</li>
</ol>