In [131]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge, SGDRegressor
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt

In [132]:
# read in data, trim off the top blank rows
df = pd.read_csv("conoco_data.csv")
df.columns = df.iloc[15]
df = df[16:]
df.fillna(0, inplace=True)
df.reset_index(inplace=True)
df.head()
copy_df = df.copy(deep=True)
y = df["DAYSTILFAIL"].to_numpy()
drop_cols = ['DAYSTILFAIL', 'WELLNAME', 'index', 'CURRENTWELLSTATUS1', 'DTTMSPUD', 'aRod_DTTMRUN', 'aRod_DTTMPULL', 'POPDATE', 'FAILUREDATE', 
                'JobATub_DTTMRUN', 'JobATub_DTTMPULL', 'HAS_FAILED']

In [133]:
df.drop(columns=drop_cols, inplace=True)
X_cols = df.columns
X = df.to_numpy()
scaler = MinMaxScaler()

y = y.reshape(-1, 1)
X_minmax = scaler.fit_transform(X)
y_minmax = scaler.fit_transform(y)

#X_train, X_test, y_train, y_test = train_test_split(X_minmax, y_minmax, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [134]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

# use the random forest regressor to determine which features are the most important
importances = rfr.feature_importances_
feature_dict = {feature:value for (feature, value) in zip(X_cols, importances)}
feature_dict

  


{'TDCALC': 0.06968551276109525,
 'ROD_DEPTHBTM': 0.0568686895787288,
 'TUB_DEPTHBTM': 0.06582101006515423,
 'TUBINGPULL': 0.037325847184720255,
 'median_Consecutive_Pumpoff_Strokes_Allowed': 0.00669745609648366,
 'mean_Consecutive_Pumpoff_Strokes_Allowed': 0.05540702930308411,
 'median_Current_PIP': 0.021920950008658614,
 'mean_Current_PIP': 0.006238232096098511,
 'median_Cycles': 0.19809192603958994,
 'mean_Cycles': 0.03061721656335866,
 'median_Fluid_Load': 0.019062531396477296,
 'mean_Fluid_Load': 0.022187547605837975,
 'median_Last_Stroke_Min_Load': 0.016367527304675725,
 'mean_Last_Stroke_Min_Load': 0.017443956615546048,
 'median_Last_Stroke_Peak_Load': 0.013377287962939868,
 'mean_Last_Stroke_Peak_Load': 0.011596933013343908,
 'median_STROKE_LENGTH': 0.016502061201714272,
 'mean_STROKE_LENGTH': 0.028250662530605007,
 'median_Pump_Size': 0.004521706484906694,
 'mean_Pump_Size': 0.009329099821332804,
 'median_Reference_Pump_Fillage': 0.006577499744219299,
 'mean_Reference_Pump_Fill

In [135]:
# here is our r^2 value
rfr.score(X_test, y_test)

0.36037280500980207

In [136]:
# get a list of all importances, sorted
sorted_importances = sorted(feature_dict.items(), key=lambda x: x[1], reverse=True)
sorted_importances

[('median_Cycles', 0.19809192603958994),
 ('mean_Percent_Run', 0.07313479119165099),
 ('TDCALC', 0.06968551276109525),
 ('TUB_DEPTHBTM', 0.06582101006515423),
 ('ROD_DEPTHBTM', 0.0568686895787288),
 ('mean_Consecutive_Pumpoff_Strokes_Allowed', 0.05540702930308411),
 ('mean_Yesterday_Min_Load', 0.04697545389836089),
 ('TUBINGPULL', 0.037325847184720255),
 ('mean_Total_Strokes', 0.03497829784967906),
 ('mean_Cycles', 0.03061721656335866),
 ('mean_STROKE_LENGTH', 0.028250662530605007),
 ('median_Total_Strokes', 0.02618399318635034),
 ('mean_Reference_Pump_Fillage', 0.025808801418708328),
 ('median_Percent_Run', 0.024926655814903005),
 ('mean_Fluid_Load', 0.022187547605837975),
 ('median_Current_PIP', 0.021920950008658614),
 ('median_Yesterday_Min_Load', 0.020737425532219512),
 ('median_Fluid_Load', 0.019062531396477296),
 ('mean_Last_Stroke_Min_Load', 0.017443956615546048),
 ('median_Yesterday_Peak_Load', 0.01708400775798914),
 ('median_STROKE_LENGTH', 0.016502061201714272),
 ('median_Las

In [137]:
cols_to_keep = [x[0] for x in sorted_importances[:5]]
more_cols_to_drop = [x[0] for x in sorted_importances[5:]]

In [138]:
# create new array of cols to drop
drop_cols = drop_cols + more_cols_to_drop

In [139]:
fresh_df = copy_df.copy(deep=True)
fresh_df.drop(columns=drop_cols, inplace=True)
X_cols = fresh_df.columns
X = fresh_df.to_numpy()
scaler = MinMaxScaler()

y = y.reshape(-1, 1)
X_minmax = scaler.fit_transform(X)
y_minmax = scaler.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X_minmax, y_minmax, test_size=0.33, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [140]:
# perform the same function we did before, except just with these 5 features
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

# use the random forest regressor to determine which features are the most important
importances = rfr.feature_importances_
feature_dict = {feature:value for (feature, value) in zip(X_cols, importances)}
feature_dict

  This is separate from the ipykernel package so we can avoid doing imports until


{'TDCALC': 0.18437137413883664,
 'ROD_DEPTHBTM': 0.15786664140900955,
 'TUB_DEPTHBTM': 0.1606719744994709,
 'median_Cycles': 0.2582283264417272,
 'mean_Percent_Run': 0.2388616835109556}

In [141]:
# get a list of all importances, sorted
sorted_importances = sorted(feature_dict.items(), key=lambda x: x[1], reverse=True)
sorted_importances

[('median_Cycles', 0.2582283264417272),
 ('mean_Percent_Run', 0.2388616835109556),
 ('TDCALC', 0.18437137413883664),
 ('TUB_DEPTHBTM', 0.1606719744994709),
 ('ROD_DEPTHBTM', 0.15786664140900955)]

In [149]:
sgd = SGDRegressor(max_iter=100000, tol=1)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.15917327912088106
Mean Squared Error: 0.048729885664647594
Root Mean Squared Error: 0.22074846695877096


  y = column_or_1d(y, warn=True)
