In [1]:
import pandas as pd
from os import listdir, path
from os.path import isfile, join
import logging
import matplotlib.pyplot as plt
import datetime

In [2]:
file_name = "logs"

logging.basicConfig(
    filename=file_name,
    filemode='a',
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

In [3]:
# we only want to do this if there isn't a pickle present
if not path.exists("./df.pkl"):
    # get list of all csv file
    DIR_NAME = "rod_pump"
    file_names = [f for f in listdir(DIR_NAME) if isfile(join(DIR_NAME, f))]
    
    logging.info("Files: ")
    logging.info(file_names)
    
    # create a list of dataframes for each csv file in ./rod_pump
    # then concat to create master dataframe
    plt.figure(figsize=(20,10))
    df_list = []
    names_length = len(file_names)
    file_number = 1
    for file in file_names:
        df = pd.read_csv(f'{DIR_NAME}/{file}')
        df['Name'] = file.replace('.csv', '')
        
        logging.info(f"[{file_number}/{names_length}] {file}")

        END = len(df["Tubing Pressure (psi)"]) - 10
        diff = 22
        START = END - diff
        time = [x for x in range(0, diff)]
        plt.plot(time, df["Fluid Load (lbs)"][START:END], 
                 label="Fluid Load", c='green', alpha=0.5)

        failing = []

        for i, row in df.iterrows():
          is_zero = not dict(df.loc[i])["Percent Run (%)"]
          fail = 1 if i > START and not is_zero else 0
          failing.append(fail)

        df["failing"] = failing
        df_list.append(df)
        file_number += 1

    df = pd.concat(df_list)
    df.to_pickle("./df.pkl")
else:
    df = pd.read_pickle("./df.pkl")

In [4]:
# create copy, remove misc. features from X
X = df.copy(deep=True)
del X["Percent Run (%)"]
del X["time (hours)"]
del X["Name"]

# remove all rows where the rod pump is 'dead'
X = X[X["Casing Pressure (psi)"] > 0]

# move response series into y
y = X["failing"]
del X["failing"]

X = X.iloc[:]
y = y.iloc[:]

In [5]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit_transform(X)

array([[ 0.281472  ,  0.46089571, -1.37385484, -0.06576956,  0.22475348,
        -7.16698449],
       [-0.32864205, -0.10832508, -0.69451287, -1.45332465,  1.86850652,
        -8.56788936],
       [ 0.15298915, -0.16840052,  0.31632977, -1.16147714,  1.21576266,
        -9.94499497],
       ...,
       [ 4.00284166,  3.79621005,  1.27551372, -1.42221323,  1.61538223,
         8.58574696],
       [ 2.92974364,  3.15069308, -1.37219018,  1.18370462, -1.57134731,
         9.50943103],
       [ 3.03425129,  3.42708158,  1.27551372, -0.29709177,  0.61203564,
        10.32682037]])

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [7]:
X_train

Unnamed: 0,Casing Pressure (psi),Tubing Pressure (psi),Pump Fillage (%),Peak Load (lbs),Minimum Load (lbs),Fluid Load (lbs)
50870,6027.336234,5214.920111,41.754656,5497.986448,8348.958515,9801.320501
58218,4557.261831,3465.382104,65.371547,9899.039893,8421.488207,9757.507039
18061,6566.533217,5628.916972,50.384549,12489.844485,7470.742021,9704.778075
24168,6987.051457,6552.930907,34.807950,6673.047018,9862.763808,9910.182306
56324,7149.533946,6482.838901,20.359997,10431.446034,8174.534681,9537.499335
...,...,...,...,...,...,...
38495,6973.148886,5896.567429,31.169446,13236.654416,5937.803002,9743.317013
33295,4001.238037,2933.013575,42.543952,6604.621270,7787.563306,8834.576480
12992,5669.503828,4169.038463,21.215059,10136.828130,5892.605088,9549.224604
23400,6398.498029,6052.308552,44.677113,15851.383661,5574.000604,9364.689632


In [None]:
from keras import Sequential
from keras.layers import Dense

classifier = Sequential()

classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal',
                     input_dim=6))
classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal'))
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

classifier.fit(X_train, y_train, batch_size=32, epochs=100)

eval_model = classifier.evaluate(X_train, y_train)
eval_model




Epoch 1/100





Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


[0.007667238328130455, 0.9995243148303999]

In [None]:
y_pred = classifier.predict(X_test)
y_pred = y_pred > 0.5

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
logging.info(cm)

[[23416     0]
 [   10     0]]


In [4]:
import time
logging.info("Saving model")
curr_time = int(time.time())
model_name = f"{curr_time}.h5"
classifier.save(f"./models/{model_name}")

NameError: name 'classifier' is not defined