In [56]:
import numpy as np
import pandas as pd
from google.colab import drive
import sys
import seaborn as sns
import matplotlib.pyplot as plt

# Mount Google Drive so the Colab runtime can reach the project directory.
drive.mount('/content/gdrive/')

# Project directory on the mounted drive. It is also appended to the module
# search path so project-local modules stored there can be imported.
path = '/content/gdrive/My Drive/COMP 642/proj/'
sys.path.append(path)

import prepare_edr

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
# K-means clustering: load the well data through the project's PrepareEDR helper.
well_csv = "/content/gdrive/My Drive/COMP 642/proj/well_1.csv"
test_data = prepare_edr.PrepareEDR(well_csv)

# The scaler is handed back together with the training matrix and its column
# headers, so the original (unscaled) values can be backed out later if needed.
clustering_inputs = test_data.getClusteringTrainingData()
X_train, X_headers, scaler = clustering_inputs


In [58]:
#Number of clusters comes from prior elbow modeling.
#NOTE(review): this comment previously said 8 clusters, but the code and the label list use 12 -
#confirm which value is intended.
#Tune n_init and max_iter by brute force (too high and we may be over-fitting), keeping
#the combination that yields the lowest inertia on the training data.
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

n_clusters = 12

#variables to capture our optimal hyperparameters (and the model fitted with them)
min_inertia = float("inf")
n_init_opt = 0
max_iter_opt = 0
km_final = None

for i in range(10, 40, 10):
  for j in range(300, 600, 100):
    km = KMeans(n_clusters=n_clusters, init='k-means++', n_init=i, max_iter=j, random_state=0)
    km.fit(X_train)
    #strict '<' means ties keep the earliest (smallest) settings tried
    if km.inertia_ < min_inertia:
      min_inertia = km.inertia_
      n_init_opt = i
      max_iter_opt = j
      #random_state is a fixed integer, so refitting with these same settings would
      #reproduce this model; keep the fitted estimator instead of training it again.
      km_final = km

#report the selected parameters; then view and reason about the centroids.
print("k selection: " + str(n_clusters))
print("Optimal n_init value: " + str(n_init_opt))
print("Optimal max_iter value: " + str(max_iter_opt))

#Map the centroids back to the original feature scale so they can be read in real units.
centroids = km_final.cluster_centers_
centroid_params = scaler.inverse_transform(centroids)
centroid_params_df = pd.DataFrame(data = centroid_params, columns=X_headers)

#Reason about what each category could suggest about rig operations
label_categories = ["One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight",
                    "Nine", "Ten", "Eleven", "Twelve"]
#cluster indices are later used to index into this list, so it needs one name per cluster
assert len(label_categories) == n_clusters, "label_categories must have one entry per cluster"

#Must be the last expression in the cell, otherwise the notebook does not render the table.
centroid_params_df

k selection: 12
Optimal n_init value: 20
Optimal max_iter value: 300


In [0]:
#Now, get the labels and apply them to the original data.
#Then, visualize and reason about the accuracy of the labels.

#Reason about what each category could suggest about rig operations.
#One name per cluster index; position in the list corresponds to the integer cluster label.
label_categories = ["One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight",
                    "Nine", "Ten", "Eleven", "Twelve"]

#Work on a copy: getOriginalDF() may hand back the frame that test_data holds internally
#(not visible from here), and adding a column to it would silently change that shared state.
origDataFrameWithClasses = test_data.getOriginalDF().copy()

#Set 'Class' column as the category that clustering picked; assigned with its interpreted meaning.
#NumPy fancy indexing turns the array of integer labels into an array of category names.
#NOTE(review): assumes the rows of the original frame are in the same order (and number) as the
#rows of the training matrix the model was fitted on - confirm against prepare_edr.
origDataFrameWithClasses['Class'] = np.array(label_categories)[km_final.labels_]

In [60]:

# Reverse the row order (last row first) with a positional slice, then display the frame.
# NOTE(review): the result is bound back to the same name, so running this cell a second
# time flips the rows back to their previous order.
origDataFrameWithClasses = origDataFrameWithClasses.iloc[::-1]
origDataFrameWithClasses

Unnamed: 0,Bit Depth / Hole Depth,Hole Depth,Bit Depth,Rotary RPM,Weight on Bit,Total Pump Output,Block Height,Differential Pressure,Hook Load,On Bottom ROP,Standpipe Pressure,Convertible Torque,Class
28582,0.999026,21565.1,21544.1,0.08,0.0,0.00,47.0,0.00,47.8,0.0,129.59,0.0,Two
28581,0.999026,21565.1,21544.1,0.08,0.0,0.00,47.0,0.00,47.8,0.0,505.61,0.0,Two
28580,0.999026,21565.1,21544.1,0.08,0.0,0.00,47.0,0.00,47.8,0.0,854.40,0.0,Two
28579,0.999026,21565.1,21544.1,0.08,0.0,107.91,47.0,0.00,47.8,0.0,2693.37,0.0,Two
28578,0.999026,21565.1,21544.1,0.08,0.0,328.19,47.0,233.41,47.8,0.0,3432.08,0.0,Twelve
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0.076040,2525.0,192.0,0.05,0.0,0.00,30.4,0.00,53.1,0.0,0.00,0.0,Five
3,0.076040,2525.0,192.0,0.05,0.0,0.00,39.1,0.00,55.5,0.0,0.00,0.0,Five
2,0.076040,2525.0,192.0,0.05,0.0,0.00,39.4,0.00,48.9,0.0,0.00,0.0,Five
1,0.076040,2525.0,192.0,0.05,0.0,0.00,33.8,0.00,48.4,0.0,0.00,0.0,Five


In [0]:

#Data visualization
#First, output the same low and high res .png files as in preprocessing experimentation,
#this time with hue by classification. Also, plot against the relevant original data, so we can
#continue to reason about the quality of the categories

def _save_bitdepth_pairplot(frame, out_file, height, marker_size):
  """Draw each drilling parameter against 'Bit Depth', coloured by 'Class', and save it.

  Parameters:
    frame: DataFrame holding the plotted columns plus a 'Class' column.
    out_file: path of the image file to write.
    height: height of each facet, passed through to seaborn.
    marker_size: scatter marker size, passed through as the "s" plot keyword.
  """
  #pairplot is a figure-level function that builds its own figure, so no plt.figure()
  #call is made beforehand (that would only leave an empty extra figure behind).
  grid = sns.pairplot(frame,
                      x_vars=['Rotary RPM', 'Weight on Bit', 'Total Pump Output',
                              'Differential Pressure', 'Hook Load', 'Standpipe Pressure',
                              'Convertible Torque', 'On Bottom ROP'],
                      y_vars=['Bit Depth'], height=height, aspect=1,
                      plot_kws={"s": marker_size}, hue="Class", palette='Dark2')
  #Depth should increase downwards. Only one axes is inverted: the facets in the row are
  #expected to share their y axis, and invert_yaxis() toggles, so inverting every facet
  #could cancel out.
  depth_ax = grid.axes[0, 0]
  depth_ax.invert_yaxis()
  #Save through the grid's own figure rather than relying on pyplot's "current figure".
  depth_ax.figure.savefig(out_file)

#low-res view, for inclusion in report.
_save_bitdepth_pairplot(origDataFrameWithClasses,
                        path + "visualization_kmeans_classes_bitdepth_lowres.png",
                        height=3, marker_size=50)

#detailed plot, for zoomed in view on individual facets.
_save_bitdepth_pairplot(origDataFrameWithClasses,
                        path + "visualization_kmeans_classes_bitdepth_highres.png",
                        height=10, marker_size=100)

#Note that this doesn't seem right - we need to view parameters as a function of time, not just
#bit depth.