In [1]:
# Modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Feature Selection
import xgboost as xgb
from sklearn.decomposition import PCA

# Evaluate Algorithms

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
from acquire_prepare import acquire_oil
from acquire_prepare import prep_data

import model
import numpy as np
import pandas as pd

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import sys 

**Let's bring in the DataFrame**

In [3]:
df = acquire_oil()
df = prep_data(df)

**(Rows, Columns)**

In [4]:
df.shape

(7888, 62)

**We need to make sure the numberic types are correct**

In [5]:
numerics = ['int64', 'float64']

num_df = df.select_dtypes(include=numerics)
num_df.columns

Index(['api14', 'proppant_ppf', 'frac_fluid_gpf', 'gross_perfs', 'frac_stages',
       'oil_gravity', 'peak_boepd', 'oil_hist', 'gas_hist', 'gor_hist',
       'ip90_boeqpd', 'tvd', 'sur_lat', 'sur_long', 'well_id', 'mid_point_lat',
       'mid_point_long', 'recovery', 'recovery_per_foot', 'months_active',
       'recovery_per_month', 'vintage', 'vintage_bin', 'encoded_direction',
       'encoded_frac_fluid_type', 'encoded_county', 'encoded_oper',
       'encoded_formation', 'encoded_sub-basin', 'encoded_lateral_class',
       'scaled_direction', 'scaled_frac_fluid_type', 'scaled_county',
       'scaled_oper', 'scaled_formation', 'scaled_sub-basin',
       'scaled_lateral_class', 'scaled_proppant_ppf', 'scaled_frac_fluid_gpf',
       'scaled_gross_perfs', 'scaled_frac_stages', 'scaled_oil_gravity',
       'scaled_gor_hist', 'scaled_tvd', 'scaled_mid_point_lat',
       'scaled_mid_point_long', 'scaled_vintage'],
      dtype='object')

In [6]:
num_df = num_df.drop(columns=['gas_hist', 'recovery', 'recovery_per_foot', 
                              'recovery_per_month', 'api14',
                              'sur_long', 'sur_lat', 'mid_point_lat', 'mid_point_long',
                              'well_id', 'oil_hist'])

### Use LabelEncoder to encode county and frak_fluid_type

In [7]:
encoder = LabelEncoder()

encoder.fit(df.county)
num_df['county'] = encoder.transform(df.county)

num_df.head()

Unnamed: 0,proppant_ppf,frac_fluid_gpf,gross_perfs,frac_stages,oil_gravity,peak_boepd,gor_hist,ip90_boeqpd,tvd,months_active,...,scaled_frac_fluid_gpf,scaled_gross_perfs,scaled_frac_stages,scaled_oil_gravity,scaled_gor_hist,scaled_tvd,scaled_mid_point_lat,scaled_mid_point_long,scaled_vintage,county
30,1311.78809,1642.58887,3977.0,0.0,0.0,61.408554,1738.53259,47.4529,10021.0,33,...,0.547863,0.292627,0.0,0.0,0.087056,0.745943,0.614855,0.580622,0.966667,0
31,1088.56238,1493.369,7283.0,0.0,0.0,416.961182,1381.571,351.7967,9764.0,66,...,0.498082,0.536127,0.0,0.0,0.069179,0.726813,0.51741,0.581774,0.916667,0
32,951.9706,1369.34363,7382.0,0.0,0.0,838.371765,1961.413,688.5254,10798.0,64,...,0.456706,0.543419,0.0,0.0,0.098218,0.803781,0.587606,0.571086,0.916667,0
33,1155.36267,1299.68433,7382.0,31.0,0.0,438.325684,1325.5387,406.427765,9506.0,63,...,0.433468,0.543419,0.442857,0.0,0.066373,0.707608,0.592304,0.57025,0.916667,0
34,1284.25537,1596.72742,7050.0,0.0,0.0,443.969543,2816.79077,289.934052,10138.0,59,...,0.532564,0.518966,0.0,0.0,0.141055,0.754652,0.576165,0.563362,0.933333,0


In [8]:
encoder = LabelEncoder()
encoder.fit(df.frac_fluid_type)
num_df['frac_fluid_type'] = encoder.transform(df.frac_fluid_type)
num_df.head()

Unnamed: 0,proppant_ppf,frac_fluid_gpf,gross_perfs,frac_stages,oil_gravity,peak_boepd,gor_hist,ip90_boeqpd,tvd,months_active,...,scaled_gross_perfs,scaled_frac_stages,scaled_oil_gravity,scaled_gor_hist,scaled_tvd,scaled_mid_point_lat,scaled_mid_point_long,scaled_vintage,county,frac_fluid_type
30,1311.78809,1642.58887,3977.0,0.0,0.0,61.408554,1738.53259,47.4529,10021.0,33,...,0.292627,0.0,0.0,0.087056,0.745943,0.614855,0.580622,0.966667,0,7
31,1088.56238,1493.369,7283.0,0.0,0.0,416.961182,1381.571,351.7967,9764.0,66,...,0.536127,0.0,0.0,0.069179,0.726813,0.51741,0.581774,0.916667,0,3
32,951.9706,1369.34363,7382.0,0.0,0.0,838.371765,1961.413,688.5254,10798.0,64,...,0.543419,0.0,0.0,0.098218,0.803781,0.587606,0.571086,0.916667,0,0
33,1155.36267,1299.68433,7382.0,31.0,0.0,438.325684,1325.5387,406.427765,9506.0,63,...,0.543419,0.442857,0.0,0.066373,0.707608,0.592304,0.57025,0.916667,0,3
34,1284.25537,1596.72742,7050.0,0.0,0.0,443.969543,2816.79077,289.934052,10138.0,59,...,0.518966,0.0,0.0,0.141055,0.754652,0.576165,0.563362,0.933333,0,7


### Standardize Numeric Data

In [9]:
scaled_df = num_df.copy()

scaled_df['proppant_ppf'] = (scaled_df.proppant_ppf - scaled_df.proppant_ppf.min()) / (scaled_df.proppant_ppf.max() - scaled_df.proppant_ppf.min())
scaled_df['frac_fluid_gpf'] = (scaled_df.frac_fluid_gpf - scaled_df.frac_fluid_gpf.min()) / (scaled_df.frac_fluid_gpf.max() - scaled_df.frac_fluid_gpf.min())
scaled_df['tvd'] = (scaled_df.tvd - scaled_df.tvd.min()) / (scaled_df.tvd.max() - scaled_df.tvd.min())
scaled_df['frac_stages'] = (scaled_df.frac_stages - scaled_df.frac_stages.min()) / (scaled_df.frac_stages.max() - scaled_df.frac_stages.min())
scaled_df['oil_gravity'] = (scaled_df.oil_gravity - scaled_df.oil_gravity.min()) / (scaled_df.oil_gravity.max() - scaled_df.oil_gravity.min())
scaled_df['peak_boepd'] = (scaled_df.peak_boepd - scaled_df.peak_boepd.min()) / (scaled_df.peak_boepd.max() - scaled_df.peak_boepd.min())
scaled_df['gor_hist'] = (scaled_df.gor_hist - scaled_df.gor_hist.min()) / (scaled_df.gor_hist.max() - scaled_df.gor_hist.min())
scaled_df['ip90_boeqpd'] = (scaled_df.ip90_boeqpd - scaled_df.ip90_boeqpd.min()) / (scaled_df.ip90_boeqpd.max() - scaled_df.ip90_boeqpd.min())
scaled_df['months_active'] = (scaled_df.months_active - scaled_df.months_active.min()) / (scaled_df.months_active.max() - scaled_df.months_active.min())
scaled_df['vintage'] = (scaled_df.vintage - scaled_df.vintage.min()) / (scaled_df.vintage.max() - scaled_df.vintage.min())

In [10]:
xgb_df = scaled_df.copy()

### XGBoost Feature Recommender

In [None]:
xgb_params = {
    'max_depth': 8,
    'seed' : 493
}

dtrain = xgb.DMatrix(xgb_df, df.oil_hist, feature_names=xgb_df.columns.values)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)

# plot the important features #
fig, ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()

# Test Harness and Options

In [10]:
xfeatures = ['proppant_ppf', 'frac_fluid_gpf', 'gor_hist', 'tvd', 'sur_lat', 'mid_point_lat', 'mid_point_long', 'sur_long', 'encoded_oper', 'encoded_direction', 'vintage', 'gross_perfs']
yfeature = ['recovery']

### Linear Regression Model: (DataFrame)
**Returns:**
> MSE, R2 Score, and Coefficients

In [11]:
model.lregression_test(df, xfeatures, yfeature, 0.70)

(14058.18339234282,
 0.27756915826909667,
 array([[ 8.06029842e-03,  7.78511902e-03,  5.10354334e-03,
          1.15534555e-02, -4.38431594e+02,  4.69631437e+02,
          2.33028731e+03, -2.36404799e+03, -1.10018582e-01,
          6.69020602e+01, -2.24497837e+00,  2.08220019e-02]]))

### Ridge Regression Model: (DataFrame)
**Returns:**
> MSE, R2 Score, and Coefficients

In [12]:
model.rregression_test(df, xfeatures, yfeature, 0.70)

(14084.696816831223,
 0.27620666960069884,
 array([[ 7.84709009e-03,  8.34689331e-03,  5.17017398e-03,
          1.15783019e-02, -2.52499867e+01,  5.66460705e+01,
          3.99474026e+01, -7.30493173e+01, -1.07953764e-01,
          6.68170450e+01, -2.23125933e+00,  2.06552122e-02]]))

### Poloynomial Regression Model: (DataFrame)
**Returns:**
> MSE, R2 Score, and Coefficients

In [13]:
model.pregression_test(df, xfeatures, yfeature, 0.70)

(6616.7993618820465,
 0.6599717190222025,
 array([[ 1.93034254e+13, -4.75299481e+12,  1.86172615e+13, ...,
         -7.88257187e+09, -1.00372925e+02,  1.43703613e+01]]))

## Explore and select algorithms

**Let's Cluster by our Top Seven features per our XGBoost function**

In [14]:
col_list = ['proppant_ppf', 'gross_perfs', 'tvd', 'frac_fluid_gpf', 'gor_hist', 'encoded_oper', 'vintage']
col_list = pd.DataFrame(preprocessing.scale(col_list))

ValueError: could not convert string to float: 'proppant_ppf'

**We now have a new dataframe with an added column "cluster_id"**

In [16]:
df = model.create_kclusters(df, col_list, 3, 'cluster_id')

In [17]:
df.head()

Unnamed: 0,api14,lease_name,well_number,county,direction,status,oper,multi_well_lease,major_phase,formation,...,scaled_frac_fluid_gpf,scaled_gross_perfs,scaled_frac_stages,scaled_oil_gravity,scaled_gor_hist,scaled_tvd,scaled_mid_point_lat,scaled_mid_point_long,scaled_vintage,cluster_id
30,42003452730000,UL III 4 EAST,1H,ANDREWS,Horizontal,Active,diamondback exploration & prod llc,False,OIL,WOLFCAMP,...,0.547863,0.292627,0.0,0.0,0.087056,0.745943,0.614855,0.580622,0.966667,0
31,42003452810000,CROSS BAR RANCH,2017WB,ANDREWS,Horizontal,Active,cog,True,OIL,WOLFCAMP,...,0.498082,0.536127,0.0,0.0,0.069179,0.726813,0.51741,0.581774,0.916667,0
32,42003453090000,UNIVERSITY 7-43,10H,ANDREWS,Horizontal,Active,pioneer,True,OIL,WOLFCAMP,...,0.456706,0.543419,0.0,0.0,0.098218,0.803781,0.587606,0.571086,0.916667,0
33,42003454960000,UNIVERSITY 7-43,16H,ANDREWS,Horizontal,Active,pioneer,True,OIL,WOLFCAMP,...,0.433468,0.543419,0.442857,0.0,0.066373,0.707608,0.592304,0.57025,0.916667,0
34,42003455330000,MABEE 240B,2HW,ANDREWS,Horizontal,Active,cog,True,OIL,WOLFCAMP,...,0.532564,0.518966,0.0,0.0,0.141055,0.754652,0.576165,0.563362,0.933333,0


In [26]:
df.groupby('cluster_id')['proppant_ppf', 'gross_perfs', 'tvd', 'frac_fluid_gpf', 'gor_hist', 'encoded_oper', 'vintage'].describe().T

Unnamed: 0,cluster_id,0,1,2
proppant_ppf,count,5277.0,1422.0,1189.0
proppant_ppf,mean,1315.730495,406.497282,1165.516714
proppant_ppf,std,591.251706,420.538936,562.642804
proppant_ppf,min,2.748414,5.586592,1.098097
proppant_ppf,25%,1017.46149,181.868832,896.5774
proppant_ppf,50%,1328.39575,303.11045,1136.50879
proppant_ppf,75%,1715.02783,461.770927,1477.50354
proppant_ppf,max,3944.87476,3969.38452,3378.37842
gross_perfs,count,5277.0,1422.0,1189.0
gross_perfs,mean,6763.000948,1383.475387,6362.600505


## Interpret and report results

# Improve Results

## Algorithm Tuning

## Ensemble Methods

### Bagging

### Boosting

### Blending

## Extreme Feature Engineering