In [2]:
# Modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Feature Selection
import xgboost as xgb
from sklearn.decomposition import PCA

# Evaluate Algorithms

In [3]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
from acquire_prepare import acquire_oil
from acquire_prepare import prep_data

import model
import explore
import numpy as np
import pandas as pd

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import sys 

**Let's bring in the DataFrame**

In [4]:
# Load the frame with acquire_oil() and pass it straight through prep_data()
# (both imported from the project's acquire_prepare module).
df = prep_data(acquire_oil())

**(Rows, Columns)**

In [5]:
# Display the (rows, columns) tuple of the prepared frame.
df.shape

(7888, 62)

**We need to make sure the numeric types are correct**

In [6]:
# dtypes that count as "numeric" for the modeling frame
numerics = ['int64', 'float64']

# num_df keeps only the int64 / float64 columns of df
num_df = df.select_dtypes(include=numerics)

In [7]:
# Columns removed from the numeric modeling frame. Among them are
# 'oil_hist' (used as the XGBoost label further down) and 'recovery'
# (the regression target), plus identifier/location-style columns.
# NOTE: this cell reassigns num_df from itself, so re-running it without
# re-running the previous cell raises a KeyError (columns already gone).
columns_to_drop = [
    'gas_hist',
    'recovery',
    'recovery_per_foot',
    'recovery_per_month',
    'api14',
    'sur_long',
    'sur_lat',
    'mid_point_lat',
    'mid_point_long',
    'well_id',
    'oil_hist',
]
num_df = num_df.drop(columns=columns_to_drop)

### Use LabelEncoder to encode county and frac_fluid_type

In [8]:
# Integer-encode df.county with a fresh LabelEncoder (fit and transform in
# one step) and store the codes as a new 'county' column on num_df.
encoder = LabelEncoder()
num_df['county'] = encoder.fit_transform(df.county)

In [9]:
# Same treatment for frac_fluid_type: a new LabelEncoder is fitted on the
# column and the resulting integer codes are added to num_df.
# `encoder` is rebound here, so the county encoder is no longer reachable.
encoder = LabelEncoder()
num_df['frac_fluid_type'] = encoder.fit_transform(df.frac_fluid_type)

### Min-Max Scale Numeric Data (to the [0, 1] range)

In [10]:
# Min-max scale selected numeric columns into [0, 1] on a copy of num_df,
# leaving num_df itself untouched.
scaled_df = num_df.copy()

# Columns rescaled with (x - min) / (max - min); every other column
# (including the label-encoded ones) is left as-is.
minmax_columns = [
    'proppant_ppf',
    'frac_fluid_gpf',
    'tvd',
    'frac_stages',
    'oil_gravity',
    'peak_boepd',
    'gor_hist',
    'ip90_boeqpd',
    'months_active',
    'vintage',
]

for column in minmax_columns:
    column_min = scaled_df[column].min()
    column_max = scaled_df[column].max()
    # A constant column (max == min) yields NaN here (0 / 0), exactly as
    # the original per-column formula did.
    scaled_df[column] = (scaled_df[column] - column_min) / (column_max - column_min)

In [10]:
# Independent copy of the scaled frame; used below as the XGBoost feature matrix.
xgb_df = scaled_df.copy()

In [12]:
# Call the project's explore.xgb_rank helper on df with 'recovery' as the target list.
# NOTE(review): the recorded output for this cell is
#   KeyError: "['clusterid'] not found in axis"
# so the helper presumably tries to drop a 'clusterid' column that df does not
# have at this point (the clustering cell later names its column 'cluster_id').
# Confirm in explore.py and fix there, or remove this cell.
explore.xgb_rank(df,['recovery'])

KeyError: "['clusterid'] not found in axis"

### XGBoost Feature Recommender

In [None]:
# Rank features by XGBoost importance, using df.oil_hist as the label.
xgb_params = {
    'max_depth': 8,
    'seed' : 493
}

# Feature matrix is the scaled numeric frame; labels come from the original df.
dtrain = xgb.DMatrix(xgb_df, df.oil_hist, feature_names=xgb_df.columns.values)

# Bound to `xgb_model`, NOT `model`: the notebook imports the project module
# `model`, and later cells call model.lregression_test(...),
# model.create_kclusters(...), etc. Rebinding `model` to a Booster here would
# break all of them on a top-to-bottom run.
# NOTE(review): `silent` has been superseded by `verbosity` in newer XGBoost
# releases -- confirm against the installed version.
xgb_model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)

# plot the important features #
fig, ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(xgb_model, max_num_features=50, height=0.8, ax=ax)
plt.show()

# Test Harness and Options

In [11]:
# Predictor columns handed to every regression helper below (order matters:
# the printed coefficient arrays follow this order).
xfeatures = [
    'proppant_ppf',
    'frac_fluid_gpf',
    'gor_hist',
    'tvd',
    'sur_lat',
    'mid_point_lat',
    'mid_point_long',
    'sur_long',
    'encoded_oper',
    'encoded_direction',
    'vintage',
    'gross_perfs',
]

# Target column, kept as a one-element list because the helpers take a list.
yfeature = ['recovery']

### Linear Regression Model: (DataFrame)
**Returns:**
> MSE, R2 Score, and Coefficients

In [12]:
# Run the project's linear-regression helper on the full frame with the
# xfeatures/yfeature lists above; 0.70 is the fourth argument (its exact
# meaning is defined in model.py, not shown here -- presumably a train fraction).
# The recorded output is a 4-tuple: two scalars, a 1x12 coefficient array, and
# a 3-element array (presumably cross-validation scores -- confirm in model.py).
# NOTE(review): the recorded coefficients for the four lat/long features are
# large with opposite signs, which often indicates collinear inputs -- worth checking.
model.lregression_test(df, xfeatures, yfeature, 0.70)

(14058.18339234282,
 0.27756915826909667,
 array([[ 8.06029842e-03,  7.78511902e-03,  5.10354334e-03,
          1.15534555e-02, -4.38431594e+02,  4.69631437e+02,
          2.33028731e+03, -2.36404799e+03, -1.10018582e-01,
          6.69020602e+01, -2.24497837e+00,  2.08220019e-02]]),
 array([0.28841154, 0.24584862, 0.28595999]))

### Ridge Regression Model: (DataFrame)
**Returns:**
> MSE, R2 Score, and Coefficients

In [13]:
# Same inputs as the linear run, passed to the project's ridge-regression helper.
# The recorded output has the same 4-tuple layout as the linear-regression cell.
model.rregression_test(df, xfeatures, yfeature, 0.70)

(14084.696816831223,
 0.27620666960069884,
 array([[ 7.84709009e-03,  8.34689331e-03,  5.17017398e-03,
          1.15783019e-02, -2.52499867e+01,  5.66460705e+01,
          3.99474026e+01, -7.30493173e+01, -1.07953764e-01,
          6.68170450e+01, -2.23125933e+00,  2.06552122e-02]]),
 array([0.28771523, 0.24438187, 0.28420503]))

### Polynomial Regression Model: (DataFrame)
**Returns:**
> MSE, R2 Score, and Coefficients

In [14]:
# Same inputs again, passed to the project's polynomial-regression helper.
# NOTE(review): the recorded R2 (~0.66) is far above the values in the trailing
# 3-element array (~0.30-0.34), and the coefficients reach ~1e13 -- looks like
# the headline score may be optimistic; confirm what each value is measured on.
model.pregression_test(df, xfeatures, yfeature, 0.70)

(6616.7993618820465,
 0.6599717190222025,
 array([[ 1.93034254e+13, -4.75299481e+12,  1.86172615e+13, ...,
         -7.88257187e+09, -1.00372925e+02,  1.43703613e+01]]),
 array([0.31119915, 0.29688435, 0.34296775]))

## Explore and select algorithms

**Let's Cluster by our Top Seven features per our XGBoost function**

In [15]:
# Columns used as clustering inputs (the seven features picked from the
# XGBoost ranking, per the heading above).
col_list = [
    'scaled_proppant_ppf',
    'scaled_gross_perfs',
    'scaled_tvd',
    'scaled_frac_fluid_gpf',
    'scaled_gor_hist',
    'encoded_oper',
    'scaled_vintage',
]

**We now have a new dataframe with an added column "cluster_id"**

In [16]:
# Rebind df to the frame returned by the project's create_kclusters helper.
# Arguments: the frame, the clustering columns, 3 (presumably the number of
# clusters -- the value_counts output below shows ids 0, 1, 2), and the name
# of the column that receives the cluster label.
# NOTE: df is overwritten in place of the pre-clustering frame, so re-running
# this cell feeds the already-clustered frame back into the helper.
df = model.create_kclusters(df, col_list, 3, 'cluster_id')

**Let's see how many observations we have in each cluster**

In [17]:
# Number of rows assigned to each cluster id, largest first.
df.cluster_id.value_counts()

1    3693
2    2331
0    1864
Name: cluster_id, dtype: int64

# ALL Regression Types on Each Cluster

### Cluster 0: 1864 Observations

In [18]:
# Rows of df whose cluster label is 0.
cluster_zero = df[df.cluster_id.eq(0)]

### Linear Regression Model: (70% of Cluster 0)
**Returns:**
> MSE, R2 Score, and Coefficients

In [19]:
# Linear-regression helper restricted to the cluster-0 rows; same feature
# list, target and 0.70 argument as the full-frame run.
model.lregression_test(cluster_zero, xfeatures, yfeature, 0.70)

(8250.864499658828,
 0.26948336039163756,
 array([[ 1.00968527e-02,  8.69437837e-03,  3.14426359e-03,
          9.48379208e-03,  6.09235269e+01, -3.78271792e+01,
          2.53723070e+03, -2.55124770e+03, -2.43624409e-01,
          3.64920864e+01, -1.07448185e+00,  1.47777232e-02]]),
 array([0.26427131, 0.171328  , 0.30678772]))

### Ridge Regression Model: (70% of Cluster 0)
**Returns:**
> MSE, R2 Score, and Coefficients

In [20]:
# Ridge-regression helper restricted to the cluster-0 rows; same feature
# list, target and 0.70 argument as the full-frame run.
model.rregression_test(cluster_zero, xfeatures, yfeature, 0.70)

(8262.984416136987,
 0.2684102848783596,
 array([[ 9.70402106e-03,  8.32725711e-03,  2.92799847e-03,
          1.00126109e-02,  3.82076690e+01, -1.39862463e+01,
          1.09005663e+01, -2.17554094e+01, -2.55904004e-01,
          1.92448678e+01, -1.90269981e+00,  1.44288414e-02]]),
 array([0.26862853, 0.1743481 , 0.29909387]))

### Polynomial Regression Model: (70% of Cluster 0)
**Returns:**
> MSE, R2 Score, and Coefficients

In [21]:
# Polynomial-regression helper restricted to the cluster-0 rows.
# NOTE(review): the recorded output shows MSE ~2e-15 and R2 = 1.0, while the
# trailing 3-element array holds scores of only ~0.20-0.29. A perfect fit next
# to weak secondary scores looks like overfitting (or scoring on the fitted
# data) -- confirm in model.py what each returned value is measured on.
model.pregression_test(cluster_zero, xfeatures, yfeature, 0.70)

(2.09052025853496e-15,
 1.0,
 array([[  569.80786631,   569.80755276,  4740.62275672, ...,
         -1304.39382111,  -761.48231927,   199.83695861]]),
 array([0.29236026, 0.23034158, 0.19822565]))

### Cluster 1: 3693 Observations

In [22]:
# Rows of df whose cluster label is 1.
cluster_one = df[df.cluster_id.eq(1)]

### Linear Regression Model: (70% of Cluster 1)
**Returns:**
> MSE, R2 Score, and Coefficients

In [23]:
# Linear-regression helper restricted to the cluster-1 rows; same feature
# list, target and 0.70 argument as the full-frame run.
model.lregression_test(cluster_one, xfeatures, yfeature, 0.70)

(15192.990869385918,
 0.3010714087401173,
 array([[ 1.62890942e-02,  2.33847206e-03,  5.99180368e-03,
          1.07188473e-02, -8.32944069e+02,  8.64385972e+02,
          3.70789250e+03, -3.75703724e+03, -1.80954844e-02,
          3.92416896e+01, -3.02235351e+00,  1.75751979e-02]]),
 array([0.28614013, 0.27080011, 0.3255136 ]))

### Ridge Regression Model: (70% of Cluster 1)
**Returns:**
> MSE, R2 Score, and Coefficients

In [24]:
# Ridge-regression helper restricted to the cluster-1 rows; same feature
# list, target and 0.70 argument as the full-frame run.
model.rregression_test(cluster_one, xfeatures, yfeature, 0.70)

(15239.094020727387,
 0.2989505089846476,
 array([[ 1.60728300e-02,  3.00647029e-03,  6.43671623e-03,
          1.02479510e-02, -4.21135512e+01,  7.65170566e+01,
          1.91299602e+01, -7.07617995e+01, -2.91043041e-02,
          5.28367109e+01, -2.06954441e+00,  1.76037547e-02]]),
 array([0.29282988, 0.26988784, 0.3196303 ]))

### Polynomial Regression Model: (70% of Cluster 1)
**Returns:**
> MSE, R2 Score, and Coefficients

In [25]:
# Polynomial-regression helper restricted to the cluster-1 rows.
# NOTE(review): the recorded R2 (~0.84) is far above the values in the trailing
# 3-element array (~0.26-0.33) -- confirm what each value is measured on
# before reading this as a real improvement over the linear/ridge runs.
model.pregression_test(cluster_one, xfeatures, yfeature, 0.70)

(3563.091984412698,
 0.8360858054477588,
 array([[ 7.52019836e+06, -2.32316553e+07, -1.34868463e+06, ...,
          4.94828005e+05, -7.86278691e+00,  3.20503131e+00]]),
 array([0.3269934 , 0.29339431, 0.26202899]))

### Cluster 2: 2331 Observations

In [26]:
# Rows of df whose cluster label is 2.
cluster_two = df[df.cluster_id.eq(2)]

### Linear Regression Model: (70% of Cluster 2)
**Returns:**
> MSE, R2 Score, and Coefficients

In [27]:
# Linear-regression helper restricted to the cluster-2 rows; same feature
# list, target and 0.70 argument as the full-frame run.
model.lregression_test(cluster_two, xfeatures, yfeature, 0.70)

(16679.996868672184,
 0.28261853840011075,
 array([[ 3.09729348e-03,  1.09356503e-02,  4.46935316e-03,
          9.62892664e-03,  1.21463363e+02, -1.06527227e+02,
          3.53478492e+02, -3.63040233e+02, -6.16063722e-01,
          9.45594448e+01, -7.27836856e-01,  2.62502348e-02]]),
 array([0.27300269, 0.27447259, 0.2572935 ]))

### Ridge Regression Model: (70% of Cluster 2)
**Returns:**
> MSE, R2 Score, and Coefficients

In [28]:
# Ridge-regression helper restricted to the cluster-2 rows; same feature
# list, target and 0.70 argument as the full-frame run.
model.rregression_test(cluster_two, xfeatures, yfeature, 0.70)

(16622.59533466008,
 0.2850872916434054,
 array([[ 5.46571907e-03,  1.10480498e-02,  3.90338387e-03,
          9.73051965e-03,  3.54706132e+01, -2.03994308e+01,
          5.47239393e+00, -8.35109351e+00, -5.98328064e-01,
          6.95751748e+01, -2.11436249e+00,  2.58815743e-02]]),
 array([0.27666001, 0.27971108, 0.2595882 ]))

### Polynomial Regression Model: (70% of Cluster 2)
**Returns:**
> MSE, R2 Score, and Coefficients

In [29]:
# Polynomial-regression helper restricted to the cluster-2 rows.
# NOTE(review): the recorded R2 (~0.99) is far above the values in the trailing
# 3-element array (~0.27-0.29) -- looks like overfitting; confirm in model.py
# what each returned value is measured on.
model.pregression_test(cluster_two, xfeatures, yfeature, 0.70)

(285.4364797765754,
 0.9877238083035476,
 array([[-1.86059487e+07,  1.87777418e+07,  2.61462626e+07, ...,
          5.92446900e+06,  2.74963032e+03, -5.88602834e+01]]),
 array([0.26819784, 0.28668359, 0.28143518]))

## Interpret and report results

# Improve Results

## Algorithm Tuning

## Ensemble Methods

### Bagging

### Boosting

### Blending

## Extreme Feature Engineering