In [1]:
# Modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Feature Selection
import xgboost as xgb
from sklearn.decomposition import PCA

# Evaluate Algorithms

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by the
# libraries used below; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# Wrangling
from acquire_prepare import acquire_oil
from acquire_prepare import prep_data

import model
import explore
import numpy as np
import pandas as pd

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import sys 

**Let's bring in the DataFrame**

In [3]:
# Fetch the frame from acquire_oil() and hand it straight to prep_data().
df = prep_data(acquire_oil())

**(Rows, Columns)**

In [4]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(7888, 62)

**We need to make sure the numeric types are correct**

In [5]:
# dtype names treated as numeric when filtering columns
numerics = ['int64', 'float64']

# Keep only the int64/float64 columns of df; columns of any other dtype are left out.
num_df = df.select_dtypes(include=numerics)

In [6]:
# Numeric columns removed before modeling, listed once so the drop is easy to audit.
columns_to_drop = ['gas_hist', 'recovery', 'recovery_per_foot',
                   'recovery_per_month', 'api14',
                   'sur_long', 'sur_lat', 'mid_point_lat', 'mid_point_long',
                   'well_id', 'oil_hist']

# Rebuild num_df from df instead of from num_df itself so this cell is
# idempotent: re-running it no longer raises KeyError on columns that an
# earlier run already dropped. On a fresh top-to-bottom run the result is
# the same as dropping from the previously selected numeric frame.
num_df = df.select_dtypes(include=numerics).drop(columns=columns_to_drop)

### Use LabelEncoder to encode county and frac_fluid_type

In [7]:
# Integer-encode the county labels from df and store the codes on num_df.
encoder = LabelEncoder()
num_df['county'] = encoder.fit_transform(df.county)

In [8]:
# Fresh encoder for the second categorical column: integer-encode
# frac_fluid_type from df and store the codes on num_df.
encoder = LabelEncoder()
num_df['frac_fluid_type'] = encoder.fit_transform(df.frac_fluid_type)

### Min-Max Scale Numeric Data

In [9]:
# Columns rescaled with min-max scaling: (x - min) / (max - min).
minmax_columns = ['proppant_ppf', 'frac_fluid_gpf', 'tvd', 'frac_stages',
                  'oil_gravity', 'peak_boepd', 'gor_hist', 'ip90_boeqpd',
                  'months_active', 'vintage']

# Work on a copy so num_df keeps the unscaled values.
scaled_df = num_df.copy()

# One loop replaces ten copy-pasted expressions; the arithmetic per column is
# unchanged. A constant column (max == min) still divides by zero here, exactly
# as the per-column expressions did.
for column in minmax_columns:
    col_min = scaled_df[column].min()
    col_max = scaled_df[column].max()
    scaled_df[column] = (scaled_df[column] - col_min) / (col_max - col_min)

In [10]:
# Independent copy of the scaled frame; it is the feature matrix handed to xgb.DMatrix below.
xgb_df = scaled_df.copy()

In [None]:
xgb_params = {
    'max_depth': 8,
    'seed' : 493
}

# Features come from xgb_df; the label is the oil_hist column of df.
dtrain = xgb.DMatrix(xgb_df, label=df.oil_hist, feature_names=list(xgb_df.columns))

# Bound to xgb_model rather than `model`: the notebook imports a project module
# named `model`, and later cells call its functions (model.lregression_test, ...).
# Rebinding `model` to the trained booster would break those calls when the
# notebook is run top to bottom.
# NOTE(review): `silent` is deprecated in recent XGBoost releases in favour of
# `verbosity`; confirm against the installed version.
xgb_model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)

# plot the important features #
fig, ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(xgb_model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Calls the project's explore.xgb_rank with the full frame and its recovery
# column, and keeps the result in ranked_features.
# NOTE(review): this cell has no execution count in the saved notebook; confirm
# it runs on a fresh kernel.
ranked_features = explore.xgb_rank(df, df.recovery)

# Test Harness and Options

In [11]:
# Feature-column names passed as the second argument to the model.*_test helpers.
xfeatures = [
    'scaled_oil_gravity',
    'scaled_tvd',
    'scaled_vintage',
    'scaled_gross_perfs',
    'scaled_mid_point_lat',
    'scaled_mid_point_long',
    'scaled_formation',
    'scaled_frac_stages',
    'scaled_frac_fluid_gpf',
]

# Target column name, kept as a one-element list.
yfeature = ['recovery']

### Linear Regression Model: (DataFrame)
**Returns:**
> MSE, R2 Score, and Coefficients

In [12]:
# Linear regression helper on the full frame, with xfeatures as inputs and yfeature as target.
# 0.70 is the fourth positional argument, presumably the training fraction; confirm in model.py.
model.lregression_test(df, xfeatures, yfeature, 0.70)

(13905.018992080732,
 0.2854400675833697,
 array([[250.30024096, 115.2345652 , -70.54714336, 203.52419749,
          96.46689657, -29.0502284 ,  51.72282918, -33.76938055,
          25.00698429]]),
 array([0.29456   , 0.28519117, 0.26362922]))

### Ridge Regression Model: (DataFrame)
**Returns:**
> MSE, R2 Score, and Coefficients

In [13]:
# Ridge regression helper on the full frame, called with the same arguments as the linear run.
model.rregression_test(df, xfeatures, yfeature, 0.70)

(13659.085346125445,
 0.29807826171549356,
 array([[ 230.02460689,  105.73265431, -186.29647727,  238.98593132,
           48.33557457,  -89.40195877,   48.0666349 ,  -36.0662514 ,
           33.84206358]]),
 array([0.3090503 , 0.29118106, 0.28010626]))

### Polynomial Regression Model: (DataFrame)
**Returns:**
> MSE, R2 Score, and Coefficients

In [14]:
# Polynomial regression helper on the full frame, called with the same arguments as the linear run.
# NOTE(review): the recorded output shows coefficients on the order of 1e13;
# worth checking the fit for ill-conditioning.
model.pregression_test(df, xfeatures, yfeature, 0.70)

(8136.3457908022165,
 0.5818843036672487,
 array([[-1.49932842e+13,  7.48228818e+12, -6.60678205e+10, ...,
         -7.25494385e-01,  4.48397064e+00,  2.70544434e+00]]),
 array([0.34339018, 0.31443862, 0.31580838]))

## Explore and select algorithms

**Let's Cluster by our Top Seven features per our XGBoost function**

In [15]:
# Column names handed to model.create_kclusters as the clustering inputs.
col_list = [
    'scaled_proppant_ppf',
    'scaled_gross_perfs',
    'scaled_tvd',
    'scaled_frac_fluid_gpf',
    'scaled_gor_hist',
    'encoded_oper',
    'scaled_vintage',
]

**We now have a new dataframe with an added column "cluster_id"**

In [16]:
# Returns the frame with a 'cluster_id' column built from col_list. 3 is the third
# positional argument, presumably the number of clusters (the counts below show ids 0-2).
# NOTE(review): df is rebound to the result, so re-running this cell feeds the
# already-clustered frame back in.
df = model.create_kclusters(df, col_list, 3, 'cluster_id')

**Let's see how many observations we have in each cluster**

In [17]:
# Row count per cluster id, largest cluster first.
df['cluster_id'].value_counts()

1    3693
0    2331
2    1864
Name: cluster_id, dtype: int64

# ALL Regression Types on Each Cluster

### Cluster 0: 2331 Observations

In [18]:
# Rows of df whose cluster_id equals 0.
cluster_zero = df.loc[df['cluster_id'].eq(0)]

### Linear Regression Model: (70% of Cluster 0)
**Returns:**
> MSE, R2 Score, and Coefficients

In [19]:
# Linear regression helper restricted to the cluster 0 rows; other arguments match the whole-frame run.
model.lregression_test(cluster_zero, xfeatures, yfeature, 0.70)

(16658.329117479334,
 0.28355043563862303,
 array([[ 166.77021447,   69.37454928, -120.30411019,  294.3857654 ,
           48.33148693,   51.21388778,   53.51133605,  -38.15422814,
           31.39166919]]),
 array([0.2713808 , 0.30295634, 0.25629123]))

### Ridge Regression Model: (70% of Cluster 0)
**Returns:**
> MSE, R2 Score, and Coefficients

In [20]:
# Ridge regression helper restricted to the cluster 0 rows; other arguments match the whole-frame run.
model.rregression_test(cluster_zero, xfeatures, yfeature, 0.70)

(16600.855132084085,
 0.28602230490643166,
 array([[ 155.11574089,   70.86248596, -169.00396033,  307.971811  ,
           18.02654085,    8.69125987,   46.42465947,  -43.96357454,
           37.47291304]]),
 array([0.2756961 , 0.30412149, 0.25300147]))

### Polynomial Regression Model: (70% of Cluster 0)
**Returns:**
> MSE, R2 Score, and Coefficients

In [21]:
# Polynomial regression helper restricted to the cluster 0 rows.
# NOTE(review): the recorded output again shows coefficients near 1e13, and the
# last array (0.21-0.31) sits well below the 0.66 reported as the second value.
model.pregression_test(cluster_zero, xfeatures, yfeature, 0.70)

(7831.570041683531,
 0.6631760061252292,
 array([[-1.79516411e+13,  8.70944899e+12, -1.01091237e+13, ...,
          1.24839105e+08,  1.06718182e+02,  6.15136719e+00]]),
 array([0.31324604, 0.30346986, 0.21414691]))

### Cluster 1: 3693 Observations

In [22]:
# Rows of df whose cluster_id equals 1.
cluster_one = df.loc[df['cluster_id'].eq(1)]

### Linear Regression Model: (70% of Cluster 1)
**Returns:**
> MSE, R2 Score, and Coefficients

In [23]:
# Linear regression helper restricted to the cluster 1 rows; other arguments match the whole-frame run.
model.lregression_test(cluster_one, xfeatures, yfeature, 0.70)

(15517.51151246459,
 0.28614236956332384,
 array([[203.96071471, 114.12346621, -27.40984845, 175.59418369,
         127.97764385, -81.34020241,  47.89459981,  -3.45176898,
          21.52889006]]),
 array([0.28026089, 0.25500495, 0.30896395]))

### Ridge Regression Model: (70% of Cluster 1)
**Returns:**
> MSE, R2 Score, and Coefficients

In [24]:
# Ridge regression helper restricted to the cluster 1 rows; other arguments match the whole-frame run.
model.rregression_test(cluster_one, xfeatures, yfeature, 0.70)

(15130.302473658707,
 0.30395528542316386,
 array([[ 174.09512607,   99.25218486, -161.62082258,  210.18168241,
           56.73803502, -156.28715461,   43.68469977,   -3.19092141,
           35.14941311]]),
 array([0.2931113 , 0.27726003, 0.32629665]))

### Polynomial Regression Model: (70% of Cluster 1)
**Returns:**
> MSE, R2 Score, and Coefficients

In [25]:
# Polynomial regression helper restricted to the cluster 1 rows.
# NOTE(review): the recorded output shows coefficients on the order of 1e12;
# check the fit for ill-conditioning before relying on the reported score.
model.pregression_test(cluster_one, xfeatures, yfeature, 0.70)

(7454.900910249445,
 0.6570495279055439,
 array([[-6.10567203e+12,  2.68875622e+12, -9.60536542e+11, ...,
          7.76449022e+07, -6.52931595e+00,  3.25341797e+00]]),
 array([0.32574976, 0.286108  , 0.33658491]))

### Cluster 2: 1864 Observations

In [26]:
# Rows of df whose cluster_id equals 2.
cluster_two = df.loc[df['cluster_id'].eq(2)]

### Linear Regression Model: (70% of Cluster 2)
**Returns:**
> MSE, R2 Score, and Coefficients

In [27]:
# Linear regression helper restricted to the cluster 2 rows; other arguments match the whole-frame run.
model.lregression_test(cluster_two, xfeatures, yfeature, 0.70)

(7963.078638001417,
 0.2949634007672203,
 array([[ 256.12024823,  134.12094347, -103.75267652,  155.43182392,
           54.8408612 ,    4.75583044,   88.94667919,   -3.22715964,
           16.01141914]]),
 array([0.34616891, 0.18726556, 0.32370302]))

### Ridge Regression Model: (70% of Cluster 2)
**Returns:**
> MSE, R2 Score, and Coefficients

In [28]:
# Ridge regression helper restricted to the cluster 2 rows; other arguments match the whole-frame run.
model.rregression_test(cluster_two, xfeatures, yfeature, 0.70)

(7862.457348703379,
 0.30387222797364255,
 array([[ 2.43490480e+02,  1.15378450e+02, -1.61439322e+02,
          1.77097036e+02,  2.65562905e+01, -3.19128447e+01,
          8.23030420e+01, -1.10132743e-01,  2.16694581e+01]]),
 array([0.3557353, 0.1997241, 0.3276788]))

### Polynomial Regression Model: (70% of Cluster 2)
**Returns:**
> MSE, R2 Score, and Coefficients

In [29]:
# Polynomial regression helper restricted to the cluster 2 rows.
# NOTE(review): in the recorded output the last array contains -25.56 while the
# second value is 0.74; that gap suggests the fit does not generalize. Investigate.
model.pregression_test(cluster_two, xfeatures, yfeature, 0.70)

(2972.869567901953,
 0.736787498227913,
 array([[ 1.02887107e+11,  6.07191650e+11,  7.61203274e+12, ...,
          3.61841973e+07, -2.61620331e+01,  7.22473145e+00]]),
 array([-2.55619409e+01,  2.48350297e-02,  1.98969820e-01]))

## Interpret and report results

# Improve Results

## Algorithm Tuning

## Ensemble Methods

### Bagging

### Boosting

### Blending

## Extreme Feature Engineering