In [None]:
# Modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import seaborn as sns
from explore import xgb_rank

# Feature Selection
from explore import get_scaled_df
import xgboost as xgb
from sklearn.decomposition import PCA

# Evaluate Algorithms

In [None]:
# ignore warnings
# NOTE: with no category/module arguments this filter silences every warning
# for the rest of the session, including deprecation warnings emitted by the
# libraries used below.
import warnings
warnings.filterwarnings("ignore")

# Wrangling
from acquire_prepare import acquire_oil
from acquire_prepare import prep_data

import model
from model import polynomial_regression_model
from model import run_models

import numpy as np
import pandas as pd

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import sys 

**Let's bring in the DataFrame**

In [None]:
# Acquire the raw data and run it straight through the project's prep step.
df = prep_data(acquire_oil())

**(Rows, Columns)**

In [None]:
# (rows, columns) of the prepared DataFrame.
df.shape

In [None]:
# Column labels available on the DataFrame returned by prep_data.
df.columns

### XGBoost Feature Recommender

In [None]:
# Rank features against `recovery` on the full DataFrame using XGBoost gain.
# NOTE(review): feature_percent=80 presumably keeps the features covering 80%
# of total importance -- confirm in explore.xgb_rank.
feature_list, scaled_features, importance_df = xgb_rank(
    df,
    df['recovery'],
    feature_percent=80,
    mode='gain',
)
feature_list

In [None]:
# Second value returned by xgb_rank above.
# NOTE(review): presumably the scaled counterparts of feature_list -- confirm
# in explore.xgb_rank.
scaled_features

In [None]:
# Fit a booster on the scaled, XGBoost-selected features and plot their
# importances.
xgb_df = get_scaled_df(df).drop(columns=['recovery'])
xgb_df = xgb_df[feature_list]

xgb_params = {
    'max_depth': 8,
    'seed' : 493
}

# NOTE(review): the label here is `oil_hist`, while xgb_rank above ranked the
# features against `recovery` -- confirm which target is intended.
dtrain = xgb.DMatrix(
    xgb_df,
    label=df.oil_hist,
    feature_names=list(xgb_df.columns),  # plain list, not a NumPy array
)

# `silent` is deprecated/removed in XGBoost; `verbosity=1` (warnings only) is
# the documented replacement for the old non-silent setting.
xgb_model = xgb.train(dict(xgb_params, verbosity=1), dtrain, num_boost_round=50)

# plot the important features #
fig, ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(xgb_model, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
# Third value returned by xgb_rank above; its `feature` and `gain` columns are
# plotted further down.
importance_df

In [None]:
# Hand-entered list of seven feature names (un-encoded column names).
sorted_list = [
    'tvd',
    'vintage',
    'gross_perfs',
    'mid_point_lat',
    'mid_point_long',
    'formation',
    'direction',
]

In [None]:
# Bar chart of `gain` for the first seven rows of importance_df.
top_features = importance_df.head(7)

fig, ax = plt.subplots()
ax.bar(top_features.feature, top_features.gain)
# Rotate after plotting so the setting applies to the categorical tick labels
# created by the bar call rather than to the empty axes' default ticks.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('feature')
ax.set_ylabel('gain')
plt.show()

# Test Harness and Options

### Results for the entire dataframe to use as a baseline before clustering

In [None]:
# Re-display the features selected by xgb_rank on the full DataFrame.
feature_list

In [None]:
# Predictors and target for the baseline (un-clustered) model run.
# Categorical columns are referenced through their `encoded_` versions.
xfeatures = [
    'tvd',
    'vintage',
    'gross_perfs',
    'mid_point_lat',
    'mid_point_long',
    'encoded_formation',
    'encoded_direction',
]
yfeature = ['recovery']

In [None]:
# Baseline: run the project's model suite on the whole DataFrame.
# NOTE(review): 0.80 is presumably the train-split fraction -- confirm in
# model.run_models. The per-cluster runs below pass 0.70 instead.
model.run_models(df, xfeatures, yfeature, 0.80)

**Let's Cluster by our Top Seven features per our XGBoost function**

In [None]:
# Features selected by xgb_rank on the full DataFrame.
feature_list

In [None]:
# Second value returned by xgb_rank on the full DataFrame; used as the
# clustering columns in the next code cell.
scaled_features

#### Had to use encoded_direction:

In [None]:
# Cluster on the scaled_features value returned by xgb_rank.
col_list = scaled_features

**We now have a new dataframe with an added column "cluster_id"**

In [None]:
# Rebinds `df` to the result of create_kclusters; re-running this cell feeds
# the already-clustered frame back in.
# NOTE(review): 3 is presumably the number of clusters and 'cluster_id' the
# name of the added column -- confirm in model.create_kclusters.
df = model.create_kclusters(df, col_list, 3, 'cluster_id')

**Let's see how many observations we have in each cluster**

In [None]:
# Number of rows assigned to each cluster id.
df['cluster_id'].value_counts()

**Let's take a look at the differences between the clusters**

In [None]:
# Per-cluster summary statistics for the clustering features, transposed so
# clusters are columns.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names is deprecated in pandas 1.0 and raises in pandas 2.0.
cluster_summary_cols = [
    'tvd',
    'vintage',
    'gross_perfs',
    'mid_point_long',
    'mid_point_lat',
    'formation',
    'direction',
]
df.groupby('cluster_id')[cluster_summary_cols].describe().T

In [None]:
# Count of each `direction` value within every cluster.
df.groupby('cluster_id').direction.value_counts()

**A few key takeaways:**

**Cluster 1 (verticals) contains all, and solely, vertical wells**

**Cluster 0 (short_horizontals) contains horizontal wells that tend to be shorter in lateral length/gross perfs**

**Cluster 2 (long_horizontals) contains horizontal wells that tend to be longer in lateral length/gross perfs**

In [None]:
# Write the clustered DataFrame (including cluster_id) to an Excel workbook in
# the current working directory.
df.to_excel(excel_writer='CapstoneDataFinal.xlsx')

# ALL Regression Types on Each Cluster

### Cluster 0: 4,019 Observations -- observation count did not change

In [None]:
# Rows assigned to cluster 0.
short_horizontals = df.loc[df.cluster_id.eq(0)]

In [None]:
# (rows, columns) of the cluster 0 subset.
short_horizontals.shape

In [None]:
# Re-rank features against `recovery` using only the cluster 0 rows.
# This overwrites feature_list / scaled_features / importance_df from the
# earlier xgb_rank call.
feature_list, scaled_features, importance_df = xgb_rank(
    short_horizontals,
    short_horizontals['recovery'],
    feature_percent=80,
    mode='gain',
)
feature_list

#### Had to use encoded_oper.

In [None]:
# Predictors and target for the cluster 0 model run.
xfeatures = [
    'vintage',
    'gross_perfs',
    'mid_point_lat',
    'mid_point_long',
    'encoded_oper',
    'tvd',
    'gor_hist',
]
yfeature = ['recovery']

In [None]:
# Run the project's model suite on the cluster 0 subset.
# NOTE(review): 0.70 is presumably the train-split fraction -- confirm in
# model.run_models. The baseline run on the full DataFrame passes 0.80.
model.run_models(short_horizontals, xfeatures, yfeature, 0.70)

### Cluster 1:  1,224 Observations, down from 1,714 Observations

In [None]:
# Rows assigned to cluster 1.
verticals = df.loc[df.cluster_id.eq(1)]

In [None]:
# (rows, columns) of the cluster 1 subset.
verticals.shape

In [None]:
# Re-rank features against `recovery` using only the cluster 1 rows.
# This overwrites feature_list / scaled_features / importance_df from the
# previous xgb_rank call.
feature_list, scaled_features, importance_df = xgb_rank(
    verticals,
    verticals['recovery'],
    feature_percent=80,
    mode='gain',
)
feature_list

#### Had to use encoded_lateral_class and encoded_formation.

In [None]:
# Predictors and target for the cluster 1 model run.
xfeatures = [
    'vintage',
    'gor_hist',
    'gross_perfs',
    'tvd',
    'encoded_formation',
]
yfeature = ['recovery']

In [None]:
# Run the project's model suite on the cluster 1 subset.
# NOTE(review): 0.70 is presumably the train-split fraction -- confirm in
# model.run_models. The baseline run on the full DataFrame passes 0.80.
model.run_models(verticals, xfeatures, yfeature, 0.70)

### Cluster 2: 2,565 Observations, up from 1,232 Observations

In [None]:
# Rows assigned to cluster 2.
long_horizontals = df.loc[df.cluster_id.eq(2)]

In [None]:
# (rows, columns) of the cluster 2 subset.
long_horizontals.shape

In [None]:
# Re-rank features against `recovery` using only the cluster 2 rows.
# This overwrites feature_list / scaled_features / importance_df from the
# previous xgb_rank call.
feature_list, scaled_features, importance_df = xgb_rank(
    long_horizontals,
    long_horizontals['recovery'],
    feature_percent=80,
    mode='gain',
)
feature_list

#### Had to use encoded_formation.

In [None]:
# Predictors and target for the cluster 2 model run.
xfeatures = [
    'vintage',
    'mid_point_lat',
    'encoded_formation',
    'gross_perfs',
    'mid_point_long',
    'tvd',
    'frac_fluid_gpf',
]
yfeature = ['recovery']

In [None]:
# Run the project's model suite on the cluster 2 subset.
# NOTE(review): 0.70 is presumably the train-split fraction -- confirm in
# model.run_models. The baseline run on the full DataFrame passes 0.80.
model.run_models(long_horizontals, xfeatures, yfeature, 0.70)

## Interpret and report results

# Improve Results

## Algorithm Tuning

## Ensemble Methods

### Bagging

### Boosting

### Blending

## Extreme Feature Engineering