In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Wrangling and load data
from acquire_prepare import acquire_oil
from acquire_prepare import prep_data

# feature selection
import explore
from explore import xgb_rank
from explore import merge_clusters2origdf

# model
import model
from model import get_scaled_df
from model import create_kclusters
from model import run_models
# temporary - will take these out when I move function to explore.py
from model import filter_columns

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import sys 

import seaborn as sns


**Bring in the DataFrame, using prep_data to select whatever subset you want to inspect**

In [2]:
# Load the dataset; the prep_data call further down narrows it to one sub-group.
df = acquire_oil()

# different sub-group options (swap the active prep_data call below for one of these)
# horizontals only,   df = prep_data(df,direc='horizontal')
# single well leases only, df = prep_data(df,direc='horizontal')
#   NOTE(review): the single-well example above is identical to the horizontals call,
#   which looks like a copy-paste slip -- confirm the correct prep_data argument.
# specific cluster, df = prep_data(df,clusterid=1)

# everything in database - use this one when clustering,  df = prep_data(df)
# Active selection: clusterid=0.
# NOTE(review): the saved output of this cell is
#   "TypeError: unorderable types: Interval() > str()"
# so the cell did not complete on its last recorded run -- TODO confirm which call raises.
df = prep_data(df,clusterid=0)
df.shape

TypeError: unorderable types: Interval() > str()

# Feature Selection

**XGBoost creates a ranked feature list used for feature selection in the models**
Let XGBoost reselect top features, regardless of which subgroup is being run

In [None]:
# Rank features against the `recovery` target with the project's xgb_rank helper.
# Of the three returned objects, only ranked_featurelist is used by the cells visible
# in this notebook (as the predictor list and as the clustering feature list).
ranked_featurelist, ranked_scaledlist, allfeatures_rankdf = xgb_rank(df, df.recovery)

In [None]:
# Predictors are the XGBoost-ranked features; the target is the `recovery` column.
xfeatures = ranked_featurelist
yfeature = ['recovery']
# presumably returns a scaled copy of df -- TODO confirm against model.get_scaled_df
scaled_df = get_scaled_df(df)

### Model Results - specific to the chosen sub-group and set of features

In [None]:
# Run the project's model suite on the scaled frame for the chosen sub-group.
# NOTE(review): 0.70 is presumably the train fraction of the train/test split -- confirm
# against model.run_models.
model.run_models(scaled_df, xfeatures, yfeature, 0.70)

**Now, build a new dataframe with an added column "clusterid"**

In [None]:
def merge_clusters2origdf(df, cluster_df, origxl='CapstoneData.xlsx'):
    """Write cluster ids back into the original Excel workbook.

    Pairs each ``api14`` in ``df`` with the ``clusterid`` in ``cluster_df``
    (aligned on ``df``'s index), left-merges that pairing onto the workbook
    stored at ``origxl``, and overwrites the workbook with the result.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe being run; must contain an ``api14`` column.
    cluster_df : pandas.DataFrame
        Frame with a ``clusterid`` column (the frame returned from
        ``create_kclusters``), indexed like ``df``.
    origxl : str, default 'CapstoneData.xlsx'
        Path of the workbook that is read and then overwritten.

    Returns
    -------
    pandas.DataFrame
        The workbook contents with a refreshed ``clusterid`` column. Rows whose
        ``api14`` does not appear in ``df`` end up with a missing ``clusterid``.

    Notes
    -----
    * Side effect: ``origxl`` is overwritten on disk.
    * This notebook-level definition shadows the ``merge_clusters2origdf``
      imported from ``explore`` in the import cell.
    """
    api_df = df[['api14']]
    cluster_id = cluster_df[['clusterid']]
    # Keep exactly df's rows. `join_axes=` was removed from pd.concat in pandas 1.0;
    # concat followed by reindex on the left index is the documented replacement.
    id_cluster = pd.concat([api_df, cluster_id], axis=1).reindex(api_df.index)
    orig_df = pd.read_excel(origxl).infer_objects()
    # Clear any existing clusterid column; tolerate a workbook that has none yet.
    orig_df = orig_df.drop(columns=['clusterid'], errors='ignore')
    orig_df = orig_df.rename(columns={'API14': 'api14'})
    xcel_df = orig_df.merge(id_cluster, how='left', left_on='api14', right_on='api14')
    # Overwrite the workbook: same data as before, except clusterid is refreshed
    # (observations not included in this run have their cluster ids cleared).
    xcel_df.to_excel(origxl, index=False)
    print('clusterid column appended to ',origxl)
    return xcel_df

In [None]:
# Cluster the scaled observations on the XGBoost-ranked features.
# NOTE(review): 3 is presumably the number of clusters and 'clusterid' the name of the
# output column (merge_clusters2origdf reads cluster_df[['clusterid']]) -- confirm
# against model.create_kclusters.
cluster_df = model.create_kclusters(scaled_df, ranked_featurelist, 3, 'clusterid')

**Let's see how many observations we have in each cluster**

In [None]:
# Count observations per cluster.
# NOTE(review): no visible cell adds a `cluster_id` column to df; the clustering cell
# stores its result in cluster_df under the name 'clusterid'. Confirm that df really
# carries `cluster_id` (e.g. from prep_data), otherwise this raises AttributeError.
df.cluster_id.value_counts()

In [None]:
# Export the working dataframe (index included, since index=False is not passed)
# to a separate workbook in the current working directory.
df.to_excel('CapstoneDataFinal.xlsx')

# ALL Regression Types on Each Cluster

### Cluster 0: 2331 Observations

In [None]:
# Rows of df assigned to cluster 0 (relies on df having a `cluster_id` column).
cluster_zero = df[df['cluster_id'] == 0]

In [None]:
# Run the same model suite on cluster 0 only.
# NOTE(review): the whole-dataset run passes scaled_df, whereas this passes a slice of
# the unscaled df -- confirm that is intended.
model.run_models(cluster_zero, xfeatures, yfeature, 0.70)

### Cluster 1: 3693 Observations

In [None]:
# Rows of df assigned to cluster 1 (relies on df having a `cluster_id` column).
cluster_one = df[df['cluster_id'] == 1]

In [None]:
# Run the same model suite on cluster 1 only.
# NOTE(review): the whole-dataset run passes scaled_df, whereas this passes a slice of
# the unscaled df -- confirm that is intended.
model.run_models(cluster_one, xfeatures, yfeature, 0.70)

### Cluster 2: 1864 Observations

In [None]:
# Rows of df assigned to cluster 2 (relies on df having a `cluster_id` column).
cluster_two = df[df['cluster_id'] == 2]

In [None]:
# Run the same model suite on cluster 2 only.
# NOTE(review): the whole-dataset run passes scaled_df, whereas this passes a slice of
# the unscaled df -- confirm that is intended.
model.run_models(cluster_two, xfeatures, yfeature, 0.70)

## Interpret and report results

# Improve Results

## Algorithm Tuning

## Ensemble Methods

### Bagging

### Boosting

### Blending

## Extreme Feature Engineering