# Training EUR predictive algorithm

This notebook will take a dataframe that holds a training dataset, with additional features (from the completions data) merged, and then build a regression algorithm to predict EUR or EUR per foot (whichever ends up being more accurate).

We'll start with loading the packages and the data:

In [4]:
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,make_scorer
import pandas as pd
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_columns', None)
base_merged_train = pd.read_csv('base_train_merged.csv')
base_merged_train['Acid..Gals.'].fillna(0,inplace=True)
# Drop columns that might be correlated / carrying similar information as others
base_merged_train.drop(['Operator','County','Surface.Latitude','Surface.Longitude','Treatment.Records','Completion.Year','WellID'],axis=1,inplace=True)
base_merged_train.head()



Unnamed: 0.1,Unnamed: 0,Subarea,Completion.Date,Depth.Total.Driller..ft.,WB.Spacing.Proxy,SPBY.Spacing.Proxy,Deepest_Zone,Between_Zone,Frac.Stages,Fluid.Water..Gals.,Acid..Gals.,Gel.x.link..Gals.,Other..Gals.,Proppant...Total..lbs.,Fluid...Total..Gals.,EUR_o..Mstb.,Length,NStages,TotalFluid_Comp,Total_Proppant,Nrecords
0,308,B,37625,10500,160,40,WFMP,WFMP --> WFMP,3.0,256614.0,2000.0,0.0,0.0,96126.0,258614.0,53.487248,2754,10,1722010.0,896068.0,20
1,307,A,37628,10950,80,80,ATOKA,SPBR_U --> ATOKA,1.0,297696.0,0.0,0.0,0.0,371330.0,297696.0,41.692,2633,1,1042146.0,1024000.0,1
2,1362,C,37629,8805,Over 160,40,WFMP,CLFK --> WFMP,3.0,0.0,3500.0,176500.0,0.0,473330.0,180000.0,28.968,2155,1,974232.0,1120000.0,2
3,1363,C,37629,8750,160,40,WFMP,CLFK --> WFMP,3.0,0.0,5000.0,201474.0,0.0,471200.0,206474.0,18.522,2257,1,674604.0,929300.0,2
4,298,C,37629,8970,80,20,WFMP,SPBR_U --> WFMP,4.0,0.0,3200.0,191981.0,0.0,472170.0,195181.0,28.543885,2717,9,923000.0,738188.0,9


Here as we use sklearn, we need to transform categorical values to dummies using one hot encoding.



In [6]:
def dummy_val(train,test,columns = ['WB.Spacing.Proxy','Deepest_Zone','Subarea','SPBY.Spacing.Proxy','Between_Zone']):
    for column in columns:
        train[column] = train[column].apply(lambda x: str(x))
        test[column] = test[column].apply(lambda x: str(x))
        good_cols = [column+'_'+i for i in train[column].unique() if i in test[column].unique()]
        train = pd.concat((train, pd.get_dummies(train[column], prefix = column)[good_cols]), axis = 1)
        test = pd.concat((test, pd.get_dummies(test[column], prefix = column)[good_cols]), axis = 1)
        del train[column]
        del test[column]
    return train, test

In [10]:
# Prep RF and the CV objects:

grid = {      "min_samples_leaf" : [1, 5, 10],
              "min_samples_split" : [2, 4, 10, 12, 16],
              "n_estimators": [50, 150, 300]}
RFmodel = RandomForestRegressor(max_features='auto', oob_score=True, random_state=1)
gs_cv = GridSearchCV(RFmodel,scoring=make_scorer(mean_absolute_error),param_grid=grid,n_jobs=-1,cv=3,verbose=1)
Xtrain,_ = dummy_val(base_merged_train,base_merged_train)

# Crossvalidation techniques

Create instances of both spatial stratified CV and randomly picked CV.
## Stratification by Subarea


## Random folds




In [11]:
Xtrain.dropna(inplace=True)
Xtrain.shape

(1524, 73)

In [12]:
gs_cv.fit(Xtrain.drop(['EUR_o..Mstb.'],axis=1),Xtrain['EUR_o..Mstb.'])

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  1.8min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=True, random_state=1, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'min_samples_leaf': [1, 5, 10], 'min_samples_split': [2, 4, 10, 12, 16], 'n_estimators': [50, 150, 300]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(mean_absolute_error), verbose=1)

In [13]:
gs_cv.cv_results_



{'mean_fit_time': array([1.44528953, 4.45489812, 8.88123933, 1.25195297, 4.01079679,
        7.63383762, 1.09422135, 3.28987805, 6.73893181, 1.08870212,
        3.25173306, 6.49454538, 1.02578036, 3.14723913, 6.13187257,
        0.86906854, 2.61129093, 5.25324694, 0.85106222, 2.65863546,
        5.26141946, 0.87254063, 2.62610936, 5.15257319, 0.85209195,
        2.5858713 , 5.20077705, 0.80990839, 2.57879965, 4.99641005,
        0.70642344, 2.17699973, 4.23031219, 0.72793841, 2.13292885,
        4.20858955, 0.71635532, 2.14147298, 4.26632428, 0.70118014,
        2.1456333 , 4.20030642, 0.7092003 , 2.15500053, 3.57714947]),
 'std_fit_time': array([0.08128029, 0.42568325, 0.75029393, 0.11417068, 0.43096417,
        0.69423585, 0.12308422, 0.3532583 , 0.71817013, 0.13581866,
        0.31302457, 0.69838356, 0.10255781, 0.34993141, 0.72341203,
        0.06407005, 0.18573821, 0.38877267, 0.05297531, 0.22340019,
        0.3734479 , 0.0676234 , 0.14420781, 0.39341831, 0.05509228,
        0.194