In [1]:
import pandas as pd

Bedforms = pd.read_csv('RandomForest/BedformData.csv')
y = Bedforms['Y/N'] #results data that will be used to compare to the trained output
X = Bedforms[['Topo', 'Bed', 'Elong', 'Area']] #input data that will be used to train the results

In [2]:
#we are changing all string inputs to binarized integers so they can be processed by the model.
X['Topo'][X['Topo'] == 'O']=1
X['Topo'][X['Topo'] == 'V']=0

X['Bed'][X['Bed'] == 'C']=1
X['Bed'][X['Bed'] == 'S']=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Topo'][X['Topo'] == 'O']=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Topo'][X['Topo'] == 'V']=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Bed'][X['Bed'] == 'C']=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Bed'][X['Bed'] == 'S']=0


In [7]:
X.dtypes

Topo     int8
Bed      int8
Elong    int8
Area     int8
dtype: object

In [3]:
X = X.astype('int8') #change all columns of the input dataset to integers from string

In [4]:
from sklearn.ensemble import RandomForestClassifier
#adding controls to get a better fit
#n_estimators - how many "trees" are in the "forest", default was 100 making it wider to 1000 is approx 2%, increasing testing tuning
#min_samples_leaf - control of number of times it runs-- average of more runs, changing from 8 to 50
#random_state - reproducibility, not the modeling or stats aspect
rf_model = RandomForestClassifier(n_estimators=100, min_samples_leaf=5, min_samples_split=150, random_state=42, n_jobs=-1) #using all 4 cpus
rf_model.fit(X, y)

In [5]:
#find random selection of the data to test the model, try using 10% 
#these are the same rows for X and y because of random_state bits chosing the same columns (this is how it can be reproduced)
X_test = X.sample(frac = 0.2, random_state=42)
y_test = y.sample(frac = 0.2, random_state=42)

In [6]:
y_est = rf_model.predict(X_test) #same this as saying y_pred

In [7]:
#using the 10% of data we selected, we're determining how many predictions are right out of the number it makes
import matplotlib.pyplot as plt
print(f'Prediction Count: {len(y_est)}')
print(f'Correct Predictions: {len(y_est[y_est == y_test])}')

Prediction Count: 117159
Correct Predictions: 114180


In [None]:
#calculate the percentage of correct predictions out of total predictions for 10% random data selection
114180/117159

In [5]:
#cross-validate a grid, it'll run all options and determine the best one
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
#coarse-grid validation
param_grid = {
    'max_features': ['sqrt'],
    'min_samples_leaf': [2, 5, 10],
    'min_samples_split': [5, 10, 25, 50, 75, 100],
    'n_estimators': [100, 500, 1000, 2500]}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid,
                           cv=KFold(n_splits=5, shuffle=True, random_state=42), verbose=2) #changing when it updates me

grid_search.fit(X, y)

#after this, depending on the best fit, we will run a higher resolution grid validation

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  32.5s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  33.4s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  33.1s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  34.2s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  33.1s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time= 2.7min
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time= 2.7min
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time= 2.7min
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=500; tot

In [7]:
import joblib
joblib.dump(grid_search.best_estimator_, 'RandomForest.pkl')

['RandomForest.pkl']

In [9]:
grid_search.best_estimator_

In [3]:
import joblib
grid_search = joblib.load('RandomForest.pkl')
grid_search.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 5,
 'min_samples_split': 100,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [9]:
#cross-validate a grid, it'll run all options and determine the best one
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
#coarse-grid validation
param_grid = {
    'max_features': ['sqrt'],
    'min_samples_leaf': [2, 5, 10],
    'min_samples_split': [100, 150, 200, 500],
    'n_estimators': [10, 50, 75, 100]}
grid_search_small = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid,
                           cv=KFold(n_splits=5, shuffle=True, random_state=42), verbose=2) #changing when it updates me

grid_search_small.fit(X, y)

#after this, depending on the best fit, we will run a higher resolution grid validation

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=10; total time=   3.7s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=10; total time=   3.8s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=10; total time=   3.6s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=10; total time=   3.5s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=10; total time=   3.8s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=50; total time=  17.8s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=50; total time=  17.9s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimators=50; total time=  17.8s
[CV] END max_features=sqrt, min_samples_leaf=2, min_samples_split=100, n_estimator

In [11]:
import joblib
joblib.dump(grid_search_small.best_estimator_, 'RandomForest_small.pkl')

['RandomForest_small.pkl']

In [12]:
grid_search_small = joblib.load('RandomForest_small.pkl')
grid_search_small.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 5,
 'min_samples_split': 150,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [None]:
#find random selection of the data to test the model, try using 10% 
#these are the same rows for X and y because of random_state bits chosing the same columns (this is how it can be reproduced)
X_test = X.sample(frac = 0.2, random_state=42)
y_test = y.sample(frac = 0.2, random_state=42)

In [None]:
y_est = rf_model.predict(X_test) #same this as saying y_pred

In [None]:
#using the 10% of data we selected, we're determining how many predictions are right out of the number it makes
import matplotlib.pyplot as plt
print(f'Prediction Count: {len(y_est)}')
print(f'Correct Predictions: {len(y_est[y_est == y_test])}')

In [13]:
pip install session_info

Collecting session_info
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting stdlib_list
  Downloading stdlib_list-0.10.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: session_info
  Building wheel for session_info (setup.py) ... [?25ldone
[?25h  Created wheel for session_info: filename=session_info-1.0.0-py3-none-any.whl size=8026 sha256=f962515776d9e85cefdccc92ddef0faaf914e8404b03b2ca4fe10a64ff81149f
  Stored in directory: /home/jovyan/.cache/pip/wheels/eb/4a/d2/ce798b0ff7bd8cc299e4e0eee863d4f792eddbc3c90af1adec
Successfully built session_info
Installing collected packages: stdlib_list, session_info
Successfully installed session_info-1.0.0 stdlib_list-0.10.0
Note: you may need to restart the kernel to use updated packages.


In [14]:
import session_info
session_info.show()