# Install some further dependencies

In [13]:
!pip install onnxmltools

Collecting onnxmltools
  Using cached https://files.pythonhosted.org/packages/e5/dd/0530dbe2d76a3c2dacc5ecc7f3561ffeae65967cf350fde776ef4f99c10d/onnxmltools-1.2.2.129-py2.py3-none-any.whl
Collecting onnx (from onnxmltools)
  Using cached https://files.pythonhosted.org/packages/5b/dc/a618db796e343aeacdf6dd0ffc672d937531bd2eba29733137df595bdfd3/onnx-1.3.0-cp36-cp36m-manylinux1_x86_64.whl
Installing collected packages: onnx, onnxmltools
Successfully installed onnx-1.3.0 onnxmltools-1.2.2.129
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Imports

In [22]:
import io

import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from onnxmltools import convert_sklearn
from onnxmltools.convert.common.data_types import Int64TensorType
import onnxmltools

# Retrieve the data

In [15]:
s3_client = boto3.client('s3')
data_bucket_name='datadan'

In [16]:
response = s3_client.get_object(Bucket=data_bucket_name, Key='practical-ai/section3/grad_school.csv')
response_body = response["Body"].read()
data = pd.read_csv(io.BytesIO(response_body), header=0, delimiter=",", low_memory=False)
data.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


# Pre-process data

In [17]:
# scale the loan data
X = data[['gre','gpa','rank']]
X = MinMaxScaler().fit_transform(X)
data_scaled = pd.DataFrame(X, columns=['gre','gpa','rank']).join(data['admit'])

# split the data into training and test sets
train, test = train_test_split(data_scaled, test_size=0.2)

train.head()

Unnamed: 0,gre,gpa,rank,admit
62,0.724138,0.810345,0.666667,0
391,0.758621,0.931034,0.333333,1
186,0.586207,0.775862,0.666667,0
59,0.655172,0.321839,1.0,0
251,0.689655,0.545977,1.0,0


In [18]:
print(len(train))
print(len(test))

320
80


# Hyperparameter tuning for a random forest, model export

## Hyperparameter tuning

In [20]:
# evaluate various numbers of trees in the random forest
acc_vs_trees = []
for num_trees in range(2,101):

    # define our random forest model
    clf = RandomForestClassifier(n_estimators=num_trees)

    # "fit" our model
    clf.fit(train[['gre','gpa','rank']], train['admit'])

    # calculate our prediction on the test set
    predictions = clf.predict(test[['gre','gpa','rank']])

    # calculate our accuracy
    acc = accuracy_score(test['admit'], predictions)
    acc_vs_trees.append([num_trees, acc])
    
# determine the best number of trees
acc_df = pd.DataFrame(acc_vs_trees, columns=['num_trees', 'accuracy'])
acc_df[acc_df['accuracy'] == acc_df['accuracy'].min()]

Unnamed: 0,num_trees,accuracy
7,9,0.6125


## Export the best scikit-learn random forest model

In [24]:
# define our random forest model
clf = RandomForestClassifier(n_estimators=9)

# "fit" our model
clf.fit(train[['gre','gpa','rank']], train['admit'])

# convert the model to ONNX
model_onnx = convert_sklearn(clf, 'tree-based binary classifier', [('input', Int64TensorType([1, 2]))])

# export the model
onnxmltools.utils.save_model(model_onnx, 'random_forest.onnx')

simple model: <class 'sklearn.ensemble.forest.RandomForestClassifier'> 
