# Install some further dependencies

In [2]:
!pip install onnxmltools

Collecting onnxmltools
  Downloading https://files.pythonhosted.org/packages/e5/dd/0530dbe2d76a3c2dacc5ecc7f3561ffeae65967cf350fde776ef4f99c10d/onnxmltools-1.2.2.129-py2.py3-none-any.whl (244kB)
    100% |████████████████████████████████| 245kB 7.5MB/s ta 0:00:01
Installing collected packages: onnxmltools
Successfully installed onnxmltools-1.2.2.129
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [10]:
!pip install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/78/7e/bc87e7951cfaa998cffaf39e6c721f5bd04efb2e139486206356edb289a5/lightgbm-2.2.1-py2.py3-none-manylinux1_x86_64.whl (1.1MB)
    100% |████████████████████████████████| 1.1MB 25.0MB/s ta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.1
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


# Imports

In [11]:
import io

import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from onnxmltools import convert_sklearn
from onnxmltools.convert.common.data_types import Int64TensorType
from onnxmltools.convert.common.data_types import FloatTensorType
import onnxmltools

# Retrieve the data

In [4]:
# name of the S3 bucket the data set is read from
data_bucket_name = 'datadan'

# client used to talk to S3
s3_client = boto3.client('s3')

In [5]:
# fetch the CSV object from S3 and read its raw bytes into memory
response = s3_client.get_object(
    Bucket=data_bucket_name,
    Key='practical-ai/section3/grad_school.csv',
)
response_body = response["Body"].read()

# parse the bytes as comma-separated text, taking column names from the first row
data = pd.read_csv(
    io.BytesIO(response_body),
    header=0,
    delimiter=",",
    low_memory=False,
)

# preview the first rows
data.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


# Pre-process data

In [6]:
# scale the admissions features (gre, gpa, rank) to the [0, 1] range
feature_cols = ['gre', 'gpa', 'rank']
X = MinMaxScaler().fit_transform(data[feature_cols])

# fit_transform returns a bare array, so carry data's index over explicitly;
# the join with the 'admit' label column then aligns rows by label rather
# than relying on both frames happening to share a default index
data_scaled = pd.DataFrame(X, columns=feature_cols, index=data.index).join(data['admit'])

# split the data into training and test sets
# (fixed seed so that re-running the notebook yields the same split)
train, test = train_test_split(data_scaled, test_size=0.2, random_state=42)

train.head()

Unnamed: 0,gre,gpa,rank,admit
187,0.62069,0.356322,0.333333,0
281,0.241379,0.58046,0.666667,0
328,0.62069,0.62069,0.333333,0
125,0.551724,0.643678,1.0,0
369,1.0,0.936782,0.333333,0


In [7]:
# report the row count of the training set, then of the test set, one per line
for split in (train, test):
    print(len(split))

320
80


# Hyperparameter tuning for a random forest, model export

## Hyperparameter tuning

In [8]:
# evaluate various numbers of trees in the random forest
#
# NOTE(review): every candidate is scored on `test`, so the winning accuracy
# is an optimistic estimate; consider a separate validation split or
# cross-validation if the test score is meant to be reported.
feature_cols = ['gre', 'gpa', 'rank']

# the feature/label slices are the same for every candidate, so build them once
train_features, train_labels = train[feature_cols], train['admit']
test_features, test_labels = test[feature_cols], test['admit']

acc_vs_trees = []
for num_trees in range(2, 101):

    # define our random forest model; the fixed seed keeps the comparison
    # between tree counts repeatable from one run to the next
    clf = RandomForestClassifier(n_estimators=num_trees, random_state=42)

    # "fit" our model
    clf.fit(train_features, train_labels)

    # calculate our prediction on the test set
    predictions = clf.predict(test_features)

    # calculate our accuracy
    acc = accuracy_score(test_labels, predictions)
    acc_vs_trees.append([num_trees, acc])

# determine the best number of trees (shows every row tied for the top accuracy)
acc_df = pd.DataFrame(acc_vs_trees, columns=['num_trees', 'accuracy'])
acc_df[acc_df['accuracy'] == acc_df['accuracy'].max()]

Unnamed: 0,num_trees,accuracy
4,6,0.7125
12,14,0.7125
33,35,0.7125
67,69,0.7125
73,75,0.7125


## Export the best scikit-learn random forest model

In [12]:
# use the tree count that scored best during tuning instead of a hard-coded
# value that goes stale on re-run; idxmax returns the first maximum, which
# is the smallest forest among tied candidates
best_num_trees = int(acc_df.loc[acc_df['accuracy'].idxmax(), 'num_trees'])

# define our random forest model (fixed seed so the exported model is reproducible)
clf = RandomForestClassifier(n_estimators=best_num_trees, random_state=42)

# "fit" our model
clf.fit(train[['gre','gpa','rank']], train['admit'])

# convert the model to ONNX
# the model is trained on min-max scaled features, i.e. floats in [0, 1], so
# the input must be declared as a float tensor of shape [1, 3]; an int64
# input could not carry those fractional values
model_onnx = convert_sklearn(clf, 'tree-based binary classifier', [('input', FloatTensorType([1, 3]))])

# export the model
onnxmltools.utils.save_model(model_onnx, 'random_forest.onnx')

simple model: <class 'sklearn.ensemble.forest.RandomForestClassifier'> 
