In [12]:
import pandas as pd
import numpy as np

SEED = 42

In [14]:
%%time

dtypes = {
    'CLASS': 'uint8',  # (0, 255)
    'X': 'uint16',  # (0, 65535)
    'Y': 'uint16',
    'WINTER1': 'uint16',
    'WINTER2': 'uint16',
    'SPRING1': 'uint16',
    'SPRING2': 'uint16',
    'SPRING3': 'uint16',
    'SUMMER1': 'uint16',
    'SUMMER2': 'uint16',
    'SUMMER3': 'uint16',
    'FALL1': 'uint16',
    'FALL2': 'uint16',
    'FALL3': 'uint16'
}

train_data = pd.read_csv('data/lc_sample_train.csv', sep=';', dtype=dtypes)
print("Train dataset size:", train_data.shape)


Train dataset size: (2000000, 14)
CPU times: total: 2.11 s
Wall time: 2.19 s


In [15]:
%%time

control_data = pd.read_csv('data/lc_sample_control.csv', sep=';', dtype=dtypes)
print("Control dataset size:", control_data.shape)

Control dataset size: (72029669, 14)
CPU times: total: 1min 15s
Wall time: 1min 26s


In [3]:
X = train_data.loc[:, train_data.columns != 'CLASS']
y = train_data.loc[:, 'CLASS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=SEED)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((200000, 13), (1800000, 13), (200000,), (1800000,))

In [4]:
rf = RandomForestClassifier(random_state=SEED)

In [5]:
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [6]:
rf_test_pred = rf.predict(X_test)
rf_test_mse = mean_squared_error(y_test, rf_test_pred)
rf_test_accuracy = accuracy_score(y_test, rf_test_pred)
print("RandomForest MSE: {:.3f}".format(rf_test_mse))
print("RandomForest accuracy: {:.3f}".format(rf_test_accuracy))

RandomForest MSE: 3.478
RandomForest accuracy: 0.954


In [7]:
rf2 = RandomForestClassifier(random_state=SEED)

In [8]:
rf2.fit(X, y)

KeyboardInterrupt: 

In [None]:
control_data = pd.read_csv('data/lc_sample_control.csv', sep=';')

In [None]:
X_control = control_data.loc[:, control_data.columns != 'CLASS']
y_control = control_data.loc[:, 'CLASS']

In [None]:
rf2_test_pred = rf2.predict(X_test)
rf2_test_mse = mean_squared_error(y_test, rf2_test_pred)
rf2_test_accuracy = accuracy_score(y_test, rf2_test_pred)
print("RandomForest MSE: {:.3f}".format(rf2_test_mse))
print("RandomForest accuracy: {:.3f}".format(rf2_test_accuracy))

In [14]:
# Convert into ONNX format
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 4]))]
onx = convert_sklearn(rf, initial_types=initial_type)
with open("models/rf.onnx", "wb") as f:
    f.write(onx.SerializeToString())

FileNotFoundError: [Errno 2] No such file or directory: 'models/rf.onnx'

In [15]:
onx

In [None]:
# Compute the prediction with ONNX Runtime
import onnxruntime as rt
import numpy as np
sess = rt.InferenceSession("rf.onnx")
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name], {input_name: X_test.astype(np.float32)})[0]