In [1]:
import os
import numpy as np
import pandas as pd
import evalml
from sklearn.metrics import accuracy_score
from moodlemlbackend.processor.base import BaseEstimater

# Load the data we are going to test with
breat_cancer_path = os.path.join(os.path.abspath(''), 'test_data/breast_cancer.csv')
print(breat_cancer_path)

# In moodle data exported the first two lines have meta data in them
with open(breat_cancer_path) as fp:
    head = [next(fp) for x in range(3)]

print("------- Metadata from Moodle LMS ---------\n")
print("\n".join(head))
print("------------------------------------------\n")

# Get the data imported and split into test and training
X, y = BaseEstimater.get_labelled_samples(breat_cancer_path)
X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(X, y, test_size=0.2, random_state=0)

pd.DataFrame(X).head
pd.DataFrame(y).head


/Users/doug.morgan/git/moodle-mlbackend-python/test_data/breast_cancer.csv
------- Metadata from Moodle LMS ---------

nfeatures,targetclasses,targettype

30,"[0,1]","discrete"



------------------------------------------



<bound method NDFrame.head of      0
0    1
1    1
2    0
3    1
4    1
..  ..
564  0
565  1
566  1
567  1
568  1

[569 rows x 1 columns]>

In [2]:
import tempfile
from moodlemlbackend.model import TFModel

# Setup the Tensorflow based Estimator (currently a Neural Network)
temp_directory = tempfile.mkdtemp()
model_id = 1
with tempfile.TemporaryDirectory() as tmpdirname:
    tf = TFModel(X_train.shape[1], len(np.unique(y_train)), 1000, 1000, 0.5, tmpdirname)
    tf.fit(X_train.values, y_train.values)

tf_score = accuracy_score(y_holdout, tf.predict(X_holdout))

In [3]:
# EvalML implementation
from moodlemlbackend.model import EvalMlModel

runner = EvalMlModel()
best_pipeline = runner.fit(X_train.values, y_train.values)

print("############################# Rankings #########################################")
print(runner.automl.rankings)
print("################################################################################")
print("############################# Pipeiline ########################################")
best_pipeline.describe()
print("################################################################################")

evalml_score = accuracy_score(y_holdout, best_pipeline.predict(X_holdout))

Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Accuracy Binary. 
Greater score is better.

Searching up to 10 pipelines. 
Allowed model families: extra_trees, random_forest, xgboost, linear_model, lightgbm, catboost



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

(1/10) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Accuracy Binary: 0.626
(2/10) LightGBM Classifier w/ Imputer           Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Accuracy Binary: 0.956
(3/10) Extra Trees Classifier w/ Imputer        Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Accuracy Binary: 0.947
(4/10) Elastic Net Classifier w/ Imputer + S... Elapsed:00:02
	Starting cross validation
	Finished cross validation - mean Accuracy Binary: 0.949
(5/10) CatBoost Classifier w/ Imputer           Elapsed:00:02
	Starting cross validation
	Finished cross validation - mean Accuracy Binary: 0.947
(6/10) XGBoost Classifier w/ Imputer            Elapsed:00:03
	Starting cross validation
	Finished cross validation - mean Accuracy Binary: 0.949
(7/10) Random Forest Classifier w/ Imputer      Elapsed:00:03
	Starting cross validation
	Finished cross validation - mean A

In [4]:
print("################")
print(f"EvalML score: {evalml_score}")
print(f"Tensor Flow score: {tf_score}")
print("################")

################
EvalML score: 0.9824561403508771
Tensor Flow score: 0.631578947368421
################
