In [1]:
using CSVFiles, CSV, DataFrames, ScikitLearn, PyCall

In [2]:
# This cell imports all necessary items from scikit-learn
@sk_import model_selection: train_test_split
@sk_import preprocessing: OneHotEncoder
@sk_import model_selection: GridSearchCV
@sk_import svm: LinearSVC
@sk_import metrics: accuracy_score
@sk_import naive_bayes: CategoricalNB
@sk_import naive_bayes: GaussianNB
@sk_import tree: DecisionTreeClassifier
@sk_import svm: SVC
@sk_import neural_network: MLPClassifier
@sk_import preprocessing: LabelEncoder
joblib = pyimport("joblib");

└ @ ScikitLearn.Skcore C:\Users\Cameron Kozlin\.julia\packages\ScikitLearn\NJwUf\src\Skcore.jl:179
└ @ ScikitLearn.Skcore C:\Users\Cameron Kozlin\.julia\packages\ScikitLearn\NJwUf\src\Skcore.jl:179


In [3]:
# train_predict_accuracy(model, X_train, y_train, X_test, y_test)
#
# Used every time a model needs to be trained and evaluated. Fits `model` on the
# training split, predicts on the test split, and prints a short report:
#   * the `@time` measurement of the fit call,
#   * the `@time` measurement of the predict call,
#   * the model object itself,
#   * the accuracy on the test data as a percentage rounded to 1 decimal,
#   * the size of the joblib-serialized model in units of 1000 bytes (2 decimals).
#
# Arguments:
#   model            - estimator object accepted by `fit!` and `predict`
#   X_train, y_train - training inputs and labels
#   X_test, y_test   - held-out inputs and labels used for the accuracy figure
#
# Returns `nothing`; every result is printed rather than returned.

function train_predict_accuracy(model, X_train, y_train, X_test, y_test)
    print("Fit time: ")
    @time fit!(model, X_train, y_train)
    print("Prediction time: ")
    @time pred = predict(model, X_test)
    println("Model: $model")
    # scikit-learn documents the argument order as (y_true, y_pred).
    accuracy = accuracy_score(y_test, pred)
    # Convert accuracy to percentage, round to 1 decimal
    accuracy = round(accuracy * 100, digits=1)
    println("Accuracy: $accuracy%")
    # The model is serialized only to measure its on-disk size, so write it to a
    # throwaway path and always remove it instead of leaving a fixed-name file
    # behind in the working directory.
    model_path = tempname()
    n = try
        joblib.dump(model, model_path)
        filesize(model_path)
    finally
        rm(model_path; force=true)
    end
    # Convert file size from bytes to thousands of bytes, round to 2 decimals
    n = round(n / 1000, digits=2)
    println("Size: $n Kb")
    return nothing
end

train_predict_accuracy (generic function with 1 method)

In [4]:
# Shared preprocessing objects: one-hot encoder and label encoder.
enc, le = OneHotEncoder(), LabelEncoder()

# Initial estimators, one per model family compared in this notebook:
# naive Bayes, decision tree, support vector machine, neural network.
NB_model, tree_model = CategoricalNB(), DecisionTreeClassifier()
SVM_model, net_model = SVC(), MLPClassifier();

In [5]:
# CAR CLASSIFICATION SECTION

# Read the car data as a DataFrame; header=0 means the file has no header row,
# so CSV generates the column names.
df = CSV.File(read("car.data"); header=0) |> DataFrame;

# Columns 1-6 are the inputs (X). Matrix(...) replaces convert(Array, ...),
# which DataFrames.jl deprecated and later removed.
X = Matrix(df[:, 1:6])
# One-hot encode the inputs and turn the encoder output into a dense array.
X = enc.fit_transform(X).toarray()

# Column 7 is the output (y).
y = Vector(df[:, 7]);

In [6]:
# Split the encoded car data; test_size=0.3 holds out 30% of the rows for testing.
# NOTE(review): no random_state is passed, so the split presumably differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3);

In [7]:
# Car data, naive Bayes (NB_model is the CategoricalNB instance here): prints timings, accuracy and model size.
train_predict_accuracy(NB_model, X_train, y_train, X_test, y_test)

Fit time:   0.075241 seconds (33.10 k allocations: 1.713 MiB)
Prediction time:   0.381166 seconds (225.56 k allocations: 11.506 MiB, 9.30% gc time)
Model: PyObject CategoricalNB()
Accuracy: 87.3%
Size: 5.34 Kb


In [8]:
# Car data, decision tree: prints timings, accuracy and model size.
train_predict_accuracy(tree_model, X_train, y_train, X_test, y_test)

Fit time:   0.032569 seconds (1.23 k allocations: 19.734 KiB)
Prediction time:   0.002740 seconds (3.16 k allocations: 98.812 KiB)
Model: PyObject DecisionTreeClassifier()
Accuracy: 96.9%
Size: 17.18 Kb


In [9]:
# Car data, support vector machine (SVM_model is the SVC instance here): prints timings, accuracy and model size.
train_predict_accuracy(SVM_model, X_train, y_train, X_test, y_test)

Fit time:   0.153667 seconds (1.23 k allocations: 19.734 KiB)
Prediction time:   0.088137 seconds (3.16 k allocations: 98.812 KiB)
Model: PyObject SVC()
Accuracy: 96.5%
Size: 129.32 Kb


In [10]:
# Car data, neural network (MLPClassifier): prints timings, accuracy and model size.
train_predict_accuracy(net_model, X_train, y_train, X_test, y_test)

Fit time:   3.282061 seconds (1.23 k allocations: 19.734 KiB)
Prediction time:   0.007714 seconds (3.16 k allocations: 98.812 KiB)
Model: PyObject MLPClassifier()
Accuracy: 97.1%




Size: 92.1 Kb


In [11]:
# ABALONE CLASSIFICATION

# Read the abalone data as a DataFrame; header=0 means the file has no header row.
df = CSV.File(read("abalone.data"); header=0) |> DataFrame

# Columns 1-8 are the inputs. Matrix(...) replaces convert(Array, ...),
# which DataFrames.jl deprecated and later removed.
X = Matrix(df[:, 1:8])

# Label-encode column 1 so that every input column is numeric.
X[:, 1] = le.fit_transform(X[:, 1])

# Column 9 holds the ring counts.
rings = Vector(df[:, 9])

# Sort ring counts into 3 groups (1-8, 9-10, 11+).
# A single mapping makes the bins explicit instead of relying on the order of
# successive masked assignments; the resulting labels are the same.
y = map(r -> r <= 8 ? 1 : (r <= 10 ? 2 : 3), rings)

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3);

In [12]:
# Switch NB_model from CategoricalNB to GaussianNB for the remaining datasets;
# the author's testing found this to be the best trade-off.
NB_model = GaussianNB();

In [13]:
# Abalone data, naive Bayes (GaussianNB): prints timings, accuracy and model size.
train_predict_accuracy(NB_model, X_train, y_train, X_test, y_test)

Fit time:   0.185179 seconds (217.28 k allocations: 9.388 MiB)
Prediction time:   0.056356 seconds (49.61 k allocations: 1.792 MiB)
Model: PyObject GaussianNB()
Accuracy: 57.3%
Size: 1.1 Kb


In [14]:
# Abalone data, decision tree (the same tree_model object is fitted again on this dataset).
train_predict_accuracy(tree_model, X_train, y_train, X_test, y_test)

Fit time:   0.096770 seconds (52.63 k allocations: 822.969 KiB)
Prediction time:   0.018298 seconds (22.62 k allocations: 365.203 KiB)
Model: PyObject DecisionTreeClassifier()
Accuracy: 55.2%
Size: 124.78 Kb


In [15]:
# Abalone data, support vector machine (SVC; the same object is fitted again on this dataset).
train_predict_accuracy(SVM_model, X_train, y_train, X_test, y_test)

Fit time:   0.903276 seconds (52.63 k allocations: 822.969 KiB)
Prediction time:   0.769985 seconds (22.62 k allocations: 365.203 KiB)
Model: PyObject SVC()
Accuracy: 63.3%
Size: 220.49 Kb


In [16]:
# Abalone data, neural network (MLPClassifier; the same object is fitted again on this dataset).
train_predict_accuracy(net_model, X_train, y_train, X_test, y_test)

Fit time:   6.993016 seconds (52.63 k allocations: 822.969 KiB)
Prediction time:   0.026048 seconds (22.62 k allocations: 365.203 KiB)
Model: PyObject MLPClassifier()
Accuracy: 64.4%
Size: 47.16 Kb


In [17]:
# MADELON CLASSIFICATION

# Read the inputs and the labels from their separate files (neither has a header row).
df = CSV.File(read("madelon_train.data"); header=0) |> DataFrame

df_y = CSV.File(read("madelon_train.labels"); header=0) |> DataFrame

# Drop the extra column of missing values read in by CSV.
# NOTE(review): presumably produced by a trailing delimiter on each line - confirm.
select!(df, Not(:Column501))

# Matrix(...) replaces convert(Array, ...), which DataFrames.jl deprecated and
# later removed.
X = Matrix(df)
# Take the single label column as a 1-D vector. Converting the whole label
# DataFrame yields an n-by-1 matrix, and scikit-learn warns when a column-vector
# y is passed where a 1-D array is expected.
y = df_y[:, 1];

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3);

In [18]:
# Madelon data, naive Bayes (GaussianNB): prints timings, accuracy and model size.
train_predict_accuracy(NB_model, X_train, y_train, X_test, y_test)

Fit time:   0.087689 seconds (31.21 k allocations: 1.671 MiB)


  return f(*args, **kwargs)


Prediction time:   0.089870 seconds (25.03 k allocations: 1.347 MiB, 37.01% gc time)
Model: PyObject GaussianNB()
Accuracy: 59.2%
Size: 16.7 Kb


In [19]:
# Madelon data, decision tree: prints timings, accuracy and model size.
train_predict_accuracy(tree_model, X_train, y_train, X_test, y_test)

Fit time:   0.840535 seconds (22 allocations: 1.125 KiB)
Prediction time:   0.002744 seconds (55 allocations: 7.656 KiB)
Model: PyObject DecisionTreeClassifier()
Accuracy: 66.7%
Size: 19.68 Kb


In [20]:
# Madelon data, support vector machine (SVC): prints timings, accuracy and model size.
train_predict_accuracy(SVM_model, X_train, y_train, X_test, y_test)

Fit time:   1.145950 seconds (22 allocations: 1.125 KiB)
Prediction time: 

  return f(*args, **kwargs)


  0.878844 seconds (55 allocations: 7.656 KiB)
Model: PyObject SVC()
Accuracy: 62.8%
Size: 5207.24 Kb


In [21]:
# Madelon data, neural network (MLPClassifier): prints timings, accuracy and model size.
train_predict_accuracy(net_model, X_train, y_train, X_test, y_test)

Fit time:   1.208510 seconds (22 allocations: 1.125 KiB)
Prediction time:   0.008020 seconds (55 allocations: 7.656 KiB)
Model: PyObject MLPClassifier()
Accuracy: 54.3%


  return f(*args, **kwargs)


Size: 1611.69 Kb


In [22]:
# KDD CUP CLASSIFICATION

# Read in the data as a DataFrame (no header row), then separate it into
# X (columns 1-41) and y (the final column, 42).
# Matrix(...)/Vector(...) replace convert(Array, ...), which DataFrames.jl
# deprecated and later removed.

df = CSV.File(read("kddcup.data_10_percent"); header=0) |> DataFrame
X = Matrix(df[:, 1:41])
y = Vector(df[:, 42]);

In [23]:
# Columns 2-4 of the KDD inputs are label-encoded in place with the shared
# encoder `le`, which is refitted for each column, so that the data can be
# used with scikit-learn.
for col in 2:4
    X[:, col] = le.fit_transform(X[:, col])
end

In [24]:
# test_size=0.9 leaves only 10% of the rows for training; per the author this saves
# a lot of time without diminishing accuracy.
# This may take a while due to the size of the dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9);

In [25]:
# KDD Cup data, naive Bayes (GaussianNB): prints timings, accuracy and model size.
train_predict_accuracy(NB_model, X_train, y_train, X_test, y_test)

Fit time:   4.890046 seconds (4.23 M allocations: 65.521 MiB)
Prediction time:  55.092222 seconds (40.02 M allocations: 642.737 MiB, 5.09% gc time)
Model: PyObject GaussianNB()
Accuracy: 96.7%
Size: 14.62 Kb


In [26]:
# KDD Cup data, decision tree: prints timings, accuracy and model size.
train_predict_accuracy(tree_model, X_train, y_train, X_test, y_test)

Fit time:   3.987713 seconds (4.20 M allocations: 64.075 MiB)
Prediction time:  46.764713 seconds (40.02 M allocations: 642.731 MiB, 7.77% gc time)
Model: PyObject DecisionTreeClassifier()
Accuracy: 99.9%
Size: 41.7 Kb


In [27]:
# Replace the SVC estimator with LinearSVC for the KDD Cup data (the author's choice to optimize this model).
SVM_model = LinearSVC();

In [28]:
# KDD Cup data, support vector machine (SVM_model is the LinearSVC instance here).
train_predict_accuracy(SVM_model, X_train, y_train, X_test, y_test)

Fit time:  40.061641 seconds (4.20 M allocations: 64.075 MiB, 5.43% gc time)
Prediction time: 



 40.783382 seconds (40.02 M allocations: 642.730 MiB, 5.56% gc time)
Model: PyObject LinearSVC()
Accuracy: 97.5%
Size: 8.32 Kb


In [29]:
# KDD Cup data, neural network (MLPClassifier): prints timings, accuracy and model size.
train_predict_accuracy(net_model, X_train, y_train, X_test, y_test)

Fit time:  48.586223 seconds (4.20 M allocations: 64.075 MiB)
Prediction time:  43.205004 seconds (40.02 M allocations: 642.730 MiB, 4.62% gc time)
Model: PyObject MLPClassifier()
Accuracy: 99.5%
Size: 204.05 Kb
