## This notebook demonstrates scikit-learn classification for comparison with Tribuo classification

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the cleaned WeatherAUS data written by the notebook
# "scikit-learn Classifier - Data Cleanup". The file has no header row,
# so the columns end up labelled with integers.
df = pd.read_csv(filepath_or_buffer='../../data/cleanedWeatherAUS.csv', header=None)

In [3]:
# Separate features from the label. The label lives in the last column and
# every column before it is a feature, so slice by position instead of
# listing each column number by hand.
X = np.array(df.iloc[:, :-1])
y = np.array(df.iloc[:, -1])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# train/test split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Report the feature count from the arrays that are actually given to the
# models, so the printed number cannot drift from the real input width.
print('Training data size = %d, number of features = %d' % (len(X_train), X_train.shape[1]))
print('Testing data size = %d, number of features = %d' % (len(X_test), X_test.shape[1]))

Training data size = 112629, number of features = 62
Testing data size = 28158, number of features = 62


In [4]:
# Build the four classifiers that are compared below. None of them fixes
# random_state, so the stochastic ones give slightly different results on
# each run (see the per-run records kept next to each evaluation).

# Linear model trained with stochastic gradient descent, default loss.
# No `epsilon` argument is passed: scikit-learn only reads it for the
# 'huber', 'epsilon_insensitive' and 'squared_epsilon_insensitive' losses,
# and 0.1 is already its default, so passing it here has no effect.
sgd = SGDClassifier()

# Logistic regression using the liblinear solver with a loose tolerance.
lr = LogisticRegression(tol=0.1, solver='liblinear')

# Decision tree with default settings.
cart = DecisionTreeClassifier()

# Multi-layer perceptron, capped at 300 iterations.
mlp = MLPClassifier(max_iter=300)

In [5]:
# Show the configuration of each model, one per line.
print(sgd, lr, cart, mlp, sep='\n')

SGDClassifier()
LogisticRegression(solver='liblinear', tol=0.1)
DecisionTreeClassifier()
MLPClassifier(max_iter=300)


In [6]:
%time sgd.fit(X_train, y_train)

# run 1
# time:  0.67s

# run 2
# time:  0.72s

# run 3
# time:  0.55s

CPU times: user 519 ms, sys: 6.17 ms, total: 525 ms
Wall time: 545 ms


SGDClassifier()

In [7]:
# Evaluate the SGD classifier on the held-out test set.
predicted = sgd.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicted))

# Recorded results (support: No = 21918, Yes = 6240)
# run | class | precision | recall | f1-score
#   1 | No    |      0.85 |   0.96 |     0.90
#   1 | Yes   |      0.75 |   0.41 |     0.53
#   2 | No    |      0.86 |   0.96 |     0.90
#   2 | Yes   |      0.74 |   0.43 |     0.55
#   3 | No    |      0.85 |   0.97 |     0.90
#   3 | Yes   |      0.76 |   0.39 |     0.52

              precision    recall  f1-score   support

          No       0.85      0.97      0.90     21918
         Yes       0.76      0.39      0.52      6240

    accuracy                           0.84     28158
   macro avg       0.80      0.68      0.71     28158
weighted avg       0.83      0.84      0.82     28158



In [8]:
%time lr.fit(X_train, y_train)

# run 1
# time:  0.46s

# run 2
# time:  0.49s

# run 3
# time:  0.62s

CPU times: user 538 ms, sys: 21.3 ms, total: 560 ms
Wall time: 624 ms


LogisticRegression(solver='liblinear', tol=0.1)

In [9]:
# Evaluate the logistic regression model on the held-out test set.
predicted = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicted))

# Recorded results (support: No = 21918, Yes = 6240)
# run | class | precision | recall | f1-score
#   1 | No    |      0.86 |   0.95 |     0.90
#   1 | Yes   |      0.70 |   0.46 |     0.55
#   2 | No    |      0.86 |   0.95 |     0.90
#   2 | Yes   |      0.70 |   0.46 |     0.55
#   3 | No    |      0.86 |   0.95 |     0.90
#   3 | Yes   |      0.70 |   0.46 |     0.55

              precision    recall  f1-score   support

          No       0.86      0.95      0.90     21918
         Yes       0.70      0.46      0.55      6240

    accuracy                           0.84     28158
   macro avg       0.78      0.70      0.73     28158
weighted avg       0.83      0.84      0.82     28158



In [10]:
%time cart.fit(X_train, y_train)

# run 1
# time:  2.36 s

# run 2
# time:  2.41 s

# run 2
# time:  2.50 s

CPU times: user 2.4 s, sys: 37.8 ms, total: 2.44 s
Wall time: 2.5 s


DecisionTreeClassifier()

In [11]:
# Evaluate the decision tree on the held-out test set.
predicted = cart.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicted))

# Recorded results (support: No = 21918, Yes = 6240)
# run | class | precision | recall | f1-score
#   1 | No    |      0.86 |   0.86 |     0.86
#   1 | Yes   |      0.51 |   0.52 |     0.51
#   2 | No    |      0.86 |   0.86 |     0.86
#   2 | Yes   |      0.51 |   0.52 |     0.51
#   3 | No    |      0.86 |   0.86 |     0.86
#   3 | Yes   |      0.51 |   0.52 |     0.52

              precision    recall  f1-score   support

          No       0.86      0.86      0.86     21918
         Yes       0.51      0.52      0.51      6240

    accuracy                           0.78     28158
   macro avg       0.69      0.69      0.69     28158
weighted avg       0.78      0.78      0.78     28158



In [12]:
%time mlp.fit(X_train, y_train)

# run 1
# time:  2min 35s

# run 2
# time:  2min 32s

# run 3
# time:  2min 50s

CPU times: user 5min 6s, sys: 20.9 s, total: 5min 27s
Wall time: 2min 50s


MLPClassifier(max_iter=300)

In [13]:
# Evaluate the multi-layer perceptron on the held-out test set.
predicted = mlp.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predicted))

# Recorded results (support: No = 21918, Yes = 6240)
# run | class | precision | recall | f1-score
#   1 | No    |      0.86 |   0.96 |     0.91
#   1 | Yes   |      0.75 |   0.46 |     0.57
#   2 | No    |      0.87 |   0.94 |     0.90
#   2 | Yes   |      0.71 |   0.50 |     0.59
#   3 | No    |      0.87 |   0.94 |     0.91
#   3 | Yes   |      0.71 |   0.53 |     0.61

              precision    recall  f1-score   support

          No       0.87      0.94      0.91     21918
         Yes       0.71      0.53      0.61      6240

    accuracy                           0.85     28158
   macro avg       0.79      0.73      0.76     28158
weighted avg       0.84      0.85      0.84     28158



In [14]:
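# Optional: confusion matrix for the most recent `predicted` (the MLP predictions); uncomment to run.
# print(confusion_matrix(y_test, predicted, labels=['No', 'Yes']))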