# Modelling

In [2]:
# read in libraries
try:
    import re
    import pandas as pd
    import numpy as np

    from flaml import AutoML

    import sklearn.metrics as metrics

    from evidently.report import Report
    from evidently.metric_preset import ClassificationPreset

    from evidently.test_suite import TestSuite
    from evidently.test_preset import BinaryClassificationTestPreset
except:
    !pip install -r requirements.txt

In [4]:
# Load the feature (X_*) and label (y_*) CSV files written by the data
# preprocessing notebook: the training split, two test batches and the noise batch.
X_train, y_train = pd.read_csv('X_train.csv'), pd.read_csv('y_train.csv')
X1_test, y1_test = pd.read_csv('X1_test.csv'), pd.read_csv('y1_test.csv')
X2_test, y2_test = pd.read_csv('X2_test.csv'), pd.read_csv('y2_test.csv')
X_noise, y_noise = pd.read_csv('X_noise.csv'), pd.read_csv('y_noise.csv')

In [5]:
# change from DataFrame to Series for AutoML
# NOTE: squeeze() collapses every length-1 axis, so the result is a Series only
# while y_train.csv holds a single column and more than one row.
y_train = y_train.squeeze()

In [6]:
# using FLAML - AutoML

# search goal and constraints, handed to fit() as keyword arguments
automl_settings = dict(
    time_budget=25,  # seconds
    metric='f1',
    task='classification',
    log_file_name="automl.log",
)

# create the AutoML instance and train it on the labelled training data
automl = AutoML()
automl.fit(X_train, y_train, **automl_settings)

[flaml.automl.automl: 04-19 20:59:13] {2726} INFO - task = classification
[flaml.automl.automl: 04-19 20:59:13] {2728} INFO - Data split method: stratified
[flaml.automl.automl: 04-19 20:59:13] {2731} INFO - Evaluation method: holdout
[flaml.automl.automl: 04-19 20:59:13] {2858} INFO - Minimizing error metric: 1-f1
[flaml.automl.automl: 04-19 20:59:13] {3004} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.automl: 04-19 20:59:13] {3334} INFO - iteration 0, current learner lgbm
[flaml.automl.automl: 04-19 20:59:14] {3472} INFO - Estimated sufficient time budget=182669s. Estimated necessary time budget=4211s.
[flaml.automl.automl: 04-19 20:59:14] {3519} INFO -  at 4.9s,	estimator lgbm's best error=1.0000,	best estimator lgbm's best error=1.0000
[flaml.automl.automl: 04-19 20:59:14] {3334} INFO - iteration 1, current learner lgbm
[flaml.automl.automl: 04-19 20:59:15] {3519} INFO -  at 6.0s,	estimator lgbm's best err

In [7]:
# print the best model
# (shows the estimator object held by the model that the fit above selected,
# including its hyper-parameters)
print(automl.model.estimator)

ExtraTreesClassifier(criterion='entropy', max_features=0.29617042039215596,
                     max_leaf_nodes=1659, n_estimators=8, n_jobs=-1,
                     random_state=12032022)


In [6]:
# predict Test Batch 1
# pred1 is compared against y1_test in the later comparison/evaluation cells.
pred1 = automl.predict(X1_test)
print(pred1)

[0 0 0 ... 0 0 0]


In [7]:
# predict Test Batch 2
# pred2 is compared against y2_test in the later comparison/evaluation cells.
pred2 = automl.predict(X2_test)
print(pred2)

[0 0 0 ... 0 0 0]


In [8]:
# predict Noise Batch
# pred_noise is compared against y_noise in the later comparison/evaluation cells.
pred_noise = automl.predict(X_noise)
print(pred_noise)

[0 0 0 ... 0 0 0]


In [9]:
# define helper functions that print or return the evaluation metrics and reduce memory usage

def print_evaluate(true, predicted):
    """Print F1, accuracy, precision and recall for a set of predictions.

    The scores come from ``evaluate`` so the printed values and the values
    stored in result tables are always computed the same way (F1, precision
    and recall are macro-averaged).

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Predicted labels, in the same order as ``true``.
    """
    f1, acc, prec, recall = evaluate(true, predicted)
    print('F1:', f1)
    print('Accuracy:', acc)
    print('Precision:', prec)
    # The label now carries the same trailing colon as the other three lines.
    print('Recall:', recall)
    print('__________________________________')
    
def evaluate(true, predicted):
    """Return ``(f1, accuracy, precision, recall)`` for a set of predictions.

    F1, precision and recall are macro-averaged; accuracy takes no averaging
    argument.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Predicted labels, in the same order as ``true``.
    """
    macro = {'average': 'macro'}
    return (
        metrics.f1_score(true, predicted, **macro),
        metrics.accuracy_score(true, predicted),
        metrics.precision_score(true, predicted, **macro),
        metrics.recall_score(true, predicted, **macro),
    )

def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes and report the saving.

    The frame is modified IN PLACE and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Per-column rules:

    * object / string / category columns -> ``category``
    * signed integers   -> smallest of int8 / int16 / int32 that holds min..max
    * unsigned integers -> smallest of uint8 / uint16 / uint32 that holds min..max
    * floats            -> float16 (if allowed) or float32 when min..max fits
    * everything else (bool, datetime, timedelta, complex, pandas extension
      dtypes) is left untouched

    A column is never converted to a wider dtype than it already has, and a
    column whose min/max is NaN (empty or all-NaN) is left as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; mutated in place.
    allow_float16 : bool, default True
        When True, floats may be stored as float16. Only the value RANGE is
        checked, so float16 can silently lose precision (about 3 significant
        decimal digits); pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32)
    unsigned_ints = (np.uint8, np.uint16, np.uint32)
    floats = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Text-like and already-categorical columns are stored as categories.
        if (col_type == object or col_type.name == 'category'
                or isinstance(col_type, pd.StringDtype)):
            df[col] = df[col].astype('category')
            continue

        # Other pandas extension dtypes (nullable ints, tz-aware datetimes, ...)
        # have no plain NumPy equivalent to downcast to.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            candidates, limits = signed_ints, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_ints, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = floats, np.finfo
        else:
            # bool, datetime, timedelta, complex: converting these to a float
            # would change their meaning (and can even use more memory).
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered small -> large; stop before upcasting.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the dtype's limit still fits.
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [10]:
# checking the amount of similarities and differences for test batch 1

# y1_test was read as a DataFrame, so to_numpy() is 2-D; ravel() flattens it
# directly instead of concatenating the rows one by one, so that it can be
# compared element-wise with pred1.
y1_test_array = y1_test.to_numpy().ravel()
print("Number of differences: ", np.sum(y1_test_array != pred1))
print("Number of similarities: ", np.sum(y1_test_array == pred1))

Number of differences:  39
Number of similarities:  53705


In [11]:
# checking the amount of similarities and differences for test batch 2

# y2_test was read as a DataFrame, so to_numpy() is 2-D; ravel() flattens it
# directly instead of concatenating the rows one by one, so that it can be
# compared element-wise with pred2.
y2_test_array = y2_test.to_numpy().ravel()
print("Number of differences: ", np.sum(y2_test_array != pred2))
print("Number of similarities: ", np.sum(y2_test_array == pred2))

Number of differences:  49
Number of similarities:  77072


In [12]:
# checking the amount of similarities and differences for test batch 3 (noisy test batch)

# y_noise was read as a DataFrame, so to_numpy() is 2-D; ravel() flattens it
# directly instead of concatenating the rows one by one, so that it can be
# compared element-wise with pred_noise.
y_noise_array = y_noise.to_numpy().ravel()
print("Number of differences: ", np.sum(y_noise_array != pred_noise))
print("Number of similarities: ", np.sum(y_noise_array == pred_noise))

Number of differences:  6550
Number of similarities:  58882


In [13]:
print('Test set 1 evaluation:\n_____________________________________')
print_evaluate(y1_test, pred1)

# Start the results table with a single row holding this batch's scores;
# the trailing 0 fills the 'Cross Validation' column.
scores_test1 = evaluate(y1_test, pred1)
results_df = pd.DataFrame(
    [("Test Batch",) + tuple(scores_test1) + (0,)],
    columns=['Model', 'F1', 'Accuracy', 'Precision', 'Recall', 'Cross Validation'],
)

Test set 1 evaluation:
_____________________________________
F1: 0.84865557571781
Accuracy: 0.9992743376004763
Precision: 0.8259472266661266
Recall 0.8747764697116459
__________________________________


In [14]:
print('Test set 2 evaluation:\n_____________________________________')
print_evaluate(y2_test, pred2)

# Add this batch's scores to the running results table instead of replacing
# the table, so the row recorded for the previous batch is kept. Any existing
# "Test Batch 2" row is dropped first, which keeps the cell safe to re-run.
test2_row = pd.DataFrame(data=[["Test Batch 2", *evaluate(y2_test, pred2), 0]],
            columns=['Model', 'F1', 'Accuracy', 'Precision', 'Recall', 'Cross Validation'])
results_df = pd.concat([results_df[results_df['Model'] != "Test Batch 2"], test2_row],
                       ignore_index=True)

Test set 2 evaluation:
_____________________________________
F1: 0.879151294197551
Accuracy: 0.9993646347946733
Precision: 0.8700494820823774
Recall 0.8887136142920204
__________________________________


In [15]:
print('Noise set evaluation:\n_____________________________________')
print_evaluate(y_noise, pred_noise)

# Label the row for what it is (the noise batch, not "Test Batch 2") and add
# it to the running results table instead of replacing the table. Any existing
# "Noise Batch" row is dropped first, which keeps the cell safe to re-run.
noise_row = pd.DataFrame(data=[["Noise Batch", *evaluate(y_noise, pred_noise), 0]],
            columns=['Model', 'F1', 'Accuracy', 'Precision', 'Recall', 'Cross Validation'])
results_df = pd.concat([results_df[results_df['Model'] != "Noise Batch"], noise_row],
                       ignore_index=True)

Noise set evaluation:
_____________________________________
F1: 0.48277222720663976
Accuracy: 0.8998960753148306
Precision: 0.7434666958684226
Recall 0.5042784784531092
__________________________________


In [16]:
# preparing data for Evidently
#
# pred1 / pred2 / pred_noise are deliberately NOT rebound to DataFrames here:
# the comparison cells above use them as arrays, and changing their type would
# make those cells fail if they are run again. The predictions are wrapped in a
# temporary DataFrame (single column named 0) only for the concat.
#
# The label column 'Class' is renamed to 'target' with a chained rename rather
# than inplace=True, so each frame is built in one expression.

trueTest1 = pd.concat([X1_test, y1_test], axis=1).rename(columns={'Class': 'target'})
predTest1 = pd.concat([X1_test, pd.DataFrame(pred1)], axis=1)

trueTest2 = pd.concat([X2_test, y2_test], axis=1).rename(columns={'Class': 'target'})
predTest2 = pd.concat([X2_test, pd.DataFrame(pred2)], axis=1)

trueTest_noise = pd.concat([X_noise, y_noise], axis=1).rename(columns={'Class': 'target'})
predTest_noise = pd.concat([X_noise, pd.DataFrame(pred_noise)], axis=1)

In [17]:
# Training frame: a copy of the features with the labels attached as 'target'
# (X_train itself is left unchanged).
train = X_train.copy()
train['target'] = y_train

In [18]:
# test1 / test2 / test_noise are new names for the trueTest* frames (no copy is
# made), so adding the 'prediction' column here also adds it to trueTest*.
test1, test2, test_noise = trueTest1, trueTest2, trueTest_noise

# Column 0 of each predTest* frame holds the model's predictions.
for labelled, predicted in ((test1, predTest1),
                            (test2, predTest2),
                            (test_noise, predTest_noise)):
    labelled['prediction'] = predicted[0]

In [19]:
# Downcast the training frame and every test batch to save memory; the frames
# are processed (and their reports printed) in the order listed.
train, test1, test2, test_noise = [
    reduce_mem_usage(frame) for frame in (train, test1, test2, test_noise)
]

Memory usage of dataframe is 45.71 MB
Memory usage after optimization is: 11.24 MB
Decreased by 75.4%
Memory usage of dataframe is 12.71 MB
Memory usage after optimization is: 3.08 MB
Decreased by 75.8%
Memory usage of dataframe is 18.24 MB
Memory usage after optimization is: 4.41 MB
Decreased by 75.8%
Memory usage of dataframe is 15.48 MB
Memory usage after optimization is: 3.74 MB
Decreased by 75.8%


In [20]:
# Evidently report built from the classification metric preset; the same
# object is run on each pair of batches below.
classification_performance = Report(metrics=[ClassificationPreset()])

### Classification Performance: Test Batch 1 vs Test Batch 2

In [21]:
# Test Batch 1 is the reference, Test Batch 2 the batch under inspection.
classification_performance.run(current_data=test2, reference_data=test1)
classification_performance.save_html("Test1_Test2_Report.html")

### Classification Performance: Test Batch 2 vs Noisy Batch

In [22]:
# Test Batch 2 is the reference, the noisy batch the one under inspection.
classification_performance.run(current_data=test_noise, reference_data=test2)
classification_performance.save_html("Test2_Noise_Report.html")

### Classification Test Suite: Test Batch 1 vs Test Batch 2

In [23]:
# Evidently test suite built from the binary-classification test preset; the
# same object is run on each pair of batches below.
label_binary_classification_performance = TestSuite(tests=[BinaryClassificationTestPreset()])

In [24]:
# Test Batch 1 is the reference, Test Batch 2 the batch under test.
label_binary_classification_performance.run(current_data=test2, reference_data=test1)
label_binary_classification_performance.save_html("Test1_Test2_TestSuite.html")

### Classification Test Suite: Test Batch 2 vs Noisy Batch

In [25]:
# Test Batch 2 is the reference, the noisy batch the one under test.
label_binary_classification_performance.run(current_data=test_noise, reference_data=test2)
label_binary_classification_performance.save_html("Test2_Noise_TestSuite.html")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9dd4b1f4-9396-4609-8756-e5fddb45c25f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>