In [1]:
import mlp # Multi-layer perceptron classifier
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from workflow import do_workflow

from collections import Counter
import numpy as np

import xgboost as xgb
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the cleaned training table and carve out a validation hold-out.
full_df = pd.read_csv("clean_train_reduced.csv")

# 80% of the rows (fixed seed, so the split is reproducible) go to training;
# every row that was not sampled becomes validation data.
x_train = full_df.sample(frac=0.8, random_state=200)
x_val = full_df.drop(index=x_train.index)
del full_df  # the full frame is no longer needed; release it to save memory

# Detach the "pollutant" target column from both feature frames.
# `pop` removes the column from the frame in place and returns it.
y_train = list(x_train.pop("pollutant"))
y_val = list(x_val.pop("pollutant"))

In [3]:
# One-hot encode the targets, to make it suitable for the softmax last layer
targets = ["Methane (CH4)", "Nitrogen oxides (NOX)", "Carbon dioxide (CO2)"]


def one_hot_encode(labels, classes):
    """Turn a sequence of class labels into a one-hot integer matrix.

    Args:
        labels: iterable of hashable labels (here: pollutant name strings).
        classes: ordered sequence of the known labels; column ``k`` of the
            result corresponds to ``classes[k]``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(labels), len(classes))`` containing
        exactly one ``1`` per row.

    Note:
        A label that is not listed in ``classes`` is encoded as the *last*
        class. This mirrors the catch-all ``else`` branch this helper
        replaces, so existing results do not change.
        NOTE(review): consider raising on unknown labels once the input is
        confirmed to contain only the three pollutants.
    """
    column_of = {name: col for col, name in enumerate(classes)}
    fallback = len(classes) - 1
    columns = np.array(
        [column_of.get(label, fallback) for label in labels], dtype=np.intp
    )
    # Row k of the identity matrix is the one-hot vector for class k, so
    # indexing it with the column numbers yields the whole encoding at once.
    return np.eye(len(classes), dtype=int)[columns]


y_train = one_hot_encode(y_train, targets)
y_val = one_hot_encode(y_val, targets)

print(y_train.shape, y_val.shape)

(52502, 3) (13126, 3)


In [4]:
# Fine-tune an XGBoost model following this tutorial:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#h2_6
#
# The training split is used for fitting and the validation split is passed
# as the "test" data; cross-validation runs with 2 folds.
# NOTE(review): the output recorded for this cell stops in step 2 with
# "TypeError: type numpy.ndarray doesn't define __round__ method", so
# `best_model` was not assigned in that run -- check `workflow.do_workflow`.
best_model = do_workflow(dftrain=x_train, y_train=y_train, dftest=x_val, y_test=y_val, kfolds=2)



Base model without tuning : 
Val accuracy: 63.23%

Step 1 : Estimate numbers of trees...
[0]	train-logloss:0.65548+0.00002	test-logloss:0.65589+0.00014
[50]	train-logloss:0.39741+0.00080	test-logloss:0.41288+0.00047
[100]	train-logloss:0.37257+0.00182	test-logloss:0.40565+0.00007
[150]	train-logloss:0.35381+0.00203	test-logloss:0.40264+0.00018
[200]	train-logloss:0.33664+0.00211	test-logloss:0.40026+0.00039
[250]	train-logloss:0.32095+0.00159	test-logloss:0.39864+0.00033
[300]	train-logloss:0.30661+0.00222	test-logloss:0.39716+0.00016
[350]	train-logloss:0.29346+0.00161	test-logloss:0.39641+0.00053
[400]	train-logloss:0.28133+0.00140	test-logloss:0.39591+0.00070
[450]	train-logloss:0.27035+0.00112	test-logloss:0.39562+0.00063
[500]	train-logloss:0.25983+0.00110	test-logloss:0.39533+0.00068
[550]	train-logloss:0.24959+0.00087	test-logloss:0.39541+0.00089
[568]	train-logloss:0.24642+0.00113	test-logloss:0.39541+0.00083
Initial num_boost_round :  520

Step 2 : Tune max_depth and min_chil

TypeError: type numpy.ndarray doesn't define __round__ method

In [11]:
# Validation accuracy
def multi_class_accuracy(preds, gt):
    """Fraction of samples whose predicted class equals the true class.

    The class of a sample is the position of the largest value in its row
    (``argmax``), both for the predictions and for the ground truth.

    Args:
        preds: array-like with one row of per-class scores per sample.
        gt: array-like with one row per sample (e.g. one-hot targets); must
            have the same number of rows as ``preds``.

    Returns:
        Accuracy as a Python ``float`` in ``[0, 1]``.

    Raises:
        ValueError: if ``gt`` has no samples (accuracy is undefined) or if
            ``preds`` and ``gt`` disagree on the number of samples.
    """
    gt = np.asarray(gt)
    preds = np.asarray(preds)

    n_samples = gt.shape[0]
    if n_samples == 0:
        raise ValueError("multi_class_accuracy() needs at least one sample")
    if preds.shape[0] != n_samples:
        raise ValueError(
            "preds and gt must have the same number of samples "
            f"(got {preds.shape[0]} and {n_samples})"
        )

    # Flatten everything after the sample axis so that argmax sees each
    # sample as a single vector, exactly like np.argmax(row) does.
    gt_labels = gt.reshape(n_samples, -1).argmax(axis=1)
    pred_labels = preds.reshape(n_samples, -1).argmax(axis=1)

    hits = int(np.count_nonzero(gt_labels == pred_labels))
    return hits / n_samples

# Restore a booster that was saved to disk earlier.
# NOTE(review): no cell shown here writes '0001.model' -- confirm where this
# file comes from before relying on the numbers below.
bst = xgb.Booster()
bst.load_model('0001.model')

# Accuracy on the held-out validation split.
xgtest = xgb.DMatrix(
    data=x_val.values,
    feature_names=x_val.keys().values,
)
preds = bst.predict(xgtest)
print("Validation accuracy", multi_class_accuracy(preds, y_val))

# Accuracy on the training split, for comparison with the validation score.
xgtrain = xgb.DMatrix(
    data=x_train.values,
    feature_names=x_train.keys().values,
)
preds = bst.predict(xgtrain)
print("Training accuracy", multi_class_accuracy(preds, y_train))

Validation accuracy 0.6769008075575195
Training accuracy 0.89387070968725


In [10]:
# Generate the predictions csv with the best model
df_test = pd.read_csv("clean_test_reduced.csv")

# Keep the row identifiers for the output file, then remove them from the
# frame so that only the feature columns are fed to the model.
test_index = list(df_test["test_index"])
df_test = df_test.drop(columns="test_index")
xgtest = xgb.DMatrix(df_test.values, feature_names=df_test.keys().values)

preds = bst.predict(xgtest)

# Model column -> code written to the output files.
# With the one-hot order used for training (CH4, NOX, CO2) this turns
# CH4 into 2, NOX into 0 and CO2 into 1 -- presumably the numbering the
# submission format requires; TODO confirm against the task description.
submission_code = {0: 2, 1: 0, 2: 1}
predictions = []
for row in preds:
    label = np.argmax(row)
    # Any label outside the table is passed through unchanged.
    predictions.append(submission_code.get(label, label))
preds_df = pd.DataFrame(data={'test_index': test_index, "pollutant": predictions})

# Save predictions in both formats.
preds_df.to_csv("predictions.csv", index=False)
preds_df.to_json('predictions.json')