In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-dec-2021/train.csv
/kaggle/input/tabular-playground-series-dec-2021/test.csv


In [2]:
import random
import warnings
import gc
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

In [3]:
seed = 47

In [4]:
def evaluate_model(model, x, y):
    y_pred_prob = model.predict(x)
    acc = accuracy_score(y, y_pred_prob)
    return {'accuracy' : acc}

In [5]:
def get_xgboost_model(params=None):
    if params is None:
        params = {'colsample_bytree': 0.1,
                  'eta': 0.12,
                  'gamma': 5, 
                  'max_depth': 2,
                  'min_child_weight': 9,
                  'n_estimators': 1000, 
                  'subsample': 0.9}          

    return XGBClassifier(**params,
                         objective='multi:softmax',
                         random_state=seed, 
                         tree_method='gpu_hist', 
                         predictor='gpu_predictor',
                         early_stopping_rounds=200,
                         verbosity=0)

In [6]:
def get_nn_model(n_layers=None, n_units=32, activation='swish'):
    model = tf.keras.Sequential()
    
    if n_layers is not None and n_layers > 0:
        for _ in range(n_layers):
            model.add(tf.keras.layers.Dense(units=n_units, activation=activation))
    model.add(tf.keras.layers.Dense(units=7, activation='softmax'))
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=[tf.metrics.SparseCategoricalAccuracy()])

    return model

In [7]:
def get_pipelines(model):
    pipelines = list()
    # normalize
    p = Pipeline([('s',MinMaxScaler()), ('m',model)])
    pipelines.append(('norm', p))
    # standardize
    p = Pipeline([('s',StandardScaler()), ('m',model)])
    pipelines.append(('std', p))
    # quantile
    p = Pipeline([('s',QuantileTransformer(n_quantiles=100, output_distribution='normal')), ('m',model)])
    pipelines.append(('quan', p))
    # pca
    p = Pipeline([('s',PCA()), ('m',model)])
    pipelines.append(('pca', p))
    # svd
    p = Pipeline([('s',TruncatedSVD()), ('m',model)])
    pipelines.append(('svd', p))
    
    p = Pipeline([('s',StandardScaler()), ('p', PowerTransformer()), ('m',model)])
    pipelines.append(('std-power', p))
    # scale and power
    p = Pipeline([('s',MinMaxScaler()), ('p', PowerTransformer()), ('m',model)])
    pipelines.append(('min-max-power', p))
    
    p = Pipeline([('p', PowerTransformer()), ('m',model)])
    pipelines.append(('power', p))
    
    return pipelines

In [8]:
def score_model(x, y, model):
    # define the cross-validation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)
    # evaluate model
    scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores

In [9]:
geomean = lambda x, axis : np.exp(np.mean(np.log(x), axis=axis))
harmonic_mean = lambda x, axis : len(x) / np.sum(1.0/x, axis=axis) 

funcs = {'mean' : np.mean, 
         'std' : np.std, 
         'var' : np.var, 
         'geo_mean' : geomean, 
         'harmonic_mean' : harmonic_mean, 
         'median' : np.median,
         'None_feature_engineering' : None}

In [10]:
# train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',')
random.seed(seed)
n = 4000000
s = 400000
skip = sorted(random.sample(range(1, n),n-s))

train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',', skiprows=skip)

# XGBoost Baseline

In [11]:
x_train = train_df.drop(['Id', 'Soil_Type7','Soil_Type15', 'Cover_Type'], axis=1).values
y_train = train_df['Cover_Type'].values 
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)

In [12]:
params = {'colsample_bytree': 0.1,
          'eta': 0.12,
          'gamma': 5, 
          'max_depth': 2,
          'min_child_weight': 9,
          'n_estimators': 1000, 
          'subsample': 0.9}          

model = XGBClassifier(**params, 
                      objective='multi:softmax',
                      random_state=seed, 
                      tree_method='gpu_hist', 
                      predictor='gpu_predictor',
                      early_stopping_rounds=200,
                      verbosity=0)

model.fit(x_train, y_train)
results = evaluate_model(model, x_test, y_test)
print(results)

{'accuracy': 0.9377375}


# Feature Engineering

Here wee will experiment creating synthetic features using central tendency statistics.

<h3>Feature Engineering XGBoost</h3>


In [13]:
results, names = list(), list()

for key in funcs.keys():
    x_train = train_df.drop(['Id', 'Soil_Type7','Soil_Type15', 'Cover_Type'], axis=1)
    if funcs[key] is not None:
        x_train[key] = funcs[key](x_train, axis=1)
    y_train = train_df['Cover_Type']
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size = 0.2, random_state = seed, shuffle=True)
    model = get_xgboost_model()
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    names.append(key)
    results.append(result['accuracy'])
    
for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

index = np.argmax(results)
print("Best Result: ", names[index], results[index])

>mean: 0.940462
>std: 0.939725
>var: 0.939725
>geo_mean: 0.937475
>harmonic_mean: 0.937475
>median: 0.937475
>None_feature_engineering: 0.937738
Best Result:  mean 0.9404625


# XGBoost - Testing different configurations

In [14]:
x_train = train_df.drop(['Id', 'Soil_Type7','Soil_Type15', 'Cover_Type'], axis=1)
y_train = train_df['Cover_Type']
x_train['mean'] = np.mean(x_train, axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)

# 1 - Testing different number of estimators

In [15]:
def get_models_n_estimators():
    models = dict()
    trees = [10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
    for n in trees:
        params = {'n_estimators' : n}
        models[str(n)] = get_xgboost_model(params)
    return models

In [16]:
models = get_models_n_estimators()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    results.append(score['accuracy'])
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
n_estimators = int(names[index])
print("Best number of estimators", n_estimators)

10 accuracy: 0.938
50 accuracy: 0.956
100 accuracy: 0.959
150 accuracy: 0.959
200 accuracy: 0.959
250 accuracy: 0.959
300 accuracy: 0.959
350 accuracy: 0.959
400 accuracy: 0.959
450 accuracy: 0.959
500 accuracy: 0.959
1000 accuracy: 0.959
2000 accuracy: 0.959
3000 accuracy: 0.958
4000 accuracy: 0.959
5000 accuracy: 0.959
6000 accuracy: 0.958
7000 accuracy: 0.958
8000 accuracy: 0.958
9000 accuracy: 0.958
10000 accuracy: 0.958
Best number of estimators 300


# 2 - Testing different max_depth

In [17]:
def get_models_n_depths():
    models = dict()
    for depth in range(1,20):
        params = {'n_estimators' : n_estimators, 'max_depth' : depth}
        models[str(depth)] = get_xgboost_model(params)
    return models

In [18]:
models = get_models_n_depths()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    results.append(score['accuracy'])
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
max_depth = int(names[index])
print("Best max depth", max_depth)

1 accuracy: 0.930
2 accuracy: 0.950
3 accuracy: 0.957
4 accuracy: 0.959
5 accuracy: 0.959
6 accuracy: 0.959
7 accuracy: 0.959
8 accuracy: 0.959
9 accuracy: 0.959
10 accuracy: 0.959
11 accuracy: 0.959
12 accuracy: 0.959
13 accuracy: 0.959
14 accuracy: 0.959
15 accuracy: 0.959
16 accuracy: 0.959
17 accuracy: 0.959
18 accuracy: 0.960
19 accuracy: 0.959
Best max depth 18


# 3 - Testing different subsamples


In [19]:
def get_models_subsamples():
    models = dict()
    for subsample in np.arange(0.1, 1.1, 0.1):
        params = {'n_estimators' : n_estimators, 'max_depth' : max_depth, 'subsample' : subsample}
        key = '%.1f' % subsample
        models[key] = get_xgboost_model(params)
    return models

In [20]:
models = get_models_subsamples()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    results.append(score['accuracy'])
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
subsample = float(names[index])
print("Best subsample", subsample)

0.1 accuracy: 0.948
0.2 accuracy: 0.954
0.3 accuracy: 0.956
0.4 accuracy: 0.958
0.5 accuracy: 0.958
0.6 accuracy: 0.958
0.7 accuracy: 0.958
0.8 accuracy: 0.959
0.9 accuracy: 0.960
1.0 accuracy: 0.960
Best subsample 1.0


# 4 - Testing different learning rates

In [21]:
def get_models_lr():
    models = dict()
    rates = [0.0001, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.12, 0.13, 0.3, 0.5, 1.0]
    for r in rates:
        params = {'n_estimators' : n_estimators, 'max_depth' : max_depth, 'subsample' : subsample, 'eta' : r}
        key = '%.4f' % r
        models[key] = get_xgboost_model(params)
    return models

In [22]:
models = get_models_lr()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    results.append(score['accuracy'])
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
eta = float(names[index])
print("Best learning rate", eta)

0.0001 accuracy: 0.944
0.0010 accuracy: 0.948
0.0030 accuracy: 0.951
0.0050 accuracy: 0.953
0.0100 accuracy: 0.956
0.0300 accuracy: 0.958
0.0500 accuracy: 0.958
0.1000 accuracy: 0.959
0.1200 accuracy: 0.959
0.1300 accuracy: 0.959
0.3000 accuracy: 0.960
0.5000 accuracy: 0.959
1.0000 accuracy: 0.958
Best learning rate 0.3


# 5 - Testing different number of features

In [23]:
def get_models_nfeatures():
    models = dict()
    for i in np.arange(0.1, 1.1, 0.1):
        params = {'n_estimators' : n_estimators, 'max_depth' : max_depth, 'subsample' : subsample, 'eta' : eta, 'colsample_bytree' : i}
        key = '%.1f' % i
        models[key] = get_xgboost_model(params)
    return models

In [24]:
models = get_models_nfeatures()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    results.append(score['accuracy'])
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
colsample_bytree = float(names[index])
print("Best colsample_bytree", colsample_bytree)

0.1 accuracy: 0.928
0.2 accuracy: 0.946
0.3 accuracy: 0.953
0.4 accuracy: 0.957
0.5 accuracy: 0.958
0.6 accuracy: 0.958
0.7 accuracy: 0.959
0.8 accuracy: 0.959
0.9 accuracy: 0.959
1.0 accuracy: 0.960
Best colsample_bytree 1.0


# 6 - Testing different number of gamma

In [25]:
def get_models_n_gamma():
    models = dict()
    # for gamma in range(1,20):
    for gamma in np.arange(0.0, 1.1, 0.1):
        params = {'n_estimators' : n_estimators, 
                  'max_depth' : max_depth,
                  'subsample' : subsample,
                  'eta' : eta, 
                  'colsample_bytree' : colsample_bytree,
                  'gamma' : gamma}
        models[str(gamma)] = get_xgboost_model(params)
    return models

In [26]:
models = get_models_n_gamma()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    results.append(score['accuracy'])
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
gamma = float(names[index])
print("Best gamma", gamma)

0.0 accuracy: 0.960
0.1 accuracy: 0.959
0.2 accuracy: 0.958
0.30000000000000004 accuracy: 0.958
0.4 accuracy: 0.959
0.5 accuracy: 0.959
0.6000000000000001 accuracy: 0.958
0.7000000000000001 accuracy: 0.958
0.8 accuracy: 0.959
0.9 accuracy: 0.959
1.0 accuracy: 0.959
Best gamma 0.0


# 7 - Testing different number of min_child_weight

In [27]:
def get_models_n_min_child_weight():
    models = dict()
    for min_child_weight in range(1,20):
        params = {'n_estimators' : n_estimators, 
                  'max_depth' : max_depth,
                  'subsample' : subsample,
                  'eta' : eta, 
                  'colsample_bytree' : colsample_bytree,
                  'gamma' : gamma,
                  'min_child_weight' : min_child_weight}
        models[str(min_child_weight)] = get_xgboost_model(params)
    return models

In [28]:
models = get_models_n_min_child_weight()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    results.append(score['accuracy'])
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
min_child_weight = int(names[index])
print("Best min_child_weight", min_child_weight)

1 accuracy: 0.960
2 accuracy: 0.959
3 accuracy: 0.959
4 accuracy: 0.960
5 accuracy: 0.959
6 accuracy: 0.959
7 accuracy: 0.959
8 accuracy: 0.959
9 accuracy: 0.959
10 accuracy: 0.959
11 accuracy: 0.959
12 accuracy: 0.959
13 accuracy: 0.959
14 accuracy: 0.959
15 accuracy: 0.959
16 accuracy: 0.959
17 accuracy: 0.959
18 accuracy: 0.959
19 accuracy: 0.959
Best min_child_weight 1


# 8 - Testing different number of reg_alpha

In [29]:
def get_models_n_reg_alpha():
    models = dict()
    for reg_alpha in [0, 1e-5, 1e-2, 0.1, 0.01, 0.001, 0.003, 1, 10, 100]:
        params = {'n_estimators' : n_estimators, 
                  'max_depth' : max_depth,
                  'subsample' : subsample,
                  'eta' : eta, 
                  'colsample_bytree' : colsample_bytree,
                  'gamma' : gamma,
                  'min_child_weight' : min_child_weight,
                  'reg_alpha': reg_alpha}
        models[str(reg_alpha)] = get_xgboost_model(params)
    return models

In [30]:
models = get_models_n_min_child_weight()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    results.append(score['accuracy'])
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
reg_alpha = int(names[index])
print("reg_alpha", reg_alpha)

1 accuracy: 0.960
2 accuracy: 0.959
3 accuracy: 0.959
4 accuracy: 0.960
5 accuracy: 0.959
6 accuracy: 0.959
7 accuracy: 0.959
8 accuracy: 0.959
9 accuracy: 0.959
10 accuracy: 0.959
11 accuracy: 0.959
12 accuracy: 0.959
13 accuracy: 0.959
14 accuracy: 0.959
15 accuracy: 0.959
16 accuracy: 0.959
17 accuracy: 0.959
18 accuracy: 0.959
19 accuracy: 0.959
reg_alpha 1


In [31]:
params = {'n_estimators' : n_estimators,
          'max_depth' : max_depth,
          'subsample' : subsample,
          'eta' : eta, 
          'colsample_bytree' : colsample_bytree,
          'gamma' : gamma,
          'min_child_weight' : min_child_weight,
          'reg_alpha' : reg_alpha}
         
print('Best Params: ', params)

Best Params:  {'n_estimators': 300, 'max_depth': 18, 'subsample': 1.0, 'eta': 0.3, 'colsample_bytree': 1.0, 'gamma': 0.0, 'min_child_weight': 1, 'reg_alpha': 1}


In [32]:
model = get_xgboost_model(params)
model.fit(x_train, y_train, verbose=True)
score = evaluate_model(model, x_test, y_test)
print(score)

{'accuracy': 0.959275}


In [33]:
params = {'n_estimators' : n_estimators,
          'max_depth' : max_depth,
          'subsample' : subsample,
          'eta' : eta, 
          'colsample_bytree' : colsample_bytree,
          'gamma' : gamma,
          'min_child_weight' : min_child_weight,
          'reg_alpha' : reg_alpha}
model = get_xgboost_model(params)
model.fit(x_train, y_train, verbose=True)
score = evaluate_model(model, x_test, y_test)
print(score)

{'accuracy': 0.959275}


# Submission

In [34]:
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',')
x_train = train_df.drop(['Id', 'Soil_Type7','Soil_Type15', 'Cover_Type'], axis=1)
y_train = train_df['Cover_Type']
x_train['mean'] = np.mean(x_train, axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)

In [35]:
params = {'n_estimators' : n_estimators,
          'max_depth' : max_depth,
          'subsample' : subsample,
          'eta' : eta, 
          'colsample_bytree' : colsample_bytree,
          'gamma' : gamma,
          'min_child_weight' : min_child_weight,
          'reg_alpha' : reg_alpha}
        
model = get_xgboost_model(params)
model.fit(x_train, y_train, verbose=True)
score = evaluate_model(model, x_test, y_test)
print(score)

{'accuracy': 0.96060375}


In [36]:
del train_df, x_train, y_train, x_test, y_test
gc.collect()

292

In [37]:
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/test.csv', sep=',')
x_test = test_df.drop(['Id', 'Soil_Type7','Soil_Type15'], axis=1)
x_test['mean'] = np.mean(x_test, axis=1)

In [38]:
target = model.predict(x_test).squeeze()
ids = test_df['Id'].values
submission_xgboost = pd.DataFrame({'Id' : ids, 'Cover_Type' : target})

In [39]:
submission_xgboost.head()

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2


In [40]:
submission_xgboost.to_csv('submission_xgboost.csv', index=False) # score 0.95378

In [41]:
del test_df, x_test
gc.collect()

42

# Neural Network Baseline

In [42]:
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',', skiprows=skip)

In [43]:
scaler = StandardScaler()
le = LabelEncoder()

x_train = train_df.drop(['Id', 'Soil_Type7','Soil_Type15', 'Cover_Type'], axis=1)

y_train = train_df['Cover_Type'].values 
y_train = le.fit_transform(y_train)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [44]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units=7, activation='softmax'))
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])

model.compile(optimizer=tf.keras.optimizers.Adam(), 
               loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=[tf.metrics.SparseCategoricalAccuracy()])

model.fit(x_train, y_train, batch_size=32, epochs=20)
model.evaluate(x_test, y_test)

2021-12-13 05:50:04.800350: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-13 05:50:04.801514: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-13 05:50:04.802150: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-13 05:50:04.803801: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.22573627531528473, 0.918037474155426]

<h3>Feature Engineering Neural Network Model</h3>

In [45]:
scaler = StandardScaler()
le = LabelEncoder()

results, names = list(), list()

for key in funcs.keys():
    x_train = train_df.drop(['Id', 'Soil_Type7','Soil_Type15', 'Cover_Type'], axis=1)
    if funcs[key] is not None:
        x_train[key] = funcs[key](x_train, axis=1)
    y_train = train_df['Cover_Type'].values
    y_train = le.fit_transform(y_train)
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size = 0.2, random_state = seed, shuffle=True)
    x_train = scaler.fit_transform(x_train.values)
    x_test = scaler.transform(x_test)    
    model = get_nn_model()
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    names.append(key)
    results.append(result)
    
for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

index = np.argmax(results)
print("Best Result: ", names[index], results[index])

>mean: 0.917763
>std: 0.920187
>var: 0.919288
>geo_mean: 0.917925
>harmonic_mean: 0.919088
>median: 0.917938
>None_feature_engineering: 0.917512
Best Result:  std 0.9201874732971191


<h3>Neural Network Pipelines</h3>

In [46]:
transformers = {'Min-Max-Scaler': MinMaxScaler(), 
                'Standard-Scaler': StandardScaler(),
                'QuantileTransformer': QuantileTransformer(n_quantiles=100, output_distribution='normal'),
                'PCA': PCA(),
                'TruncatedSVD': TruncatedSVD(),
                'PowerTransformer': PowerTransformer(),
                'No-transformer': None}

In [47]:
results, names = list(), list()

for key in transformers.keys():
    x_train = train_df.drop(['Id', 'Soil_Type7','Soil_Type15', 'Cover_Type'], axis=1)
    y_train = train_df['Cover_Type'].values
    y_train = le.fit_transform(y_train)
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)
    
    if transformers[key] is not None:
        x_train = transformers[key].fit_transform(x_train.values)
        x_test = transformers[key].transform(x_test)    
    
    model = get_nn_model()
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    names.append(key)
    results.append(result)
    
for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

index = np.argmax(results)
print("Best Result: ", names[index], results[index])

>Min-Max-Scaler: 0.904275
>Standard-Scaler: 0.918650
>QuantileTransformer: 0.902375
>PCA: 0.883762
>TruncatedSVD: 0.559200
>PowerTransformer: 0.915987
>No-transformer: 0.813275
Best Result:  Standard-Scaler 0.9186499714851379


# Neural Network - Testing different configurations

In [48]:
scaler = StandardScaler()
le = LabelEncoder()
x_train = train_df.drop(['Id', 'Soil_Type7','Soil_Type15', 'Cover_Type'], axis=1)
y_train = train_df['Cover_Type'].values 
x_train['std'] = np.std(x_train, axis=1)
y_train = le.fit_transform(y_train)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# 1 - Testing different number o layers

In [49]:
def get_models_n_layers():
    models = dict()
    for n_layers in [0, 1, 2, 3, 4, 5, 6, 7, 8 ,9, 10]:
        models[n_layers] = get_nn_model(n_layers=n_layers)
    return models

In [50]:
models =  get_models_n_layers()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    results.append(result)
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
n_layers = int(names[index])
print("Best number of layers", n_layers)

0 accuracy: 0.921
1 accuracy: 0.948
2 accuracy: 0.954
3 accuracy: 0.956
4 accuracy: 0.955
5 accuracy: 0.955
6 accuracy: 0.954
7 accuracy: 0.955
8 accuracy: 0.953
9 accuracy: 0.955
10 accuracy: 0.952
Best number of layers 3


# 2 - Testing different number of units

In [51]:
def get_models_n_units():
    models = dict()
    for n_units in [8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
        models[n_units] = get_nn_model(n_layers=n_layers, n_units=n_units)
    return models

In [52]:
models =  get_models_n_units()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    results.append(result)
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
n_units = int(names[index])
print("Best number of units", n_units)

8 accuracy: 0.947
16 accuracy: 0.951
32 accuracy: 0.956
64 accuracy: 0.956
128 accuracy: 0.953
256 accuracy: 0.954
512 accuracy: 0.954
1024 accuracy: 0.954
2048 accuracy: 0.954
Best number of units 64


# 3 - Testing different activation functions

In [53]:
def get_models_n_activations():
    models = dict()
    for activation in ["swish", "relu", "selu", "softplus", "elu"]:
        models[activation] = get_nn_model(n_layers=n_layers, n_units=n_units, activation=activation)
    return models

In [54]:
models = get_models_n_activations()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    results.append(result)
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
activation = names[index]
print("Best activation function", activation)

swish accuracy: 0.953
relu accuracy: 0.955
selu accuracy: 0.953
softplus accuracy: 0.956
elu accuracy: 0.954
Best activation function softplus


# 4 testing different number of batch

In [55]:
results, names = list(), list()
batches = [8, 16, 32, 64, 128, 256, 512]

for i, (name, batch_size) in enumerate(zip(batches, batches)):
    get_nn_model(n_layers=n_layers, n_units=n_units, activation=activation)
    model.fit(x_train, y_train, batch_size=batch_size, epochs=15, verbose=0)
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    results.append(result)
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
batch_size = int(names[index])
print("Best batch_size", batch_size)

8 accuracy: 0.954
16 accuracy: 0.955
32 accuracy: 0.955
64 accuracy: 0.955
128 accuracy: 0.956
256 accuracy: 0.955
512 accuracy: 0.955
Best batch_size 128


In [56]:
print("Best parameters")
print("n_layers:", n_layers)
print("n_units:", n_units)
print("activation:", activation)
print("batch_size:", batch_size)

Best parameters
n_layers: 3
n_units: 64
activation: softplus
batch_size: 128


# Submission

In [57]:
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',', skiprows=skip)
scaler = StandardScaler()
le = LabelEncoder()
x_train = train_df.drop(['Id', 'Soil_Type7','Soil_Type15', 'Cover_Type'], axis=1)
y_train = train_df['Cover_Type'].values 
x_train['std'] = np.std(x_train, axis=1)
y_train = le.fit_transform(y_train)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [58]:
model = get_nn_model(n_layers=n_layers, n_units=n_units, activation=activation)
model.fit(x_train, y_train, batch_size=batch_size, epochs=15)
score = model.evaluate(x_test, y_test, verbose=0)[1]
print(score)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
0.9544000029563904


In [59]:
del train_df, x_train, y_train, x_test, y_test
gc.collect()

26462

In [60]:
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/test.csv', sep=',')
x_test = test_df.drop(['Id', 'Soil_Type7','Soil_Type15'], axis=1)
x_test['std'] = np.std(x_test, axis=1)
x_test = scaler.transform(x_test)

In [61]:
preds = model.predict(x_test)
target = np.argmax(preds, axis=-1)
ids = test_df['Id'].values
submission_nn = pd.DataFrame({'Id' : ids, 'Cover_Type' : target + 1})

In [62]:
submission_nn = pd.DataFrame({'Id' : ids, 'Cover_Type' : target + 1})

In [63]:
submission_nn.head()

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2


In [64]:
submission_nn.to_csv('submission_nn.csv', index=False) # score 0.93079

In [65]:
del test_df, x_test
gc.collect()

799

# Ensemble XGBoost and Neural Network

In [66]:
df_submission_xgboost = pd.read_csv('submission_xgboost.csv')
df_submission_nn = pd.read_csv('submission_nn.csv')
ids = df_submission_xgboost['Id'].values
submission_ensemble = pd.DataFrame({'Id' : ids,
                           'Cover_Type' : np.array(df_submission_xgboost['Cover_Type'].values + df_submission_nn['Cover_Type'].values)//2})


In [67]:
submission_ensemble .head()

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2


In [68]:
submission_ensemble.to_csv('submission_ensemble.csv', index=False) # 0.93155

# Submission Best Model - XGboost

In [69]:
submission_xgboost.to_csv('submission.csv', index=False)