In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Column names handed to `pd.read_csv(..., names=cols)`, in file order:
# ten terrain measurements, four wilderness-area indicators, forty soil-type
# indicators, and finally the target column.
cols = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    # Wilderness_Area1 .. Wilderness_Area4
    *(f'Wilderness_Area{area}' for area in range(1, 5)),
    # Soil_Type1 .. Soil_Type40
    *(f'Soil_Type{soil}' for soil in range(1, 41)),
    'Cover_Type',
]

In [None]:
# Read the raw table, labelling its columns with `cols`.
df = pd.read_csv('covtype.csv', names=cols)

# Collapse the forty soil indicator columns into one integer column holding
# the 0-based position of the largest indicator in each row.
soils = [f'Soil_Type{i}' for i in range(1, 41)]
df['Soil_Type'] = df[soils].to_numpy().argmax(axis=1)

# Separate the target, then remove it and the now-redundant indicators so
# that `df` holds features only (with 'Soil_Type' as its last column).
y = df['Cover_Type'].values
df = df.drop(columns=soils + ['Cover_Type'])

In [None]:
# Preview the first rows of the feature frame (soil indicators collapsed
# into 'Soil_Type', target column already removed).
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)

## Boosting baseline

In [None]:
# Size of the training split as (rows, columns).
X_train.shape

In [None]:
# The alias must be `lgb`: the classifier below is referenced as
# `lgb.LGBMClassifier`, so any other alias raises NameError on a fresh kernel.
import lightgbm as lgb

# Gradient-boosting baseline with unlimited tree depth and 1000 boosting rounds.
clf = lgb.LGBMClassifier(max_depth = -1, n_estimators = 1000, n_jobs = 12, silent = False)

# 'Soil_Type' holds an integer category id, so it is declared categorical.
# eval_set is given as a list of (X, y) pairs, the form the API documents.
# NOTE(review): the evaluation set is the same hold-out split that is scored
# afterwards; it is only monitored here, not used for early stopping.
clf.fit(X_train, y_train, eval_set = [(X_test, y_test)], categorical_feature = ['Soil_Type'])

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy on the hold-out split of the classifier currently bound to `clf`.
pred = clf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
from catboost import CatBoostClassifier

# CatBoost baseline: up to 5000 trees, stopping once the evaluation metric
# has not improved for 15 consecutive rounds. Training is requested on GPU.
clf = CatBoostClassifier(
    learning_rate=0.1,
    n_estimators=5000,
    early_stopping_rounds=15,
    task_type="GPU",
)

# NOTE(review): early stopping watches the same hold-out split that is scored
# afterwards, so the reported accuracy is optimistic — confirm this is intended.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    cat_features=['Soil_Type'],
)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy on the hold-out split of the classifier currently bound to `clf`.
pred = clf.predict(X_test)
accuracy_score(y_test, pred)

## Baseline NN

In [None]:
# One-hot encode the integer labels for the Keras models. With no explicit
# num_classes, to_categorical sizes the encoding as (largest label + 1).
# NOTE(review): the Dense(8) output heads used later presumably rely on the
# labels being 1..7 (column 0 never set) in BOTH splits — confirm.
Y_train = tf.keras.utils.to_categorical(y_train)
Y_test = tf.keras.utils.to_categorical(y_test)

In [None]:
class Tabular(tf.keras.layers.Layer):
    """Three stacked stages of Dense(ReLU) -> BatchNormalization -> Dropout.

    Args:
        hidden_dimension: number of units of the first two Dense layers.
        output_dimension: number of units of the last Dense layer, i.e. the
            width of the tensor returned by ``call``.
        batch_momentum: momentum given to each BatchNormalization layer.
        rate: dropout rate of the single Dropout layer reused after every stage.
    """

    def __init__(self, hidden_dimension, output_dimension, batch_momentum, rate = 0.1):
        super().__init__()

        self.fc1 = tf.keras.layers.Dense(hidden_dimension, activation = 'relu')
        self.fc2 = tf.keras.layers.Dense(hidden_dimension, activation = 'relu')
        self.fc3 = tf.keras.layers.Dense(output_dimension, activation = 'relu')

        self.bn1 = tf.keras.layers.BatchNormalization(momentum=batch_momentum)
        self.bn2 = tf.keras.layers.BatchNormalization(momentum=batch_momentum)
        self.bn3 = tf.keras.layers.BatchNormalization(momentum=batch_momentum)

        # Dropout holds no weights, so one instance is shared by all stages.
        self.drop = tf.keras.layers.Dropout(rate)

    def call(self, features, training = True):
        """Run the three stages on ``features``.

        Args:
            features: input tensor fed to the first Dense layer.
            training: forwarded to every BatchNormalization and Dropout call.
                The flag is handed to the layers instead of being tested with
                a Python ``if``, so it also works when it is a tensor and the
                layers themselves decide how to behave in each mode.

        Returns:
            The output of the third stage (last axis = ``output_dimension``).
        """
        x = self.fc1(features)
        x = self.bn1(x, training = training)
        x = self.drop(x, training = training)

        x = self.fc2(x)
        x = self.bn2(x, training = training)
        x = self.drop(x, training = training)

        x = self.fc3(x)
        x = self.bn3(x, training = training)
        x = self.drop(x, training = training)
        return x
        

def find_tensor_mask(tensor, n_keep_features, temperature = 100):
    """Build a soft feature mask shared by every row of the batch.

    For each row, a threshold is placed halfway between the
    ``n_keep_features``-th and the next largest score. The per-row margins
    (score - threshold) are averaged over the batch axis, scaled by
    ``temperature`` and squashed with a sigmoid, so columns that usually rank
    above the threshold get values near 1 and the others values near 0.

    Args:
        tensor: rank-2 score tensor indexed as (row, column).
        n_keep_features: number of columns that should sit above the
            threshold; must be at least 1 and smaller than the column count.
        temperature: multiplier applied before the sigmoid; larger values
            make the mask closer to binary.

    Returns:
        A tensor with the same shape as ``tensor`` in which every row holds
        the same mask values.
    """
    # sorted=True is required: the threshold reads the two smallest of the
    # top (n_keep_features + 1) values by position, and the order of the
    # returned values is unspecified when sorted=False.
    k_best, _ = tf.math.top_k(tensor, k=n_keep_features+1, sorted=True)
    threshold = (k_best[:, -1] + k_best[:, -2]) / 2

    # Broadcasting subtracts each row's threshold from all of its columns.
    margins = tensor - tf.expand_dims(threshold, axis = 1)

    # Average over rows so that a single mask is shared across the batch.
    column_margin = tf.reduce_mean(margins, axis = 0, keepdims = True)
    mask_row = tf.keras.activations.sigmoid(column_margin * temperature)

    # Tile the single mask row back to the input's number of rows.
    return tf.repeat(mask_row, tf.shape(tensor)[0], axis = 0)

class TreeTabular(tf.keras.layers.Layer):
    """Multi-step encoder in which every step reads a softly masked input.

    Each step computes a feature mask from the raw input, multiplies the
    input by that mask, and passes the result through two ``Tabular`` blocks.
    The per-step outputs are concatenated on the last axis and passed through
    a final ``Tabular`` block.

    Args:
        num_steps: number of masked steps; must divide ``output_dim``.
        hidden_dim: hidden width of every ``Tabular`` block.
        input_dim: width of the input features (also the mask width).
        output_dim: width of the returned encoding.
        n_feature_per_steps: number of features each mask aims to keep; must
            be at least 1 and smaller than ``input_dim``.
        temperature: sharpness of the sigmoid used to build the masks.
        batch_momentum: momentum of the BatchNormalization layers.
        dropout_rate: dropout rate of the ``Tabular`` blocks.
    """

    def __init__(self, num_steps = 4, hidden_dim = 128, input_dim = 64,
                 output_dim = 64, n_feature_per_steps = 20,
                 temperature = 100, batch_momentum = 0.7, dropout_rate = 0.1):
        super().__init__()

        assert output_dim % num_steps == 0 , 'the output dimension must be a multiple of number of steps'
        # The mask helper takes the top (n_feature_per_steps + 1) of input_dim
        # scores and reads the last two, so fail here rather than inside call.
        assert 1 <= n_feature_per_steps < input_dim, 'n_feature_per_steps must be at least 1 and smaller than input_dim'

        self.solo_output_dim = output_dim // num_steps
        self.num_steps = num_steps
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_feature_per_steps = n_feature_per_steps
        self.temperature = temperature
        self.batch_momentum = batch_momentum
        self.dropout_rate = dropout_rate

        # One mask-scoring block per step; its output width equals the input
        # width so the mask can be multiplied element-wise with the features.
        self.mask_classif1 = [Tabular(hidden_dim, input_dim, batch_momentum, rate = dropout_rate) for _ in range(num_steps)]
#         self.mask_classif2 = [Tabular(hidden_dim, input_dim, batch_momentum, rate = dropout_rate) for _ in range(num_steps)]

        # Two stacked blocks per step turn the masked input into that step's
        # slice of the output.
        self.feature_builder1 = [Tabular(hidden_dim, self.solo_output_dim, batch_momentum, rate = dropout_rate) for _ in range(num_steps)]
        self.feature_builder2 = [Tabular(hidden_dim, self.solo_output_dim, batch_momentum, rate = dropout_rate) for _ in range(num_steps)]

        self.smooth_layer = Tabular(hidden_dim, output_dim, batch_momentum, rate = dropout_rate)

    def call(self, features, training = True):
        """Encode ``features``.

        Args:
            features: input tensor whose last axis has ``input_dim`` entries.
            training: forwarded to every ``Tabular`` block, including the
                final smoothing block.

        Returns:
            Tensor whose last axis has ``output_dim`` entries.
        """
        step_outputs = []

        for step in range(self.num_steps):

            mask = self.mask_classif1[step](features, training = training)
#             mask = self.mask_classif2[step](mask, training = training)
            mask = find_tensor_mask(mask, self.n_feature_per_steps, temperature = self.temperature)

            masked_features = features * mask

            out = self.feature_builder1[step](masked_features, training = training)
            out = self.feature_builder2[step](out, training = training)

            step_outputs.append(out)

        # A single concat op replaces instantiating a fresh Concatenate layer
        # on every loop iteration.
        output = tf.concat(step_outputs, axis = -1)

        output = self.smooth_layer(output, training = training)
        return output

In [None]:
# Shape of the one-hot label matrix as (rows, encoded classes).
Y_train.shape

In [None]:
# Split each feature matrix into the two arrays the Keras models take:
# every column but the last, and the last column kept 2-D with width 1.
X_train_r = np.split(X_train.values, [-1], axis=1)
X_test_r = np.split(X_test.values, [-1], axis=1)

In [None]:
# NOTE(review): star import from a local module; `NODE` below presumably
# comes from it. It may also shadow names defined in this notebook — consider
# importing the needed names explicitly.
from tabularnn import *
# Two inputs: the first 14 feature columns and the trailing 'Soil_Type' id.
inputs_num = tf.keras.Input(shape = (14,))
inputs_cat = tf.keras.Input(shape = (1,))

inputs = [inputs_num,inputs_cat]

# Embed the soil id (40 possible ids -> 10-d vector); [:,0,:] drops the
# length-1 axis added by the embedding lookup.
cat_emb = tf.keras.layers.Embedding(40, 10)(inputs_cat)[:,0,:]

# 10 embedding values + 14 numeric columns = 24 features per row.
agg = tf.keras.layers.Concatenate(axis = -1)([cat_emb, inputs_num])

# Alternative encoder (TreeTabular), currently disabled:
# encoder = TreeTabular(num_steps = 4, hidden_dim = 128, input_dim = 24,
#                  output_dim = 64, n_feature_per_steps = 10, 
#                  temperature = 100, batch_momentum = 0.7, dropout_rate = 0.1)


# Active encoder for this cell.
node = NODE(n_layers=1, units=7, depth=6, n_trees=512, link=tf.keras.activations.softmax)
encoded = node(agg)
# Alternative encoder (plain Tabular block), currently disabled:
# encoder = Tabular(hidden_dimension = 128, output_dimension = 64, batch_momentum = 0.7, rate = 0.1)

# encoded = encoder(agg, training = True)

# Softmax head with 8 outputs, matching the width of the one-hot labels.
pred = tf.keras.layers.Dense(8, activation = 'softmax')(encoded)
model = tf.keras.Model(inputs, pred)

In [None]:
# NOTE(review): star import from a local module; `TabNet` below presumably
# comes from it. The `import tensorflow as tf` that follows repeats the
# import at the top of the notebook.
from tabnet2 import *
import tensorflow as tf
# Encoder over the 24 aggregated features (10 embedding values + 14 numeric).
tabnet = TabNet(
        num_features = 24,
        feature_dim = 64,
        output_dim = 64,
        feature_columns = None,
        n_step = 5,
        n_total = 5,
        n_shared = 2,
        relaxation_factor = 1.5,
        bn_epsilon = 1e-5,
        bn_momentum = 0.7,
        bn_virtual_divider = 10,
    )
# tabnet.build((None, 24))
# Rebuild the inputs; this cell rebinds `model`, replacing any model built
# by an earlier cell.
inputs_num = tf.keras.Input(shape = (14,))
inputs_cat = tf.keras.Input(shape = (1,))
inputs = [inputs_num,inputs_cat]
# NOTE(review): `training = True` is passed to the Embedding call here but not
# in the other model cell; an embedding lookup presumably does not depend on
# the training flag — confirm and drop it for consistency.
cat_emb = tf.keras.layers.Embedding(40, 10)(inputs_cat, training = True)[:,0,:]
agg = tf.keras.layers.Concatenate(axis = -1)([cat_emb, inputs_num])
# The encoder returns a pair; only the first element feeds the classifier head.
encoded, masks = tabnet(agg)
pred = tf.keras.layers.Dense(8, activation = 'softmax')(encoded)
model = tf.keras.Model(inputs, pred)

In [None]:
# Layer-by-layer summary of whichever model was bound to `model` last.
model.summary()

In [None]:
# Scratch arithmetic. NOTE(review): presumably the training batch size (25000)
# divided by the TabNet `bn_virtual_divider` (10) — confirm, or delete the cell.
25000/10

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam, SGD

batch_size = 25000
epochs = 100

model.compile(
    optimizer = Adam(0.01),
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

# Stop after 9 epochs without a val_loss improvement of at least 1e-4 and
# roll back to the best weights seen.
early = EarlyStopping(
    monitor = 'val_loss',
    min_delta = 0.0001,
    patience = 9,
    verbose = 1,
    mode = 'auto',
    restore_best_weights = True,
)

# Multiply the learning rate by 0.2 after 4 epochs without improvement.
reduce = ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.2,
    patience = 4,
    verbose = 1,
    mode = 'auto',
    min_delta = 0.0001,
    cooldown = 0,
    min_lr = 0,
)

callbacks = [early, reduce]

# Largest row counts that are whole multiples of the batch size, so every
# batch seen during fitting and validation is full.
# NOTE(review): presumably required by the encoder's virtual batch splitting;
# `lt` becomes 0 (empty validation set) when the test split is smaller than
# one batch — confirm.
ls = (X_train_r[0].shape[0] // batch_size) * batch_size
lt = (X_test_r[0].shape[0] // batch_size) * batch_size

model.fit(
    [elt[:ls] for elt in X_train_r],
    Y_train[:ls],
    validation_data = ([elt[:lt] for elt in X_test_r], Y_test[:lt]),
    batch_size = batch_size,
    epochs = epochs,
    callbacks = callbacks,
)

In [None]:
from sklearn.metrics import accuracy_score

# Predict class probabilities for the full hold-out split, then take the
# index of the most probable class as the predicted label.
pred = model.predict(X_test_r, batch_size = 6400, verbose = 1)
pred = pred.argmax(axis = -1)

accuracy_score(y_test, pred)

In [None]:
## Baseline LGB 0.8628
## Baseline CatBoost 0.9424

## Baseline NN sans embeddings  0.818
## Baseline NN avec embeddings  0.8569
## Baseline TreeTab avec embeddings 0.8784
## Baseline Tabnet avec embeddings 0.9575
## Baseline Node avec embeddings


In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

# Column 14 of the feature matrix is 'Soil_Type' (the last of 15 columns).
# cat_dims must list the number of distinct values for each entry of
# cat_idxs; the soil id takes 40 values (0..39). Without cat_dims the
# categorical embedding is not set up for that column.
clf = TabNetClassifier(cat_idxs = [14], cat_dims = [40])  #TabNetRegressor()
# NOTE(review): the evaluation set is the same hold-out split that is scored
# afterwards — confirm this is intended.
clf.fit(
  X_train.values, y_train,
  eval_set=[(X_test.values, y_test)]
)
# Predict on the NumPy array, matching what `fit` received; the model is fed
# arrays everywhere else and does not accept a DataFrame here.
pred = clf.predict(X_test.values)

In [None]:
# Accuracy of the latest `pred` on the hold-out split; relies on
# `accuracy_score` having been imported by an earlier cell.
accuracy_score(y_test, pred)