In [None]:
# Loading datasets and libraries

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import torch
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from numpy import random
#from google.colab import files

!pip install torcheval
from torcheval.metrics.functional import multiclass_f1_score
!pip install imblearn
from imblearn.over_sampling import RandomOverSampler # , SMOTE

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torcheval
  Downloading torcheval-0.0.6-py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.4/158.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchtnt>=0.0.5
  Downloading torchtnt-0.1.0-py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.9/87.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyre-extensions
  Downloading pyre_extensions-0.0.30-py3-none-any.whl (12 kB)
Collecting typing-inspect
  Downloading typing_inspect-0.8.0-py3-none-any.whl (8.7 kB)
Collecting mypy-extensions>=0.3.0
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, typing-inspect, pyre-extensions, torchtnt, torcheval
Successfully installed mypy-extensions-1.0.0 pyre-extensions-0.0.30 torcheval-0.0.6 torchtnt-0.1.0 typing-inspect-0.8.0
Loo

In [None]:
# Load dataset

In [31]:
y = np.loadtxt("drive/MyDrive/beforesmotey")
X = np.loadtxt("drive/MyDrive/beforesmote")

In [32]:
class RNN(nn.Module):
    def __init__(self, hidden_dim,mlp_dim, drop):
   
        super().__init__()
        self.hidden_dim = hidden_dim
        # Attention
        self.attention = nn.MultiheadAttention(embed_dim = hidden_dim, num_heads = 1)
        # LSTM layer
        self.rnn = nn.LSTM(input_size = 1, hidden_size = hidden_dim, num_layers = 1)
        # last, fully-connected layer
        self.fc1 = nn.Linear(hidden_dim, mlp_dim) 
        self.fc2 = nn.Linear(mlp_dim, 5)
        self.logsoftmax = nn.LogSoftmax() 
        self.layer = nn.LayerNorm(hidden_dim)
        # Capa dropout 
        self.dropout = nn.Dropout(p=drop)
        self.relu = nn.ReLU()

    def forward(self, x):

        # LSTM
        _, last_state_tup =  self.rnn(x)
        last_state = last_state_tup[0]
        # Attention
        x = self.attention(last_state,last_state,last_state,need_weights = False)[0]
        # Add
        #x += last_state

        # Normalize
        x = self.layer(x)

        # MLP with dropout and relu
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        # Softmax - not tunable
        output = self.logsoftmax(x)
        return output

    def set_type_test(self, x):

        # Dont change it!
        x = np.trim_zeros(x, 'b')
        x = torch.Tensor(x).view(-1,1)
        x = x.to(torch.float32)
        return x

In [33]:
class RNN_with_train(RNN):
    
    def __init__(self,hidden_dim, mlp_dim, drop=0.5,weights =[ 1.0,2.3,5,9,5 ] ,batch_size=32,lr=0.0001,saved_files='drive/MyDrive/saved_models/'):
        # weights must be floats!

        super().__init__(hidden_dim, mlp_dim, drop)  
        
        self.lr = lr # Learning Rate
  
        self.optim = optim.Adam(self.parameters(), self.lr) # Optimizer
        
        self.criterion = nn.NLLLoss(weight = torch.tensor(weights))              
        
        self.loss_during_training = [] 

        self.valid_loss_during_training = [] 
        
        self.batch_size = batch_size
        
        self.saved_files = saved_files

        # difficult to get it working
        #self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        #self.to(self.device)
            
    def trainloop(self, X, Y, valid = False, X_val= None, Y_val= None,epochs= 40,print_every=1):
        
        self.print_every = print_every
        self.epochs=epochs
        n_batch = int(X.shape[0]/self.batch_size)

        for e in range(int(self.epochs)):

          self.train() # Activate dropout
          # random permutation (optional)
          id_perm = np.random.permutation(np.arange(X.shape[0]))
          for nul in range(n_batch):
            # either one - look up random permutation
            idx = id_perm[nul*self.batch_size:(nul+1)*self.batch_size]
            #idx = random.choice(np.arange(X.shape[0]), self.batch_size) # choose element to train given a label
            labels = torch.Tensor(Y[idx]).type(torch.LongTensor)
            #labels.to(self.device)
            outputs = torch.zeros(self.batch_size,5)
            k = 0
            for b in idx:

              running_loss = 0.
              self.optim.zero_grad() 
              x = X[b,:]
              x = np.trim_zeros(x, 'b')
              x = torch.Tensor(x).view(-1,1)
              x = x.to(torch.float32)
              outputs[k, :] = self.forward(x)
              k += 1
              #print([out, class_number])
            #outputs.to(self.device)
            loss = self.criterion(outputs ,labels)
            running_loss += loss.item()
            loss.backward()
            # Gradient clipping
            nn.utils.clip_grad_norm_(self.parameters(), 2.0)
            # SGD steps
            self.optim.step()


          self.loss_during_training.append(running_loss) # no self batch size to better see
          if valid:
            self.eval()

            with torch.no_grad():
              labels = torch.Tensor(Y_val).type(torch.LongTensor)
              #labels = labels.to(self.device)
              outputs = torch.zeros(X_val.shape[0],5)
              for k in range(X_val.shape[0]):
                outputs[k, :] = self.forward_test(X_val[k,:])
              #outputs.to(self.device)
              loss = self.criterion(outputs ,labels)
              running_loss += loss.item()
              self.valid_loss_during_training.append(running_loss) # no self batch size to better see
              val_F1 = multiclass_f1_score(outputs, labels, num_classes=5, average = "macro")
          if(e % self.print_every == 0):
            if valid:
              print(f"Training loss after {e+1} epochs: {self.loss_during_training[-1]}, F1:{val_F1}, valid:{self.valid_loss_during_training[-1]}")
            else:
              print(f"Training loss after {e+1} epochs: {self.loss_during_training[-1]}")
            # We save model parameters  
            torch.save(self.state_dict(), self.saved_files+'_epoch_'+str(e+1)+'.pth')

    def evaluate(self, X, load = None, save= None):
      
      if load is not None:
        state_dict = torch.load(load)
        self.load_state_dict(state_dict)
      with torch.no_grad():
        self.eval()
        pred_classes = [torch.argmax(self.forward(self.set_type_test(X[k,:]))) for k in range(X.shape[0])]
        pred_classes = np.array(pred_classes)
        ids = np.arange(len(pred_classes))
        results = pd.DataFrame({'ID': ids, 'Pred_Class': pred_classes})
        if save is not None:
          results.to_csv(save, index=False)
      return results



In [None]:
#rnn = RNN_with_train(64,16, lr = 0.001, batch_size = 64) 
rnn.trainloop(X, y,False, epochs=10, print_every=1) # idk how many epochs

  output = self.logsoftmax(x)


In [None]:
# Test dataset preproccesing

In [6]:
df2 = pd.read_csv('drive/MyDrive/Test_set.csv')
df2.fillna(0, inplace = True)
ids = df2['ID']
X_test = df2.drop(['ID'], axis=1)
X_test = np.array(X_test)

def expand_linear(time_series):
  left = 0
  right = 0
  for k in range(1, len(time_series)):
    if time_series[k] == 0:
      left = time_series[k-1]
      l = 0
      while time_series[k+l] == 0:
        l+=1
        if len(time_series) == k+l:
          return time_series
      right = time_series[k+l]
      time_series[k: k+l] = np.linspace(left, right, num=l+2)[1:(l+1)]
  return time_series
  # Scale but no 0s
#nonX_test = X_test.reshape(-1)
#nonX_test = nonX_test[nonX_test!=0]
mean = 0.2949385572269969
std = 0.22667829935749878
for r in range(X_test.shape[0]):
  for c in range(X_test.shape[1]):
    if X_test[r,c] != 0:
      X_test[r,c] -= mean
X_test = X_test/std

for k in range(X_test.shape[0]):
  expand_linear(X_test[k,:])

# X_test is the data after preprocessing in a numpy array

In [None]:
# Evaluation

In [28]:
rnn = RNN_with_train(128,64, lr = 0.0003, batch_size = 32) 
rnn.evaluate(X_test, load = "drive/MyDrive/saved_models/_epoch_8.pth", save = 'works.csv')

In [None]:
# Comparison to the best solution 
# You need at least 0.88 score to upload to kaggle 

In [13]:
gabriele = pd.read_csv('sample_submission_deep.csv')
print(sum(gabriele["Pred_Class"] == results["Pred_Class"])) 
ratio_gabriele = gabriele["Pred_Class"].value_counts()
#ratio = np.array(pd.Series(y).value_counts()/X.shape[0]*22000)
ratio_est = results["Pred_Class"].value_counts()
df_ratio = pd.DataFrame({"gabriele":ratio_gabriele, "current": ratio_est})
print(multiclass_f1_score(torch.tensor(results["Pred_Class"]), torch.tensor(gabriele["Pred_Class"]), num_classes=5, average = "macro"))
df_ratio