In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import  f1_score

In [11]:
# Load data: read the labelled training table from the working directory
train_data = pd.read_csv(filepath_or_buffer='train.csv')

In [12]:
# Extract features and labels
# `training_dataset` keeps the label next to the features (only the row id is
# removed); `X` is the same frame without the label, `y` is the label column.
y = train_data['label']
training_dataset = train_data.drop(columns=['id'])
X = training_dataset.drop(columns=['label'])

In [13]:
# Classifier list
# Both tree ensembles sample rows/columns at random, so they are given a fixed
# random_state to make the cross-validation scores repeatable between runs.
xgb_clf = XGBClassifier(
    learning_rate=0.3,
    n_estimators=200,
    max_depth=7,
    min_child_weight=13,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=8,
    scale_pos_weight=1,
    # Stops adding trees once the eval_set AUC has not improved for 30 rounds;
    # fit() must therefore be called with an eval_set.
    early_stopping_rounds=30,
    eval_metric='auc',
    random_state=42)
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    min_samples_split=10,
    min_samples_leaf=1,
    n_jobs=8,
    random_state=42)


In [14]:
# MLP
class MLPClassifier(nn.Module):
    """Pairwise classifier over two integer ids.

    Each id is looked up in a shared embedding table and passed through a
    shared hidden stack; the two resulting vectors are concatenated and fed
    to a sigmoid output head.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Build the embedding, the shared hidden stack and the output head.

        Args:
            input_size: number of rows in the embedding table; every id fed
                to ``forward`` must lie in ``[0, input_size)``.
            hidden_size: embedding width and width of the hidden layers.
            output_size: number of sigmoid outputs produced per id pair.
        """
        super(MLPClassifier, self).__init__()
        # Width of one branch after the hidden stack. The head consumes two
        # concatenated branches, so its input width is 2 * branch_size, which
        # only equals hidden_size when hidden_size is even.
        branch_size = hidden_size // 2

        self.input = nn.Sequential(
            nn.Embedding(input_size, hidden_size),
        )
        self.hidden = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, branch_size),
            nn.ReLU()
        )

        self.output = nn.Sequential(
            nn.Linear(2 * branch_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.Sigmoid()
        )

    def forward(self, data1, data2):
        """Score a batch of id pairs.

        Args:
            data1: 1-D integer tensor with the first id of each pair.
            data2: 1-D integer tensor with the second id of each pair.

        Returns:
            Tensor of sigmoid outputs. The trailing dimension is dropped when
            ``output_size`` is 1, so the result has shape ``(batch,)``.
        """
        data1 = self.hidden(self.input(data1))
        data2 = self.hidden(self.input(data2))
        data = torch.cat((data1, data2), dim=1)
        # Squeeze only the output dimension: a bare squeeze() would also drop
        # the batch dimension for a single-sample batch and return a 0-d tensor.
        label = self.output(data).squeeze(-1)
        return label

In [15]:
# PyTorch dataset
class NodeDataset(Dataset):
    """Row-wise view of a frame that has 'id1', 'id2' and 'label' columns."""

    def __init__(self, data):
        # The frame is kept by reference; nothing is copied or converted here.
        self.data = data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the (id1, id2, label) values of the row at position `index`."""
        row = self.data.iloc[index]
        return row['id1'], row['id2'], row['label']

In [16]:
# Create a data loader
batch_size = 2048
train_dataloader = DataLoader(NodeDataset(training_dataset), batch_size=batch_size, shuffle=True)
# Set network parameters
# input_size is the number of rows in the embedding table, so it must exceed
# the largest id that is ever looked up. The row count alone does not guarantee
# that (an id >= number of rows would fail inside nn.Embedding), so the table
# is sized to whichever is larger: the row count or max id + 1.
max_id = int(training_dataset.drop(columns=['label']).to_numpy().max())
input_size = max(training_dataset.shape[0], max_id + 1)
hidden_size = 128
output_size = 1

In [None]:
# GPU
# Use the first CUDA device when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# Five-fold cross-validation of XGBoost, random forest, the MLP and their weighted vote.
# Every score collected below is an F1 score on the held-out fold.
n_splits = 5
# Fixed seed so the fold assignment is the same on every run
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
num_epochs = 10
# Voting weights, in the order predictions are appended per fold: xgb, rf, mlp
weights = [0.4, 0.2, 0.4]
threshold = 0.5
# Column names NodeDataset expects; applied positionally to (features..., label)
fold_columns = ['id1', 'id2', 'label']
f1_scores = []
xgb_f1 = []
rf_f1 = []
mlp_f1 = []
for fold, (train_index, valid_index) in enumerate(kf.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    y_valid_preds = []
    print(f'Fold {fold+1}')

    # xgb
    # NOTE(review): the validation fold doubles as the early-stopping eval_set,
    # so the xgb score is selected on the data it is reported on and may be optimistic.
    xgb_clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
    y_valid_pred = xgb_clf.predict(X_valid)
    y_valid_preds.append(y_valid_pred)
    f1 = f1_score(y_valid, y_valid_pred)
    xgb_f1.append(f1)
    print(f"fold:[{fold+1}/{n_splits}],xgb-f1={f1}\n")

    # rf
    rf.fit(X_train, y_train)
    y_valid_pred = rf.predict(X_valid)
    y_valid_preds.append(y_valid_pred)
    f1 = f1_score(y_valid, y_valid_pred)
    rf_f1.append(f1)
    print(f"fold:[{fold+1}/{n_splits}],rf-f1={f1}\n")

    # mlp: a fresh network, loss and optimizer for every fold
    mlp_clf = MLPClassifier(input_size, hidden_size, output_size).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(mlp_clf.parameters(), lr=0.001)
    # Per-fold (features, label) tables. The arrays get fold-local names so the
    # notebook-level `train_data` DataFrame is not overwritten by this cell.
    train_array = np.column_stack((X_train.to_numpy(), y_train.to_numpy()))
    train_dataset = pd.DataFrame(train_array, columns=fold_columns)
    valid_array = np.column_stack((X_valid.to_numpy(), y_valid.to_numpy()))
    valid_dataset = pd.DataFrame(valid_array, columns=fold_columns)
    # Validation is not shuffled, so MLP predictions line up row-by-row with y_valid
    train_dataloader = DataLoader(NodeDataset(train_dataset), batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(NodeDataset(valid_dataset), batch_size=batch_size, shuffle=False)

    # train mlp
    mlp_clf.train()
    for epoch in range(num_epochs):
        for i, (data1, data2, labels) in enumerate(train_dataloader):
            # Embedding lookups need integer indices; the cast is a no-op when
            # the ids already arrive as int64.
            data1 = data1.long().to(device)
            data2 = data2.long().to(device)
            labels = labels.float().to(device)

            optimizer.zero_grad()

            outputs = mlp_clf(data1, data2)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            if i % 10 == 0:
                # i * batch_size = samples consumed before this step (the last
                # batch may be shorter, so len(data1) would under-report it)
                print('fold [{}/{}],Epoch [{}/{}], Step [{}/{}], Loss: {:.10f}'
                      .format(fold+1, n_splits, epoch+1, num_epochs, i*batch_size,
                              len(train_dataloader.dataset), loss.item()))

    # eval mlp
    mlp_clf.eval()
    y_pred_list = []
    y_true_list = []
    with torch.no_grad():
        for data1, data2, labels in valid_dataloader:
            data1 = data1.long().to(device)
            data2 = data2.long().to(device)

            output = torch.round(mlp_clf(data1, data2))
            # reshape(-1) guarantees a 1-D tensor, so tolist() yields a list
            # even when the final batch holds a single sample.
            y_pred_list.extend(output.reshape(-1).tolist())
            y_true_list.extend(labels.tolist())
    f1 = f1_score(y_true_list, y_pred_list)
    mlp_f1.append(f1)
    print(f"fold:[{fold+1}/{n_splits}],mlp-f1={f1}\n")
    y_valid_preds.append(y_pred_list)

    # Weighted vote: one column of 0/1 predictions per model, averaged with `weights`
    stacked_val_preds = np.column_stack(y_valid_preds)
    weighted_votes = stacked_val_preds.astype(float) * np.asarray(weights)
    weighted_sum = weighted_votes.sum(axis=1) / np.sum(weights)
    y_valid_pred = np.where(weighted_sum > threshold, 1, 0)
    f1 = f1_score(y_valid, y_valid_pred)
    f1_scores.append(f1)
    print(f"Ensemble F1 score in fold {fold+1} of {n_splits}-fold cross validation:{f1}")

# Per-model and ensemble F1 summary
print(f"xgb F1 scores in {n_splits}-fold cross validation:", xgb_f1)
print(f"Mean xgb F1 in {n_splits}-fold cross validation:", np.mean(xgb_f1))
print(f"rf F1 scores in {n_splits}-fold cross validation:", rf_f1)
print(f"Mean rf F1 in {n_splits}-fold cross validation:", np.mean(rf_f1))
print(f"mlp F1 scores in {n_splits}-fold cross validation:", mlp_f1)
print(f"Mean mlp F1 in {n_splits}-fold cross validation:", np.mean(mlp_f1))
print(f"Ensemble F1 scores in {n_splits}-fold cross validation:", f1_scores)
print(f"Mean ensemble F1 in {n_splits}-fold cross validation:", np.mean(f1_scores))