In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
# Read the training and test tables from the CSV files in the working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

In [3]:
# Tabular inputs for the tree models: every column except the row id (and the target).
X = train_data.drop(columns=['id', 'label'])
y = train_data['label']
test_X = test_data.drop(columns='id')

# Frames consumed by the PyTorch datasets: only the row id is removed, so the
# training frame still carries its 'label' column.
training_dataset = train_data.drop(columns='id')
test_dataset = test_data.drop(columns='id')

In [4]:
# Tree-based members of the ensemble. Their order is significant: the vote weights
# applied to the predictions are matched to this list by position.
classifiers = [
    XGBClassifier(
        learning_rate=0.3,
        n_estimators=200,
        max_depth=7,
        min_child_weight=13,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=8,
        scale_pos_weight=1,
        eval_metric='auc',
    ),
    RandomForestClassifier(
        n_estimators=200,
        max_depth=50,
        min_samples_split=10,
        min_samples_leaf=1,
        n_jobs=8,
    ),
]

In [5]:
#mlp
class MLPClassifier(nn.Module):
    """Embedding-based MLP that scores a pair of integer ids.

    Both ids are looked up in one shared embedding table and pushed through the
    same hidden stack; the two resulting vectors are concatenated and mapped to
    sigmoid outputs.

    Args:
        input_size: number of rows in the embedding table. Every id passed to
            ``forward`` must lie in ``[0, input_size)``.
        hidden_size: embedding width and width of the hidden layers.
        output_size: number of sigmoid outputs produced per pair.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Width of each branch after the hidden stack. The output head receives two
        # branches side by side, so its input width is 2 * branch_size; this equals
        # hidden_size only when hidden_size is even.
        branch_size = hidden_size // 2

        self.input = nn.Sequential(
            nn.Embedding(input_size, hidden_size),
        )
        self.hidden = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, branch_size),
            nn.ReLU()
        )

        self.output = nn.Sequential(
            nn.Linear(2 * branch_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.Sigmoid()
        )

    def forward(self, data1, data2):
        """Score each ``(data1[i], data2[i])`` pair.

        Args:
            data1: 1-D integer tensor of ids for the first element of each pair.
            data2: 1-D integer tensor of ids for the second element, same length.

        Returns:
            Tensor of sigmoid scores. With ``output_size == 1`` the shape is
            ``(batch,)`` for every batch size, including a batch of one.
        """
        data1 = self.hidden(self.input(data1))
        data2 = self.hidden(self.input(data2))
        data = torch.cat((data1, data2), dim=1)
        # Squeeze only the trailing feature axis. A bare squeeze() would also remove
        # the batch axis when the batch holds a single row, yielding a 0-d tensor.
        label = self.output(data).squeeze(-1)
        return label

In [6]:
# pytorch data
class NodeDataset(Dataset):
    """Map-style dataset that serves ``(id1, id2, label)`` from a DataFrame, row by row."""

    def __init__(self, data):
        # The frame is stored as given; rows are looked up by position on demand.
        self.data = data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the 'id1', 'id2' and 'label' values of the row at position ``index``."""
        row = self.data.iloc[index]
        return row['id1'], row['id2'], row['label']

In [7]:
# Create a data loader
batch_size = 2048
train_dataloader = DataLoader(
    NodeDataset(training_dataset),
    batch_size=batch_size,
    shuffle=True,
    # BatchNorm1d cannot compute batch statistics from a single sample in training
    # mode, so the trailing batch is dropped only when it would hold exactly one row.
    drop_last=(len(training_dataset) % batch_size == 1),
)
#Set network parameters
# The embedding table needs one row per possible node id. Size it from the largest
# id present in either split (ids index the table directly), not from the number of
# training rows, which is unrelated to the id range.
id_columns = ['id1', 'id2']
input_size = int(max(
    training_dataset[id_columns].to_numpy().max(),
    test_dataset[id_columns].to_numpy().max(),
)) + 1
hidden_size = 128
output_size = 1

In [None]:
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

In [9]:
# Build the pair classifier and move its parameters onto the selected device.
mlp_clf = MLPClassifier(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
mlp_clf = mlp_clf.to(device)
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=mlp_clf.parameters(), lr=1e-3)

In [None]:
# train mlp
num_epochs = 10
mlp_clf.train()  # training mode: dropout active, batch-norm uses per-batch statistics
for epoch in range(num_epochs):
    for i, (data1, data2, labels) in enumerate(train_dataloader):
        data1 = data1.to(device)
        data2 = data2.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = mlp_clf(data1, data2)
        # BCELoss needs floating-point targets; the labels arrive with the frame's dtype.
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        if i % 10 == 0:
            # Samples consumed before this step. Every earlier batch is full-sized, so
            # multiply by the loader's batch size; the current batch may be the short
            # final one, which would under-report progress.
            samples_seen = i * train_dataloader.batch_size
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.10f}'
                  .format(epoch+1, num_epochs, samples_seen, len(train_dataloader.dataset), loss.item()))

In [11]:
class NodeTestset(Dataset):
    """Map-style dataset that serves unlabeled ``(id1, id2)`` pairs from a DataFrame."""

    def __init__(self, data):
        # The frame is stored as given; rows are looked up by position on demand.
        self.data = data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the 'id1' and 'id2' values of the row at position ``index``."""
        row = self.data.iloc[index]
        return row['id1'], row['id2']
# Unshuffled loader, so batches follow the row order of test_dataset.
test_dataloader = DataLoader(
    dataset=NodeTestset(test_dataset),
    batch_size=batch_size,
    shuffle=False,
)

In [12]:
#mlp pred
mlp_clf.eval()  # inference mode: dropout off, batch-norm uses running statistics
y_pred_list = []
with torch.no_grad():
    for data1, data2 in test_dataloader:
        data1 = data1.to(device)
        data2 = data2.to(device)
        output = mlp_clf(data1, data2)
        # Round each score to a hard 0/1 vote. reshape(-1) keeps the result 1-D even if
        # the model hands back a 0-d tensor for a single-row final batch; tolist() on a
        # 0-d tensor returns a bare float, which cannot be appended as a list.
        output = torch.round(output).reshape(-1)
        y_pred_list.extend(output.tolist())

In [None]:
# Fit every tree-based classifier on the tabular training features.
for classifier in classifiers:
    print(f'train {classifier!s}')
    classifier.fit(X, y)

In [14]:
# Vote weight per model, matched by position: one entry for each item of
# `classifiers` (in list order), followed by one for the MLP.
weights = [0.4, 0.2, 0.4]
all_val_preds = [classifier.predict(test_X) for classifier in classifiers]
all_val_preds.append(y_pred_list)
#stack preds
# One row per test sample, one column per model.
stacked_val_preds = np.column_stack(all_val_preds)
# Fail loudly if the weights and the models get out of step; otherwise a surplus
# column would silently receive zero weight while still being normalised against
# the full weight total.
if stacked_val_preds.shape[1] != len(weights):
    raise ValueError(
        f'expected {len(weights)} prediction columns to match weights, '
        f'got {stacked_val_preds.shape[1]}'
    )
# Broadcasting scales column j by weights[j].
weighted_votes = stacked_val_preds * np.asarray(weights, dtype=float)
weighted_sum = np.sum(weighted_votes, axis=1) / np.sum(weights)  # Divide by the sum of the weights because they are not assumed to be normalized

In [15]:
# A sample is labelled 1 when its normalised weighted vote is strictly above the threshold.
threshold = 0.5
test_weighted_vote = (weighted_sum > threshold).astype(int)
# finally pred
# Attach the labels to the test frame, then drop the two node-id columns for the output.
test_data['label'] = test_weighted_vote
test_final = test_data.drop(columns=['id1', 'id2'])

In [16]:
# Write the final frame to disk, leaving out the DataFrame index.
test_final.to_csv(path_or_buf='submission.csv', index=False)