In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn.functional as F

In [2]:
data1 = np.load('/kaggle/input/da5401/embeddings_1.npy')
data2 = np.load('/kaggle/input/da5401/embeddings_2.npy')

X = np.concatenate((data1, data2), axis=0)


In [3]:
X.shape

(198982, 1024)

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

with open('/kaggle/input/da5401/icd_codes_1.txt', 'r') as file1, open('/kaggle/input/da5401/icd_codes_2.txt', 'r') as file2:
    y1 = [line.strip().split(';') for line in file1]
    y2 = [line.strip().split(';') for line in file2]

y_combined = y1 + y2

mlb = MultiLabelBinarizer()
mlb.fit(y_combined)

y_encoded_1 = mlb.transform(y1)
y_encoded_2 = mlb.transform(y2)

y_encoded = np.concatenate((y_encoded_1, y_encoded_2), axis=0)



In [5]:
y_encoded.shape

(198982, 1400)

In [6]:
X.shape

(198982, 1024)

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = scaler.fit_transform(X)

In [8]:
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y_encoded, dtype=torch.float32)

In [9]:
train_dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleAttentionCNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleAttentionCNN, self).__init__()
        
        # Single convolution layer
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        
        # Attention mechanism
        self.attention = nn.Sequential(
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Sigmoid()
        )
        
        # Fully connected layers for output
        self.fc1 = nn.Linear(input_dim * 64, 1024)
        self.bn_fc1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, output_dim)
        
        # Dropout layer
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # Reshape input to add channel dimension for CNN
        x = x.unsqueeze(1)
        
        # Convolutional layer
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        
        # Attention mechanism
        attn_weights = self.attention(x.mean(dim=2))
        x = x * attn_weights.unsqueeze(2)
        
        # Flatten the output for the fully connected layer
        x = x.view(x.size(0), -1)
        
        # Fully connected layer
        x = self.fc1(x)
        x = self.bn_fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        # Output layer with sigmoid activation for multi-label classification
        x = torch.sigmoid(self.fc2(x))
        
        return x


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
input_dim = X.shape[1]
output_dim = y_encoded.shape[1]
model = SimpleAttentionCNN(input_dim, output_dim).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [14]:
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/15], Loss: 0.0021
Epoch [2/15], Loss: 0.0027
Epoch [3/15], Loss: 0.0021
Epoch [4/15], Loss: 0.0017
Epoch [5/15], Loss: 0.0017
Epoch [6/15], Loss: 0.0013
Epoch [7/15], Loss: 0.0012
Epoch [8/15], Loss: 0.0008
Epoch [9/15], Loss: 0.0013
Epoch [10/15], Loss: 0.0016
Epoch [11/15], Loss: 0.0014
Epoch [12/15], Loss: 0.0014
Epoch [13/15], Loss: 0.0040
Epoch [14/15], Loss: 0.0040
Epoch [15/15], Loss: 0.0006


In [15]:
test = np.load('/kaggle/input/da5401/test_data.npy')

In [16]:
test.shape

(99490, 1024)

In [17]:
test = scaler.transform(test)

In [18]:
test = torch.tensor(test, dtype=torch.float32).to(device)

In [19]:
from torch.utils.data import DataLoader, TensorDataset

# Assuming test is a tensor containing your test data
batch_size = 64  # Adjust the batch size based on available memory
test_dataset = TensorDataset(test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Move model to evaluation mode
model.eval()
y_pred_all = []

with torch.no_grad():
    for batch in test_loader:
        batch = batch[0].cuda()  # Move batch to GPU, if using CUDA
        y_pred_prob = model(batch)
        y_pred_all.append((y_pred_prob > 0.5).float().cpu())  # Move predictions to CPU if needed

# Concatenate all batch predictions
y_pred = torch.cat(y_pred_all, dim=0)


In [20]:
predictions = mlb.inverse_transform(y_pred.cpu().numpy())

In [21]:
len(predictions)

99490

In [23]:


# Initialize a list to hold formatted rows
formatted_data = []

# Process each prediction
for idx, codes in enumerate(predictions, start=1):
    # Sort the codes lexicographically and join with ';'
    sorted_codes = ";".join(sorted(codes))
    # Add a dictionary entry with 'id' and 'labels'
    formatted_data.append({"id": idx, "labels": sorted_codes})

# Convert the list to a DataFrame
submission_df = pd.DataFrame(formatted_data)

# Write to CSV file
submission_df.to_csv('submission.csv', index=False)
