# Dataset and Dataloader

In [3]:
import numpy as np
import pandas as pd
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import *

In [46]:
class CancerDataset(Dataset):
    
    # constructor
    def __init__(self, df_X, df_y):
        assert len(df_X) == len(df_y)
        
        self.data_list = torch.from_numpy(df_X.values).float()
        self.target_list = torch.from_numpy(df_y.values).float()
        
        assert(len(self.data_list) == len(self.target_list))
      
    # return the length of dataset
    def __len__(self):
        return len(self.data_list)
    
    # return the key-th element of dataset
    def __getitem__(self, key):
        
        return self.data_list[key], self.target_list[key]
    

In [47]:
df = pd.read_csv('train_ml2_2021.csv')
X = df.drop(columns=['target', 'problem_id'])
y = df.target

In [48]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=1)

In [49]:
train_dataset = CancerDataset(X_train, y_train)
valid_dataset = CancerDataset(X_val, y_val)

batch_size=100
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

In [50]:
class CancerPredictor(nn.Module):
    
    # constructor
    def __init__(self, input_size, hidden_size, n_class):
        super().__init__()
        
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.input_relu = nn.ReLU()
        self.hidden_linear = nn.Linear(hidden_size, n_class)
        self.soft_max = nn.Softmax(dim=-1)

    
    def forward(self, inputs):
        out = None
        out = self.input_layer(inputs)
        out = self.input_relu(out)
        out = self.hidden_linear(out)
        out = self.soft_max(out)
        
        return out


In [51]:
n, input_size = X.shape
hidden_size = 300
n_class = len(y.unique())

In [52]:
model = CancerPredictor(input_size, hidden_size, n_class)

In [53]:
for i, (data_batch, target_batch) in enumerate(train_loader):
    device = "cpu"
    preds = model(data_batch.to(device))
    print(preds)
    # print(model(data_batch))
    input()

tensor([[0.1885, 0.2037, 0.1977, 0.1524, 0.2577],
        [0.1804, 0.2021, 0.2144, 0.1530, 0.2500],
        [0.2122, 0.1918, 0.2156, 0.1587, 0.2217],
        [0.2012, 0.1908, 0.2166, 0.1554, 0.2359],
        [0.1996, 0.1982, 0.2286, 0.1748, 0.1989],
        [0.2013, 0.1896, 0.1967, 0.1873, 0.2251],
        [0.2097, 0.1967, 0.2085, 0.1722, 0.2130],
        [0.2128, 0.2110, 0.2108, 0.1580, 0.2075],
        [0.1972, 0.2008, 0.1919, 0.1682, 0.2419],
        [0.1933, 0.1989, 0.2190, 0.1635, 0.2252],
        [0.2158, 0.1976, 0.1980, 0.1584, 0.2301],
        [0.1917, 0.1897, 0.2274, 0.1661, 0.2251],
        [0.1875, 0.1942, 0.2034, 0.1785, 0.2365],
        [0.2170, 0.1788, 0.1939, 0.1787, 0.2317],
        [0.1979, 0.2063, 0.1824, 0.1757, 0.2377],
        [0.2023, 0.1922, 0.2162, 0.1670, 0.2223],
        [0.2141, 0.2094, 0.1969, 0.1594, 0.2201],
        [0.1974, 0.2051, 0.1928, 0.1700, 0.2346],
        [0.1859, 0.1814, 0.2118, 0.1614, 0.2595],
        [0.1996, 0.1940, 0.2034, 0.1761, 0.2270],


KeyboardInterrupt: Interrupted by user