# Baseline Code for HW1

This is just the baseline code to set up the basic function you need. You need to modify the code yourself to achieve a better result.

## Import packages you need

In [1]:
# import package 
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import csv

In [2]:
# Setting seeds
myseed=666
torch.manual_seed(myseed)
# if you use numpy
np.random.seed(myseed)

## Basic Function
Do not modify this part

In [3]:
class EarlyStopper(object):
    def __init__(self, num_trials, save_path):
        self.num_trials = num_trials
        self.trial_counter = 0
        self.best_loss = 1000000.0
        self.save_path = save_path

    def is_continuable(self, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.trial_counter = 0
            torch.save(model.state_dict(), self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False

def cal_loss(model, loader):
    pres = []
    labels = []
    for field, label in loader:
        field, label = field.float(), label.float()
        prediction = model(field)
        pres.append(prediction)
        labels.append(label)
    pres = torch.cat(pres, dim=0)
    labels = torch.cat(labels, dim=0)
    loss = criterion(pres, labels)
    return loss

def predict(test_loader, model):
    pres = []
    for field in test_loader:
        field = field.float()
        prediction = model(field)
        pres.append(prediction)
    pres = torch.cat(pres, dim=0)
    return pres.detach().numpy()


In [4]:
# Loading dataset
# class ReadDataset(Dataset):
#     def __init__(self, path, is_test=False):
#         super().__init__()
#         self.is_test = is_test
#         self.field = pd.read_csv(path, index_col=0)
#         self.field.dropna(axis=0, how='any', inplace=True)
#         assert not self.field.isnull().values.any()
#         self.field = self.field.values
#         if not is_test:
#             self.label = self.field[:,-1]
#             self.field = self.field[:,:-1]

#     def __len__(self):
#         return len(self.field)

#     def __getitem__(self, item):
#         field = self.field[item]
#         if not self.is_test:
#             label = self.label[item]
#             return field, label
#         return field

In [57]:
class ReadDataset(Dataset):
    def __init__(self, path, is_test=False, mean=None, std=None):
        super().__init__()
        self.is_test = is_test
        self.field = pd.read_csv(path, index_col=0)
        self.field.dropna(axis=0, how='any', inplace=True)
        assert not self.field.isnull().values.any()
        
        if not is_test:
            self.labels = self.field.values[:,-1]
            self.features = self.field.values[:,:-1]
            self.mean = np.mean(self.features, axis=0)
            self.std = np.std(self.features, axis=0)
        else:
            assert mean is not None and std is not None, "Mean and std must be provided for test fieldset."
            self.features = self.field.values
            self.labels = None 
            self.mean = mean
            self.std = std

    def __len__(self):
        return len(self.features)

    def __getitem__(self, item):
        normalized_features = (self.features[item] - self.mean) / self.std
        if not self.is_test:
            label = self.labels[item]
            return normalized_features, label
        return normalized_features

## Define DNN by pytorch

In [58]:
class Net(nn.Module):
    def __init__(self, neurons, selected_idx):
        super(Net, self).__init__()
        # Feature selection, only use partial cols for modelling
        self.selected_idx = selected_idx
        
        # 1 hidden layer
        self.fc1 = nn.Linear(len(selected_idx), neurons)  
        self.relu1 = nn.ReLU()
        
#         self.dropout = nn.Dropout(0.1)
        
#         self.fc2 = nn.Linear(30, 20)  
#         self.relu2 = nn.ReLU()
        
        # Output layer, sigmoid probability
        self.fc3 = nn.Linear(neurons, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        x = x[:, self.selected_idx]
        x = self.fc1(x)
        x = self.relu1(x)
         
#         x = self.dropout(x)    
#         x = self.fc2(x)
#         x = self.relu2(x)
     
        x = self.fc3(x)
        x = self.sig(x).squeeze() 
        return x

# Selecting Appropriate Hyperparameters

In [59]:
# hyper-parameters
# your batch size
batch_size = 1500
# your learning rate
lr = 0.005
# a parameter used for splitting train and validation set
split_ratio = 0.9
# maximum training epochs
epochs = 500
# early stop step in training
num_trials = int(epochs * 0.5)
# path for your trained model
save_path = "./model.pt"
# path for your predictions
test_path = "predictions.txt"

# Generate Dataset

In [60]:
# Loading dataset
train_data = ReadDataset("./data/train.csv")
test_data = ReadDataset("./data/test.csv", is_test=True, mean=train_data.mean, std=train_data.std)
len_train = len(train_data)

split_num = [int(len_train*split_ratio), len_train-int(len_train*split_ratio)]
train_data, val_data = random_split(
                            dataset=train_data,
                            lengths=split_num,
                            generator=torch.Generator().manual_seed(myseed)
                        )
print("Num of Samples: Train: {}, Validation: {}, Test: {}".format(len(train_data), len(val_data), len(test_data)))

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

Num of Samples: Train: 13473, Validation: 1498, Test: 9982


# Feature Selection

In [61]:
def get_weights(model, threshold):
    weights = None
    for name, param in model.named_parameters():
        if name == 'fc1.weight':
            # 计算每个输入特征对应的权重的绝对值之和
            # 这里使用.sum(dim=0)是因为我们想要沿着神经元的维度（第0维）进行求和，以得到每个特征的总影响
            weights = param.data.abs().sum(dim=0).numpy()
            print("Feature weights:\n", weights)
            
            selected_idx = (weights >= threshold).nonzero()[0]
            print(f"\nIdx of features whose weight >= {threshold}: \n", selected_idx)
            return selected_idx

In [62]:
# Loss
criterion = nn.BCELoss()
# Input dim
input_dim = train_data[0][0].shape[0]
# Neurons
neurons = 4
# Network
model1 = Net(neurons, [i for i in range(input_dim)])

def train(model, selected_idx = [i for i in range(input_dim)]):
    # your optimizer
    optimizer = torch.optim.Adam(
            params=model.parameters(), lr=lr)

    model.train()
    early_stopper = EarlyStopper(num_trials, save_path)

    # Training
    for epoch in range(epochs):
        for field, label in train_loader:
            field, label = field.float(), label.float()
            prediction = model(field)
            loss = criterion(prediction,label)
            model.zero_grad()
            loss.backward()
            optimizer.step()

        # calculate validation
        val_loss = cal_loss(model, val_loader)
        print("{} Validation Result: {}".format(epoch, val_loss))
        if not early_stopper.is_continuable(model, loss):
            print("Early stop due to no further progress!")
            break
    model.eval()

train(model1)

0 Validation Result: 0.5883728861808777
1 Validation Result: 0.5156374573707581
2 Validation Result: 0.45860040187835693
3 Validation Result: 0.41279885172843933
4 Validation Result: 0.3745429217815399
5 Validation Result: 0.34145355224609375
6 Validation Result: 0.3117210268974304
7 Validation Result: 0.28552597761154175
8 Validation Result: 0.26375657320022583
9 Validation Result: 0.24708686769008636
10 Validation Result: 0.23517261445522308
11 Validation Result: 0.2268182933330536
12 Validation Result: 0.2210795283317566
13 Validation Result: 0.21704356372356415
14 Validation Result: 0.21391940116882324
15 Validation Result: 0.21162909269332886
16 Validation Result: 0.20984815061092377
17 Validation Result: 0.20816455781459808
18 Validation Result: 0.2066025733947754
19 Validation Result: 0.2053178995847702
20 Validation Result: 0.2040400356054306
21 Validation Result: 0.20271989703178406
22 Validation Result: 0.20189280807971954
23 Validation Result: 0.20077143609523773
24 Validati

196 Validation Result: 0.1419270634651184
197 Validation Result: 0.14241449534893036
198 Validation Result: 0.14267860352993011
199 Validation Result: 0.1418779492378235
200 Validation Result: 0.14228446781635284
201 Validation Result: 0.14259833097457886
202 Validation Result: 0.14208294451236725
203 Validation Result: 0.1425117552280426
204 Validation Result: 0.14270010590553284
205 Validation Result: 0.14252346754074097
206 Validation Result: 0.14228621125221252
207 Validation Result: 0.14186996221542358
208 Validation Result: 0.1423337757587433
209 Validation Result: 0.14243237674236298
210 Validation Result: 0.14202982187271118
211 Validation Result: 0.1418735831975937
212 Validation Result: 0.14196471869945526
213 Validation Result: 0.14206120371818542
214 Validation Result: 0.1419169306755066
215 Validation Result: 0.14167235791683197
216 Validation Result: 0.14202794432640076
217 Validation Result: 0.1414388120174408
218 Validation Result: 0.1417425125837326
219 Validation Resu

390 Validation Result: 0.12900106608867645
391 Validation Result: 0.12913714349269867
392 Validation Result: 0.12977546453475952
393 Validation Result: 0.12898963689804077
394 Validation Result: 0.12976732850074768
395 Validation Result: 0.12938860058784485
396 Validation Result: 0.12960754334926605
397 Validation Result: 0.12955354154109955
398 Validation Result: 0.12993180751800537
399 Validation Result: 0.12969954311847687
400 Validation Result: 0.13023421168327332
401 Validation Result: 0.12907999753952026
402 Validation Result: 0.1294180303812027
403 Validation Result: 0.12989385426044464
404 Validation Result: 0.12960302829742432
405 Validation Result: 0.12969009578227997
406 Validation Result: 0.13019293546676636
407 Validation Result: 0.12976385653018951
408 Validation Result: 0.1288929432630539
409 Validation Result: 0.12997809052467346
410 Validation Result: 0.1303202211856842
411 Validation Result: 0.1297144889831543
412 Validation Result: 0.1292973905801773
413 Validation R

In [65]:
selected_idx = get_weights(model1, 0.1)

Feature weights:
 [0.1006928  0.31472355 0.12105007 3.2545652  0.20122623 0.6515373
 0.62022686 0.5897912  0.30635318 0.1980295  0.4654576  0.18892281
 0.27937996 0.19411154 0.1724506  0.41593102 0.64675814 0.4848985
 0.23096222 0.19342428 0.66227347 0.5205168  0.6259389  0.3883751
 0.31856483 4.307088   0.476126   4.7069044  0.3944509  0.4062057
 0.18172903 0.11615664 3.6416502  0.34141952 0.2596449  0.8724227
 0.37165388 0.94406456 6.4143295  3.0019119 ]

Idx of features whose weight >= 0.1: 
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]


# Training

In [66]:
# Neurons
neurons = 4
# Network
model2 = Net(neurons, selected_idx)

train(model2, selected_idx)

# predicting on the test set
test_predict = predict(test_loader, model2)>=0.5
# print("Test predictions: ", ([int(x) for x in test_predict]))

0 Validation Result: 0.7144222855567932
1 Validation Result: 0.6414440870285034
2 Validation Result: 0.5780310034751892
3 Validation Result: 0.5222985744476318
4 Validation Result: 0.47319257259368896
5 Validation Result: 0.43126583099365234
6 Validation Result: 0.39608320593833923
7 Validation Result: 0.3668157756328583
8 Validation Result: 0.3422509729862213
9 Validation Result: 0.32136011123657227
10 Validation Result: 0.3031073808670044
11 Validation Result: 0.28729140758514404
12 Validation Result: 0.2740950286388397
13 Validation Result: 0.26291894912719727
14 Validation Result: 0.25341328978538513
15 Validation Result: 0.24566954374313354
16 Validation Result: 0.23914721608161926
17 Validation Result: 0.23385758697986603
18 Validation Result: 0.22941462695598602
19 Validation Result: 0.22560027241706848
20 Validation Result: 0.2225039005279541
21 Validation Result: 0.21961909532546997
22 Validation Result: 0.21736130118370056
23 Validation Result: 0.21481579542160034
24 Validati

195 Validation Result: 0.12491379678249359
196 Validation Result: 0.12498490512371063
197 Validation Result: 0.12536244094371796
198 Validation Result: 0.1254381388425827
199 Validation Result: 0.12551385164260864
200 Validation Result: 0.1248956099152565
201 Validation Result: 0.1250619739294052
202 Validation Result: 0.12614355981349945
203 Validation Result: 0.1254296898841858
204 Validation Result: 0.12544000148773193
205 Validation Result: 0.12460465729236603
206 Validation Result: 0.1252616047859192
207 Validation Result: 0.1251908391714096
208 Validation Result: 0.12587857246398926
209 Validation Result: 0.12540645897388458
210 Validation Result: 0.12559328973293304
211 Validation Result: 0.12605321407318115
212 Validation Result: 0.12502118945121765
213 Validation Result: 0.1250912845134735
214 Validation Result: 0.12569168210029602
215 Validation Result: 0.12579183280467987
216 Validation Result: 0.12488431483507156
217 Validation Result: 0.12587451934814453
218 Validation Res

387 Validation Result: 0.12651820480823517
388 Validation Result: 0.1258896440267563
389 Validation Result: 0.12636421620845795
390 Validation Result: 0.12598684430122375
391 Validation Result: 0.12596198916435242
392 Validation Result: 0.12563563883304596
393 Validation Result: 0.12659752368927002
394 Validation Result: 0.1257440596818924
395 Validation Result: 0.12635479867458344
396 Validation Result: 0.12671567499637604
397 Validation Result: 0.12564291059970856
398 Validation Result: 0.12621982395648956
399 Validation Result: 0.12602637708187103
400 Validation Result: 0.1258179396390915
401 Validation Result: 0.12605473399162292
402 Validation Result: 0.12632235884666443
403 Validation Result: 0.12656986713409424
404 Validation Result: 0.12604068219661713
405 Validation Result: 0.12521164119243622
406 Validation Result: 0.1262308657169342
407 Validation Result: 0.12572379410266876
408 Validation Result: 0.1266317516565323
409 Validation Result: 0.12601076066493988
410 Validation R

In [67]:
def save_pred(preds, file):
    print('Saving results to {}'.format(file))
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])
save_pred([int(x) for x in test_predict], 'prediction.csv')         # save prediction file to pred.csv

Saving results to prediction.csv


# Hints:

Utilize a New, Powerful Optimizer 

Improve Model Structure

Employ Proper Hyper-Paremeter

Feature Selection

# Rules:

Ensemble models are not allowed.

You may use NumPy or Torch to implement other models such as SVM, but importing other packages is prohibited.
