In [3]:
import numpy as np
import csv
import torch
import torch.nn as nn
import random
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import argparse

In [None]:
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=300)
parser.add_argument('--train_epoch', type=int, default=5)
parser.add_argument('--lr', type=float, default=1e-3, help='learning rate hyperparameter')
parser.add_argument('--verbose', type=int, default=10)
parser.add_argument('--data_path', type=str, default='D:/researches/codes/1.csv')
args = parser.parse_known_args()[0]

In [7]:
def to_float(input):
    if input == '':
        return -1
    else:
        return float(input)

def formator(line):
    result = []
    result.append(to_float(line['satv_use']))
    result.append(to_float(line['satm_use']))
    result.append(to_float(line['satcomp_use']))
    result.append(to_float(line['act_eng']))
    result.append(to_float(line['act_math']))
    result.append(to_float(line['act_read']))
    result.append(to_float(line['act_sci']))
    result.append(to_float(line['act_comp']))
    result.append(to_float(line['nc1']))
    result.append(to_float(line['nc2']))
    result.append(to_float(line['nc3']))
    result.append(to_float(line['ncav']))
    result.append(to_float(line['hs_gpa']))   
    result.append(to_float(line['college_gpa']))
    label = [0,0,0,0]
    if line['vt_adm_dec'][:2] == 'Ad':
        label[0] = 1
    elif line['vt_adm_dec'][:2] == 'De':
        label[1] = 1
    elif line['vt_adm_dec'][:2] == 'Wa':
        label[2] = 1
    else:
        label[3] = 1
    return result, label

In [37]:
with open("df2019.csv", 'r', newline='', encoding='utf-8') as file:
    data = []
    label = []
    reader = csv.DictReader(file)
    for row in reader:
        features, decision = formator(row)
        data.append(features)
        label.append(decision)
    data = np.array(data)
    data = (data - np.min(data,axis=0))/(np.max(data,axis=0) - np.min(data,axis=0) + 1e-6)
    label = np.array(label)

In [38]:
idx = list(range(data.shape[0]))
idx = np.linspace(0,data.shape[0]-1,data.shape[0],dtype=int)
random.shuffle(idx)
train_size = int(data.shape[0]*0.8)
train_idx = idx[:train_size]
test_idx = idx[train_size:]
train_data = data[train_idx]
test_data = data[test_idx]
train_label = label[train_idx]
test_label = label[test_idx]

In [39]:
class Admission_Dataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label
    def __getitem__(self, index):
        return self.data[index], self.label[index]
    def __len__(self):
        return self.data.shape[0]

train_dataset = Admission_Dataset(train_data,train_label)
test_dataset = Admission_Dataset(test_data,test_label)
train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=False)

In [40]:
def test(model):
    correct_count = 0
    for batch in test_loader:
        data, label = batch
        preds = model(data.float())
        preds = torch.argmax(preds, dim=1)
        label = torch.argmax(label, dim=1)
        correct_count += torch.sum(preds==label)
    return correct_count/test_data.shape[0]


In [41]:
def train(model):
    optimizer = torch.optim.Adam(model.parameters(), args.lr)
    criterion = nn.CrossEntropyLoss()

    for i in tqdm(range(args.train_epoch)):
        for j, batch in enumerate(train_loader):
            data, label = batch
            preds = model(data.float())
            optimizer.zero_grad()
            loss = criterion(preds, label.float())
            loss.backward()
            optimizer.step()
            if j%args.verbose == 0:
                print('train epoch {} epoch {} loss: {}'.format(i,j,loss.item()))
        print('train epoch {} classification accuracy: {}:'.format(i,test(model)))

def save_rep(model):
    model.save_rep(torch.tensor(data).float())
    np.save('label.npy', label)

In [42]:
class MLP(torch.nn.Module):
    def __init__(self, latent_dim = 64, input_length=14, class_num=4):
        super().__init__()
        self.lin1 = nn.Linear(input_length,latent_dim)
        self.lin2 = nn.Linear(latent_dim,latent_dim)
        self.lin3 = nn.Linear(latent_dim,class_num)

    def forward(self,x):
        x = self.lin1(x)
        x = torch.nn.GELU()(x)
        x = self.lin2(x)
        x = torch.nn.GELU()(x)
        x = self.lin3(x)
        return x
    
    def save_rep(self, x):
        np.save('original_rep.npy', x.detach().numpy())
        x = self.lin1(x)
        np.save('layer1_rep.npy', x.detach().numpy())
        x = torch.nn.GELU()(x)
        x = self.lin2(x)
        np.save('layer2_rep.npy', x.detach().numpy())

    

In [43]:
model = MLP()
train(model)
save_rep(model)

  0%|          | 0/5 [00:00<?, ?it/s]

train epoch 0 epoch 0 loss: 1.3423652648925781
train epoch 0 epoch 10 loss: 1.2181541919708252
train epoch 0 epoch 20 loss: 1.0684549808502197
train epoch 0 epoch 30 loss: 0.9459431171417236
train epoch 0 epoch 40 loss: 1.0208693742752075
train epoch 0 epoch 50 loss: 0.8968073725700378
train epoch 0 epoch 60 loss: 0.9104130864143372
train epoch 0 epoch 70 loss: 0.9260752201080322
train epoch 0 epoch 80 loss: 0.8536872267723083


 20%|██        | 1/5 [00:00<00:01,  3.73it/s]

train epoch 0 epoch 90 loss: 0.9041756391525269
train epoch 0 classification accuracy: 0.6908879280090332:
train epoch 1 epoch 0 loss: 0.9103702902793884
train epoch 1 epoch 10 loss: 0.8606123924255371
train epoch 1 epoch 20 loss: 0.8015207052230835
train epoch 1 epoch 30 loss: 0.8587706685066223
train epoch 1 epoch 40 loss: 0.9454024434089661
train epoch 1 epoch 50 loss: 0.8378568291664124
train epoch 1 epoch 60 loss: 0.8629657030105591


 40%|████      | 2/5 [00:00<00:00,  3.76it/s]

train epoch 1 epoch 70 loss: 0.8810014128684998
train epoch 1 epoch 80 loss: 0.8125078678131104
train epoch 1 epoch 90 loss: 0.8638734817504883
train epoch 1 classification accuracy: 0.68885338306427:
train epoch 2 epoch 0 loss: 0.8336718082427979
train epoch 2 epoch 10 loss: 0.779705822467804
train epoch 2 epoch 20 loss: 0.7451146245002747
train epoch 2 epoch 30 loss: 0.7965748310089111
train epoch 2 epoch 40 loss: 0.871676504611969


 60%|██████    | 3/5 [00:00<00:00,  3.77it/s]

train epoch 2 epoch 50 loss: 0.7683835625648499
train epoch 2 epoch 60 loss: 0.8257764577865601
train epoch 2 epoch 70 loss: 0.8513416051864624
train epoch 2 epoch 80 loss: 0.7946775555610657
train epoch 2 epoch 90 loss: 0.8348208069801331
train epoch 2 classification accuracy: 0.7035314440727234:
train epoch 3 epoch 0 loss: 0.7990593314170837
train epoch 3 epoch 10 loss: 0.7425222992897034
train epoch 3 epoch 20 loss: 0.729369580745697


 80%|████████  | 4/5 [00:01<00:00,  3.77it/s]

train epoch 3 epoch 30 loss: 0.7732084393501282
train epoch 3 epoch 40 loss: 0.8468686938285828
train epoch 3 epoch 50 loss: 0.7529251575469971
train epoch 3 epoch 60 loss: 0.8167449831962585
train epoch 3 epoch 70 loss: 0.8371982574462891
train epoch 3 epoch 80 loss: 0.776539146900177
train epoch 3 epoch 90 loss: 0.8226838111877441
train epoch 3 classification accuracy: 0.7073099613189697:
train epoch 4 epoch 0 loss: 0.7863752245903015
train epoch 4 epoch 10 loss: 0.7287291288375854
train epoch 4 epoch 20 loss: 0.7183182239532471
train epoch 4 epoch 30 loss: 0.7602121829986572
train epoch 4 epoch 40 loss: 0.8301041722297668
train epoch 4 epoch 50 loss: 0.7440239191055298
train epoch 4 epoch 60 loss: 0.8117803335189819
train epoch 4 epoch 70 loss: 0.8269037008285522
train epoch 4 epoch 80 loss: 0.7614269256591797
train epoch 4 epoch 90 loss: 0.8121156692504883


100%|██████████| 5/5 [00:01<00:00,  3.77it/s]

train epoch 4 classification accuracy: 0.7084726095199585:



