In [1]:
import json
import random

combined_data = {}
uncategorized_data = json.load(open('../../scraping/combined_collegeresults_data.json', 'r'))
standardized_output = json.load(open('../../categorization/standardized_output.json', 'r'))

for i, row in enumerate(open('../../categorization/categorized.json', 'r').readlines()):
    numerical_data = json.loads(row)
    
    try:
        post_id = list(uncategorized_data.keys())[i]
        other_data = standardized_output[post_id]
    except KeyError:
        other_data = {
            'major': random.randint(0,11),
            'residence': ''
        }
    try:
        selected_data = [
            numerical_data['basic_info']['ethnicity'],
            numerical_data['basic_info']['gender'],
            numerical_data['basic_info']['income_bracket'],
            numerical_data['basic_info']['gpa'],
            numerical_data['basic_info']['ap_ib_courses'],
            numerical_data['basic_info']['ap_ib_scores'],
            numerical_data['basic_info']['test_score'],
            numerical_data['basic_info']['location'],
            numerical_data['basic_info']['legacy'],
            numerical_data['basic_info']['first_gen'],
            numerical_data['basic_info']['languages'],
            numerical_data['basic_info']['special_talents'],
            numerical_data['basic_info']['hooks'],
        ] + list(numerical_data['ecs'].values()) + list(numerical_data['awards'].values())
        results = other_data['results']
        combined_data.update({post_id:{
            'major':other_data['major'],
            'residence':other_data['residence'],
            'ecs':other_data['extracurriculars'],
            'awards':other_data['awards'],
            'numeric':selected_data,
            'results':results
        }})
    except KeyError:
        continue

In [2]:
len(combined_data)

2981

In [3]:
import torch
from numerical_regressor import NumericResultRegressor, NumericCollegeResultsDataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
colleges_list = open('../../categorization/all-colleges.txt').readlines()
colleges_list = [college[:college.index(' (')] for college in colleges_list]

dataset =  NumericCollegeResultsDataset(combined_data, colleges_list)

Loaded 1dxphiy
Loaded rtpx67
Loaded rum7il
Loaded rutvbd
Loaded rv22b0
Loaded rvpj6b
Loaded rwauee
Loaded rx0kcu
Loaded rx8bh8
Loaded rxtl0z
Loaded rxz5l8
Loaded ry18c0
Loaded ryefjn
Loaded rzkaqv
Loaded s0r0sz
Loaded s1nd65
Loaded s1ug1a
Loaded s20rmr
Loaded s2ippg
Loaded s2wj6v
Loaded s32u9m
Loaded s3s9jg
Loaded s41kh6
Loaded s47sbx
Loaded s486ai
Loaded s54dlv
Loaded s5i6ow
Loaded s5rqfo
Loaded s6gnjl
Loaded s6jc14
Loaded s6umba
Loaded s7uugn
Loaded s86h8p
Loaded s92qsm
Loaded sb825d
Loaded sck0xo
Loaded sco7eb
Loaded sef93c
Loaded seixyr
Loaded sf4a5l
Loaded sf5ri6
Loaded sf5vnb
Loaded sf6hf1
Loaded sf6lwv
Loaded sf7exq
Loaded sf7jy5
Loaded sfp7z9
Loaded sfpjx1
Loaded sftcuh
Loaded sfvdl0
Loaded sfvefq
Loaded sfvipc
Loaded sfyu7t
Loaded sg2oio
Loaded sg3aaf
Loaded sg3isi
Loaded sg6fna
Loaded sg70ja
Loaded sgdqic
Loaded sgfh0w
Loaded sgfxli
Loaded sgnn4t
Loaded sgso79
Loaded sgtdoq
Loaded sh0zr7
Loaded shbuqj
Loaded shedl8
Loaded shhm8z
Loaded shi545
Loaded shijtm
Loaded shlw76
Loade

In [5]:
full_data_size = len(dataset)
train_size = int(full_data_size * 0.8)
print(f"Train Data Size: {train_size}")

batch_size = 32
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, full_data_size - train_size])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

Train Data Size: 17601


In [6]:
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
device

device(type='mps')

In [7]:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, n_epochs, device):
    best_val_acc = 0
    for epoch in range(n_epochs):
        for param_group in optimizer.param_groups:
            print(f"Current learning rate: {param_group['lr']}")

        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()

            for k, v in batch.items():
                batch[k] = v.to(device)

            outputs = model(batch)
            loss = criterion(outputs, batch['target'])

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()
            train_total += batch['target'].size(0)
            train_correct += ((outputs > 0.5) == batch['target']).sum().item()

        train_loss /= len(train_loader)
        train_acc = train_correct / train_total

        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch in val_loader:
                for k, v in batch.items():
                    batch[k] = v.to(device)

                outputs = model(batch)
                loss = criterion(outputs, batch['target'])

                val_loss += loss.item()
                val_total += batch['target'].size(0)
                val_correct += ((outputs > 0.5) == batch['target']).sum().item()

            print(outputs)

        val_loss /= len(val_loader)
        val_acc = val_correct / val_total

        scheduler.step(val_loss)

        print(f'Epoch {epoch+1}/{n_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'tokenize_numerical.pt')

    return model

In [8]:
model = NumericResultRegressor().to(device)

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5, min_lr=1e-6)

n_epochs = 200
model = train_model(model, train_loader, test_loader, criterion, optimizer, scheduler, n_epochs, device)

Current learning rate: 0.0005
tensor([0.5147, 0.5166, 0.4972, 0.4788, 0.5095, 0.4509, 0.5063, 0.4950, 0.5060,
        0.5459, 0.4819, 0.4958, 0.5137, 0.5244, 0.5407, 0.5184, 0.5429],
       device='mps:0')
Epoch 1/200:
Train Loss: 0.6670, Train Acc: 0.4442
Val Loss: 0.6660, Val Acc: 0.5862
Current learning rate: 0.0005
tensor([0.6440, 0.6424, 0.6659, 0.7010, 0.2324, 0.1981, 0.6294, 0.4688, 0.4365,
        0.4262, 0.4403, 0.5606, 0.2958, 0.3335, 0.5985, 0.1683, 0.5896],
       device='mps:0')
Epoch 2/200:
Train Loss: 0.6622, Train Acc: 0.4679
Val Loss: 0.6500, Val Acc: 0.6071
Current learning rate: 0.0005
tensor([0.2190, 0.7760, 0.2956, 0.6808, 0.7411, 0.4575, 0.7925, 0.2275, 0.2890,
        0.5345, 0.2583, 0.2472, 0.4097, 0.1885, 0.2390, 0.3665, 0.3642],
       device='mps:0')
Epoch 3/200:
Train Loss: 0.6569, Train Acc: 0.4834
Val Loss: 0.6519, Val Acc: 0.5306
Current learning rate: 0.0005
tensor([0.0835, 0.8325, 0.7903, 0.8513, 0.0839, 0.1003, 0.7580, 0.7087, 0.8222,
        0.4019, 0

KeyboardInterrupt: 