In [1]:
import json

# Load the three inputs with context managers so the file handles are closed
# as soon as their contents have been read.
with open('../../scraping/combined_collegeresults_data.json', 'r') as scraped_file:
    uncategorized_data = json.load(scraped_file)
with open('../../categorization/standardized_output.json', 'r') as standardized_file:
    standardized_output = json.load(standardized_file)
with open('../../categorization/categorized.json', 'r') as categorized_file:
    # One JSON document per line.
    categorized_rows = categorized_file.readlines()

# Keys read from each row's 'basic_info' mapping, in the order they are placed
# at the front of the 'numerical' feature list.
BASIC_INFO_FIELDS = (
    'ethnicity',
    'gender',
    'income_bracket',
    'gpa',
    'ap_ib_courses',
    'ap_ib_scores',
    'test_score',
    'location',
    'legacy',
    'first_gen',
    'languages',
    'special_talents',
    'hooks',
)

# Row i of categorized.json is paired with the i-th key of the scraped data.
# NOTE(review): this positional pairing assumes categorized.json was written in
# the same order as the keys of combined_collegeresults_data.json -- confirm
# against the categorization step.
# The key list is built once here instead of once per row.
post_ids = list(uncategorized_data)

combined_data = {}
skipped_post_ids = []  # posts dropped because an expected key was missing

for i, row in enumerate(categorized_rows):
    numerical_data = json.loads(row)
    post_id = post_ids[i]
    try:
        basic_info = numerical_data['basic_info']
        selected_data = (
            [basic_info[field] for field in BASIC_INFO_FIELDS]
            + list(numerical_data['ecs'].values())
            + list(numerical_data['awards'].values())
        )
        standardized = standardized_output[post_id]
        results = standardized['results']
        combined_data[post_id] = {
            'major': standardized['major'],
            'residence': standardized['residence'],
            'extracurriculars': standardized['extracurriculars'],
            'awards': standardized['awards'],
            'numerical': selected_data,
            'results': results,
        }
    except KeyError:
        # Best-effort merge: incomplete posts are still skipped, but they are
        # recorded so the amount of dropped data is visible.
        skipped_post_ids.append(post_id)
        continue

print(f"Combined {len(combined_data)} posts; skipped {len(skipped_post_ids)} with missing fields")

In [2]:
import torch
import nltk
from combined_regressor import CombinedResultRegressor, CombinedCollegeResultsDataset

# Fetch the NLTK 'stopwords' package, then collect the English stopword list
# into a set.
nltk.download('stopwords')
nltk_stopwords = {*nltk.corpus.stopwords.words('english')}

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yourui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Read the college names, keeping only the text before the first ' (' on each
# line. Blank lines are skipped, and a line without ' (' is kept whole (minus
# its line ending) instead of raising ValueError.
# NOTE(review): what the parenthesised suffix contains is not visible here --
# confirm against all-colleges.txt.
with open('../../categorization/all-colleges.txt') as colleges_file:
    colleges_list = [
        line.partition(' (')[0].rstrip('\r\n')
        for line in colleges_file
        if line.strip()
    ]

dataset = CombinedCollegeResultsDataset(combined_data, colleges_list, nltk_stopwords)

In [None]:
# Tunable split settings.
TRAIN_FRACTION = 0.8  # share of the dataset used for training
SPLIT_SEED = 42       # fixes the random train/test partition across runs

full_data_size = len(dataset)
train_size = int(full_data_size * TRAIN_FRACTION)
print(f"Train Data Size: {train_size}")

batch_size = 128

# A seeded generator makes the split reproducible after a kernel restart, so
# reported metrics always refer to the same held-out samples.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, full_data_size - train_size],
    generator=split_generator,
)

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=2
)
# Evaluation data is not shuffled: order has no effect on learning, and a fixed
# batch composition keeps the per-batch-averaged loss comparable between epochs.
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=2
)

In [None]:
# Pick the compute device by preference: CUDA first, then Apple MPS, and the
# CPU when neither backend reports itself as available.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
device

In [None]:
def _move_batch_to_device(batch, device):
    """Return a new dict with every value of ``batch`` moved to ``device``."""
    return {key: value.to(device) for key, value in batch.items()}


def _count_correct(outputs, targets, criterion):
    """Count the elements of ``outputs`` whose thresholded value equals ``targets``.

    ``BCEWithLogitsLoss`` interprets the model outputs as logits, so the
    decision boundary is a logit of 0 (a probability of 0.5). For any other
    criterion the outputs are thresholded at 0.5 directly.
    """
    threshold = 0.0 if isinstance(criterion, torch.nn.BCEWithLogitsLoss) else 0.5
    return ((outputs > threshold) == targets).sum().item()


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean batch loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch, with gradients clipped to a norm of 1.0. Otherwise the
    model is put in eval mode and gradient tracking is disabled.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_elements = 0
    n_batches = 0

    with torch.set_grad_enabled(training):
        for batch in loader:
            batch = _move_batch_to_device(batch, device)
            if training:
                optimizer.zero_grad()

            outputs = model(batch)
            loss = criterion(outputs, batch['target'])

            if training:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            loss_sum += loss.item()
            n_correct += _count_correct(outputs, batch['target'], criterion)
            # Correct predictions are counted per element, so the denominator
            # must also be per element; counting rows only would let the
            # accuracy exceed 1 for multi-dimensional targets.
            n_elements += batch['target'].numel()
            n_batches += 1

    if n_batches == 0:
        raise ValueError("data loader produced no batches")

    return loss_sum / n_batches, n_correct / n_elements


def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, n_epochs, device,
                checkpoint_path='tokenize_numerical.pt'):
    """Train ``model`` for ``n_epochs`` epochs and checkpoint the best weights.

    Each epoch prints the current learning rate(s), trains on ``train_loader``,
    evaluates on ``val_loader``, calls ``scheduler.step`` with the validation
    loss, and prints the loss and accuracy of both passes. Whenever the
    validation accuracy exceeds the best seen so far, the model's
    ``state_dict`` is saved to ``checkpoint_path``.

    Args:
        model: Module called as ``model(batch)`` on a dict batch.
        train_loader: Iterable of dict batches; each must contain a ``'target'``
            entry and every value must support ``.to(device)``.
        val_loader: Iterable of dict batches used for evaluation.
        criterion: Loss called as ``criterion(outputs, batch['target'])``.
        optimizer: Optimizer over the model's parameters.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        n_epochs: Number of epochs to run.
        device: Device every batch is moved to.
        checkpoint_path: Destination for the best ``state_dict``.

    Returns:
        The same ``model`` object in its state after the final epoch (not
        necessarily the best-scoring weights, which are on disk).

    Raises:
        ValueError: If either loader yields no batches.
    """
    best_val_acc = 0
    for epoch in range(n_epochs):
        for param_group in optimizer.param_groups:
            print(f"Current learning rate: {param_group['lr']}")

        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        scheduler.step(val_loss)

        print(f'Epoch {epoch+1}/{n_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)

    return model

In [None]:
# Instantiate the network and move it to the selected device.
model = CombinedResultRegressor()
model = model.to(device)

# Loss, optimizer and plateau-based learning-rate schedule.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0005,
    weight_decay=1e-5,
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    factor=0.5,
    min_lr=1e-6,
)

# Train, using the held-out test loader as the validation loader.
n_epochs = 200
model = train_model(
    model,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    scheduler,
    n_epochs,
    device,
)