In [1]:
import difflib
import json

import nltk
import torch

# Make sure the NLTK 'stopwords' corpus is available locally, then collect
# the English stopwords into a set.
nltk.download('stopwords')
nltk_stopwords = {word for word in nltk.corpus.stopwords.words('english')}

# Read the college names from 'all-colleges.txt', one per line. Only the text
# before the first ' (' on each line is kept (the parenthesised suffix is
# dropped).
with open('all-colleges.txt') as colleges_file:  # closed deterministically
    colleges_list = [
        # partition() returns the whole line when ' (' is absent, where the
        # earlier str.index() call raised ValueError.
        line.rstrip('\n').partition(' (')[0]
        for line in colleges_file
        if line.strip()  # ignore blank lines, e.g. a trailing newline at EOF
    ]

# Load the posts whose school names were already standardized; if that cache
# file is missing or not valid JSON, rebuild it from 'output_2.json'.
try:
    with open('standardized_output.json', 'r') as cache_file:
        data = json.load(cache_file)
except (FileNotFoundError, json.JSONDecodeError):
    # Only "no usable cache" triggers the rebuild; any other failure
    # (KeyboardInterrupt, permission errors, ...) propagates instead of being
    # silently swallowed.
    with open('output_2.json') as raw_file:
        data = json.load(raw_file)

    for post in data.values():
        matched_results = []
        for college in post['results']:
            # Snap the free-text school name onto the closest known college
            # name (similarity cutoff 0.8); entries without a match are dropped.
            closest_name = difflib.get_close_matches(
                college['school_name'], colleges_list, n=1, cutoff=0.8
            )
            if closest_name:
                college['school_name'] = closest_name[0]
                matched_results.append(college)
        # Slice assignment keeps the same list object, as the original
        # pop()-based filtering did.
        post['results'][:] = matched_results

    with open('standardized_output.json', 'w') as cache_file:
        json.dump(data, cache_file)

# Keep only the posts whose id is >= '189wc0k'. The ids are strings, so this
# is a lexicographic comparison.
shortened_data = {
    post_id: post
    for post_id, post in data.items()
    if post_id >= '189wc0k'
}

data = shortened_data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yourui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Choose the compute device in order of preference: CUDA, then MPS, then CPU.
# The MPS check is only evaluated when CUDA is unavailable.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
device

device(type='mps')

In [3]:
from regressor import ResultRegressor, CollegeResultsDataset

# Split the posts 80/20 in the order they appear in `data`: the first 80% of
# the posts are used for training, the remainder for evaluation.
posts = list(data.items())  # materialized once and sliced twice
split_index = int(0.8 * len(posts))
train_data = dict(posts[:split_index])
test_data = dict(posts[split_index:])

# Size is counted in individual results, not posts.
train_data_size = sum(len(post['results']) for post in train_data.values())
print(f"Train Data Size: {train_data_size}")

train_dataset = CollegeResultsDataset(train_data, colleges_list, nltk_stopwords)
test_dataset = CollegeResultsDataset(test_data, colleges_list, nltk_stopwords)

batch_size = 10
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
# The evaluation loader is not shuffled: a fixed order keeps the batch
# composition identical across epochs and runs, so metrics that are averaged
# per batch stay comparable.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

  from .autonotebook import tqdm as notebook_tqdm


Train Data Size: 11040


In [4]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Make one pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    When ``optimizer`` is given, each batch also runs backward, gradient-norm
    clipping (max norm 1.0), an optimizer step, and prints the batch loss.
    Without an optimizer the pass only evaluates. Both values are NaN when the
    loader yields nothing, instead of dividing by zero.
    """
    training = optimizer is not None
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    n_batches = 0

    for i, batch in enumerate(loader):
        if training:
            optimizer.zero_grad()

        # Move every tensor of the batch dict onto the target device (in place).
        for key, value in batch.items():
            batch[key] = value.to(device)

        outputs = model(batch)
        targets = batch['target']
        loss = criterion(outputs, targets)

        if training:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            print(f"Batch: {i}, Loss: {round(loss.item(), 2)}")

        loss_sum += loss.item()
        n_seen += targets.size(0)
        # A prediction counts as correct when thresholding at 0.5 equals the target.
        n_correct += ((outputs > 0.5) == targets).sum().item()
        n_batches += 1

    mean_loss = loss_sum / n_batches if n_batches else float('nan')
    accuracy = n_correct / n_seen if n_seen else float('nan')
    return mean_loss, accuracy


def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, n_epochs, device,
                checkpoint_path='2023_24_regression.pth'):
    """Train ``model`` for ``n_epochs`` epochs, validating and checkpointing after each one.

    Args:
        model: module called as ``model(batch)`` with the whole batch dict.
        train_loader: iterable of training batches. Each batch is a dict of
            tensors that includes a ``'target'`` entry.
        val_loader: iterable of validation batches with the same layout.
        criterion: loss called as ``criterion(outputs, batch['target'])``.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per epoch.
        n_epochs: number of epochs to run.
        device: device every batch tensor is moved to.
        checkpoint_path: where ``model.state_dict()`` is saved after every
            epoch; pass ``None`` to skip saving.

    Returns:
        The same ``model`` object, after training.

    Notes:
        Accuracy thresholds the outputs at 0.5, so it presumably expects
        probability-like outputs with the same shape as the target
        (TODO confirm against the model).
    """
    for epoch in range(n_epochs):
        for param_group in optimizer.param_groups:
            print(f"Current learning rate: {param_group['lr']}")

        model.train()
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)

        # Validation: no gradients and no parameter updates.
        model.eval()
        with torch.no_grad():
            val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        scheduler.step()

        print(f'Epoch {epoch+1}/{n_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        # Checkpoint every epoch so an interrupted run keeps the latest weights.
        if checkpoint_path is not None:
            torch.save(model.state_dict(), checkpoint_path)

    return model

In [5]:
# Build a fresh model on the selected device.
model = ResultRegressor().to(device)

# Binary cross-entropy loss, Adam with weight decay, and a StepLR schedule
# whose gamma of 1.0 leaves the learning rate unchanged.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1_000_000, gamma=1.0)

n_epochs = 50
# The held-out test loader is passed as the validation loader.
model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    n_epochs=n_epochs,
    device=device,
)

Current learning rate: 0.0005


  return torch.tensor(self.model(**inputs)['last_hidden_state'][0][0], dtype=torch.float32)
  return torch.tensor(self.model(**inputs)['last_hidden_state'][0][0], dtype=torch.float32)


torch.Size([10, 50]) torch.Size([10, 50]) torch.Size([10, 300]) torch.Size([10, 500])
Batch: 0, Loss: 0.66
torch.Size([10, 50]) torch.Size([10, 50]) torch.Size([10, 300]) torch.Size([10, 500])
Batch: 1, Loss: 0.74
torch.Size([10, 50]) torch.Size([10, 50]) torch.Size([10, 300]) torch.Size([10, 500])
Batch: 2, Loss: 0.64
torch.Size([10, 50]) torch.Size([10, 50]) torch.Size([10, 300]) torch.Size([10, 500])
Batch: 3, Loss: 0.78
torch.Size([10, 50]) torch.Size([10, 50]) torch.Size([10, 300]) torch.Size([10, 500])
Batch: 4, Loss: 0.74
torch.Size([10, 50]) torch.Size([10, 50]) torch.Size([10, 300]) torch.Size([10, 500])
Batch: 5, Loss: 0.8
torch.Size([10, 50]) torch.Size([10, 50]) torch.Size([10, 300]) torch.Size([10, 500])
Batch: 6, Loss: 0.58
torch.Size([10, 50]) torch.Size([10, 50]) torch.Size([10, 300]) torch.Size([10, 500])
Batch: 7, Loss: 0.58
torch.Size([10, 50]) torch.Size([10, 50]) torch.Size([10, 300]) torch.Size([10, 500])
Batch: 8, Loss: 0.78
torch.Size([10, 50]) torch.Size([10, 5

KeyboardInterrupt: 

In [None]:
# Write the model's current weights to the checkpoint file.
torch.save(obj=model.state_dict(), f='2023_24_regression.pth')
