In [1]:
import difflib
import json

import nltk
import torch
from torch.utils.data import DataLoader

# Fetch the NLTK English stop-word corpus and keep it as a set for fast lookups.
nltk.download('stopwords')
nltk_stopwords = set(nltk.corpus.stopwords.words('english'))

# Canonical college names: keep the part of each line that precedes the first " (".
with open('../categorization/all-colleges.txt') as colleges_file:
    colleges_list = []
    for line in colleges_file:
        name, separator, _ = line.partition(' (')
        if separator:
            colleges_list.append(name)
        elif line.strip():
            # No " (" suffix on this line: keep the whole (stripped) line instead of
            # raising ValueError; blank lines are skipped.
            colleges_list.append(line.strip())

# Load the cached, name-standardized posts; rebuild the cache from the raw output
# when it is missing or unreadable.
try:
    with open('../categorization/standardized_output.json', 'r') as cached_file:
        data = json.load(cached_file)
except (OSError, ValueError):
    # OSError covers a missing/unreadable file; ValueError covers malformed JSON
    # (json.JSONDecodeError) and text-decoding failures.
    with open('../categorization/output_2.json') as raw_file:
        data = json.load(raw_file)

    for post in data.values():
        standardized_results = []
        for college in post['results']:
            # Fuzzy-match the reported school name against the canonical list.
            closest_name = difflib.get_close_matches(college['school_name'], colleges_list, n=1, cutoff=0.8)
            if closest_name:
                college['school_name'] = closest_name[0]
                standardized_results.append(college)
            # Results without a sufficiently close canonical name are dropped.
        post['results'] = standardized_results

    with open('../categorization/standardized_output.json', 'w') as cached_file:
        json.dump(data, cached_file)

# Keep only posts whose id compares >= MIN_POST_ID as a string.
# NOTE(review): this is a lexicographic comparison, so it only agrees with numeric
# ordering when ids have equal length -- presumably all ids here do; TODO confirm.
MIN_POST_ID = '189wc0k'
shortened_data = {post_id: post for post_id, post in data.items() if post_id >= MIN_POST_ID}

data = shortened_data

# 80/20 train/test split in the dict's iteration order (no shuffling).
split_index = int(0.8 * len(data))

posts = list(data.items())
train_data = dict(posts[:split_index])
test_data = dict(posts[split_index:])

# Total number of per-college results across the training posts.
total_count = sum(len(post['results']) for post in train_data.values())

# Prefer CUDA, then Apple MPS, then fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

from regressor import CollegeDataset, ResultRegressor

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yourui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def train_model(model, train_loader, val_loader, criterion, optimizer, n_epochs, device,
                save_path='../categorization/2023_24_regression.pth'):
    """Train ``model`` for ``n_epochs``, validating and checkpointing after each epoch.

    Args:
        model: Module called as ``model(batch)`` with the whole batch mapping.
        train_loader: Sized iterable of batches; each batch is a mapping whose values
            support ``.to(device)`` and which contains a ``'target'`` entry.
        val_loader: Same structure as ``train_loader``; iterated under ``torch.no_grad()``.
        criterion: Loss called as ``criterion(outputs, batch['target'])``.
        optimizer: Optimizer stepped once per training batch.
        n_epochs: Number of passes over ``train_loader``.
        device: Device every batch value is moved to.
        save_path: Where ``model.state_dict()`` is written after every epoch
            (overwriting the previous file). Pass ``None`` to skip checkpointing.

    Returns:
        The same ``model`` object, after training.

    Notes:
        Accuracy counts ``(outputs > 0.5) == target`` -- this assumes outputs are
        probabilities and targets are 0/1; TODO confirm against the dataset.
        Reported losses are the mean of per-batch losses. Metrics are NaN when a
        loader yields no batches instead of raising ZeroDivisionError.
    """
    for epoch in range(n_epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for step, batch in enumerate(train_loader, start=1):
            optimizer.zero_grad()

            batch = {key: value.to(device) for key, value in batch.items()}

            outputs = model(batch)
            loss = criterion(outputs, batch['target'])
            loss.backward()
            optimizer.step()

            # Compute the per-batch statistics once and reuse them for both the
            # running totals and the progress line.
            batch_loss = loss.item()
            batch_size = batch['target'].size(0)
            batch_correct = ((outputs > 0.5) == batch['target']).sum().item()

            train_loss += batch_loss
            train_total += batch_size
            train_correct += batch_correct

            print(f"{step}, loss: {round(batch_loss,3)}, accuracy: {round(_safe_ratio(1.0*batch_correct, batch_size),3)}")

        train_loss = _safe_ratio(train_loss, len(train_loader))
        train_acc = _safe_ratio(train_correct, train_total)

        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch in val_loader:
                batch = {key: value.to(device) for key, value in batch.items()}

                outputs = model(batch)
                loss = criterion(outputs, batch['target'])

                val_loss += loss.item()
                val_total += batch['target'].size(0)
                val_correct += ((outputs > 0.5) == batch['target']).sum().item()

        val_loss = _safe_ratio(val_loss, len(val_loader))
        val_acc = _safe_ratio(val_correct, val_total)

        print(f'Epoch {epoch+1}/{n_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        # Checkpoint after every epoch; the file always holds the latest weights.
        if save_path is not None:
            torch.save(model.state_dict(), save_path)

    return model

In [3]:
# Wrap the train/test post dictionaries in datasets.
train_dataset = CollegeDataset(train_data, colleges_list, nltk_stopwords)
test_dataset = CollegeDataset(test_data, colleges_list, nltk_stopwords)

# Both loaders share batch size and worker count; only the training loader shuffles.
batch_size = 32
loader_options = dict(batch_size=batch_size, num_workers=2)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(dataset=test_dataset, **loader_options)

# Model on the selected device, binary cross-entropy loss, Adam optimizer.
model = ResultRegressor().to(device)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.005)

# Run training; the test loader is passed as the validation loader.
n_epochs = 10
model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    n_epochs=n_epochs,
    device=device,
)

Some weights of DistilBertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized because the shapes did not match:
- distilbert.embeddings.position_embeddings.weight: found shape torch.Size([512, 768]) in the checkpoint and torch.Size([20, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized because the shapes did not match:
- distilbert.embeddings.position_embeddings.weight: found shape torch.Size([512, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Write the current in-memory weights of `model` to the checkpoint file.
final_weights = model.state_dict()
torch.save(final_weights, '../categorization/2023_24_regression.pth')