In [1]:
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig
from torch.utils.data import DataLoader
import nltk
import torch
import difflib
import json
import numpy as np
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('stopwords')
nltk_stopwords = set(nltk.corpus.stopwords.words('english'))
nltk_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yourui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [3]:
colleges_list = open('../categorization/all-colleges.txt').readlines()
colleges_list = [college[:college.index(' (')] for college in colleges_list]

try:
    data = json.load(open('../categorization/standardized_output.json', 'r'))
except:
    data = json.load(open('../categorization/output_2.json'))

    for post in data.values():
        drop = []
        for i in range(len(post['results'])):
            college = post['results'][i]
            closest_name = difflib.get_close_matches(college['school_name'], colleges_list, n=1, cutoff=0.8)
            if closest_name:
                college['school_name'] = closest_name[0]
            else:
                drop.append(i)
        for index in reversed(drop):
            post['results'].pop(index)

    json.dump(data, open('../categorization/standardized_output.json', 'w'))

In [4]:
shortened_data = {}

for post_id in data.keys():
    if post_id >= '189wc0k':
        shortened_data.update({post_id: data[post_id]})

data = shortened_data

In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
device

device(type='mps')

In [6]:
class ResultRegressor(torch.nn.Module):
    def __init__(self, stopwords):
        super(ResultRegressor, self).__init__()
        self.stopwords = stopwords
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, padding=True)
        
        self.text1 = DistilBertModel.from_pretrained("distilbert-base-uncased",
                                                     ignore_mismatched_sizes=True,
                                                     config = DistilBertConfig(max_position_embeddings=20, 
                                                                               dropout=0.5))
        self.text2 = DistilBertModel.from_pretrained("distilbert-base-uncased", 
                                                     ignore_mismatched_sizes=True,
                                                     config = DistilBertConfig(max_position_embeddings=10, 
                                                                               dropout=0.5))
        self.text3 = DistilBertModel.from_pretrained("distilbert-base-uncased", 
                                                     ignore_mismatched_sizes=True,
                                                     config = DistilBertConfig(max_position_embeddings=512, 
                                                                               dropout=0.5))
        
        self.pc1 = torch.nn.Linear(768, 128)
        self.pc2 = torch.nn.Linear(768, 128)
        self.pc3 = torch.nn.Linear(768, 128)
        
        self.fc1 = torch.nn.Linear(394, 32)
        self.fc2 = torch.nn.Linear(32, 1)

        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=0)
        self.dropout = torch.nn.Dropout(0.5)

    def ts(self, input):
        return torch.tensor(input, dtype=torch.long).to(device)
    
    def remove_stopwords(self, text):
        filtered_text = [w for w in text.split() if w.lower() not in self.stopwords]
        return " ".join(filtered_text)

    def tokenize(self, input, max_length=512):
        tokenized = self.tokenizer.encode_plus(
            input,
            None,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        return torch.tensor(tokenized['input_ids'], dtype=torch.long), torch.tensor(tokenized['attention_mask'], dtype=torch.long)

    def forward(self, x, results, college_id):
        numerical_inputs = self.ts([int(x['ethnicity']),
                            int(x['gender']),
                            int(x['income_bracket']),
                            int(x['gpa']),
                            int(x['apib_number']),
                            int(x['apib_scores']),
                            int(x['standardized_test_scores']),
                            int(results['in_state']),
                            int(results['round']),
                            college_id])
        
        major_ids, major_masks = self.tokenize(self.remove_stopwords(x['major']), max_length=20)
        residence_ids, residence_masks = self.tokenize(self.remove_stopwords(x['residence']), max_length=10)
        extracurricular_ids, extracurricular_masks =  self.tokenize(self.remove_stopwords('\n'.join(x['extracurriculars'] + x['awards'])))

        major_pooler = self.text1(self.ts(major_ids), self.ts(major_masks))[0][:,0]
        residence_pooler = self.text2(self.ts(residence_ids), self.ts(residence_masks))[0][:,0]
        extraccurricular_pooler = self.text3(self.ts(extracurricular_ids), self.ts(extracurricular_masks))[0][:,0]
        
        numerical_inputs = torch.cat([numerical_inputs,
                                      self.relu(self.pc1(major_pooler))[0],
                                      self.relu(self.pc2(residence_pooler))[0],
                                      self.relu(self.pc3(extraccurricular_pooler))[0]])


        x = self.fc1(numerical_inputs)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return self.softmax(x)[0].type(torch.float32)

In [7]:
model = ResultRegressor(stopwords=nltk_stopwords).to(device)

Some weights of DistilBertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized because the shapes did not match:
- distilbert.embeddings.position_embeddings.weight: found shape torch.Size([512, 768]) in the checkpoint and torch.Size([20, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized because the shapes did not match:
- distilbert.embeddings.position_embeddings.weight: found shape torch.Size([512, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
n_epochs = 10
total_training_start_time = time.time()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [9]:
criterion

BCELoss()

In [10]:
# dataset = []

# for post in data.values():
#     for college in post['results']:
#         inputs = [int(post['ethnicity']),
#                   int(post['gender']),
#                   int(post['income_bracket']),
#                   int(post['gpa']),
#                   int(post['apib_number']),
#                   int(post['apib_scores']),
#                   int(post['standardized_test_scores']),
#                   int(college['in_state']),
#                   int(college['round']),
#                   colleges_list.index(college['school_name']),
#                   post['major'],
#                   post['residence'],
#                   '\n'.join(post['extracurriculars'] + post['awards'])]
#         y = college['accepted']

#         dataset.append((inputs, y))

In [11]:
split_index = int(0.8 * len(data))

train_data = dict(list(data.items())[:split_index])
test_data = dict(list(data.items())[split_index:])

total_count = sum(len(post['results']) for post in train_data.values())

In [12]:
total_count

11040

In [13]:
i = 0

for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    epoch_start_time = time.time()
    
    for post in train_data.values():
        for college in post['results']:
            i += 1

            college_id = colleges_list.index(college['school_name'])
            predicted = model(post, college, college_id)
            target = torch.tensor(college['accepted'], dtype=torch.float32).to(device)

            loss = criterion(predicted, target)
            loss.backward()

            print(f'Predicted: {predicted}, Target: {target}, Loss: {loss.item()}, Data: {i}/{total_count}')

            optimizer.step()
            
            running_loss += loss.item()
            total += 1
            correct += 1 if predicted == target else 0

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_start_time
    print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_data):.4f}, Accuracy: {correct/total:.4f}, Time: {epoch_time:.2f} seconds')

  return torch.tensor(input, dtype=torch.long).to(device)


Predicted: 1.0, Target: 1.0, Loss: -0.0, Data: 1/11040
Predicted: 1.0, Target: 1.0, Loss: -0.0, Data: 2/11040
Predicted: 1.0, Target: 1.0, Loss: -0.0, Data: 3/11040
Predicted: 1.0, Target: 1.0, Loss: -0.0, Data: 4/11040
Predicted: 1.0, Target: 1.0, Loss: -0.0, Data: 5/11040
Predicted: 1.0, Target: 0.0, Loss: 100.0, Data: 6/11040
Predicted: 1.0, Target: 0.0, Loss: 100.0, Data: 7/11040
Predicted: 1.0, Target: 1.0, Loss: -0.0, Data: 8/11040
Predicted: 1.0, Target: 1.0, Loss: -0.0, Data: 9/11040
Predicted: 1.0, Target: 1.0, Loss: -0.0, Data: 10/11040
Predicted: 1.0, Target: 1.0, Loss: -0.0, Data: 11/11040


In [None]:
torch.save(model.state_dict(), 'categorization/2023_24_regression.pth')