In [None]:
from transformers import DistilBertModel, DistilBertTokenizer
import nltk
import torch
import difflib
import json
import numpy as np
import time

In [None]:
# Download the NLTK stop-word corpus, then keep the English entries in a set
# so that membership tests during text cleaning are cheap.
nltk.download('stopwords')
nltk_stopwords = {word for word in nltk.corpus.stopwords.words('english')}
nltk_stopwords

In [None]:
# Canonical college names: each line of all-colleges.txt up to (not including)
# its first " (" -- the parenthesised suffix is dropped.
with open('../categorization/all-colleges.txt') as colleges_file:
    colleges_list = [line[:line.index(' (')] for line in colleges_file]

try:
    # Reuse the cached, already standardized results when they can be read.
    with open('../categorization/standardized_output.json', 'r') as cache_file:
        data = json.load(cache_file)
except (OSError, ValueError):
    # Cache missing, unreadable or not valid JSON (json.JSONDecodeError is a
    # ValueError): rebuild it from the raw output. Only these errors are
    # caught so that interrupts and programming errors still surface.
    with open('../categorization/output_2.json') as raw_file:
        data = json.load(raw_file)

    for post in data.values():
        standardized_results = []
        for college in post['results']:
            # Map the free-text school name onto the closest canonical name;
            # results without a match at cutoff 0.8 are discarded.
            closest_name = difflib.get_close_matches(college['school_name'], colleges_list, n=1, cutoff=0.8)
            if closest_name:
                college['school_name'] = closest_name[0]
                standardized_results.append(college)
        # Slice assignment keeps the same list object, as the original pop() calls did.
        post['results'][:] = standardized_results

    with open('../categorization/standardized_output.json', 'w') as cache_file:
        json.dump(data, cache_file)

In [None]:
# Use the MPS backend when PyTorch reports it as available; otherwise fall
# back to the CPU. The bare name on the last line shows the choice as cell output.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device

In [None]:
class ResultRegressor(torch.nn.Module):
    """Predict an acceptance probability for one (applicant post, college) pair.

    Four separate DistilBERT encoders embed the applicant's major, residence,
    extracurriculars and awards. The first-token hidden state of each encoder
    goes through its own 768 -> 768 projection; the four projections are
    concatenated with ten numeric features (10 + 4 * 768 = 3082 values) and an
    MLP reduces that vector to one logit, which a sigmoid maps to a probability.
    """

    def __init__(self, stopwords):
        """Build the tokenizer, the four text encoders and the dense layers.

        Args:
            stopwords: Collection of lower-case words that `remove_stopwords`
                strips from text before tokenization.
        """
        super().__init__()
        self.stopwords = stopwords
        # Truncation is an encode-time option, so it is requested in tokenize()
        # rather than here.
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

        # One independently fine-tuned encoder per free-text field.
        self.text1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text2 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text3 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text4 = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Per-field projection applied to each encoder's first-token state.
        self.pc1 = torch.nn.Linear(768, 768)
        self.pc2 = torch.nn.Linear(768, 768)
        self.pc3 = torch.nn.Linear(768, 768)
        self.pc4 = torch.nn.Linear(768, 768)

        # MLP head: 10 numeric features + 4 * 768 text features = 3082 inputs.
        self.fc1 = torch.nn.Linear(3082, 1541)
        self.fc2 = torch.nn.Linear(1541, 512)
        self.fc3 = torch.nn.Linear(512, 128)
        self.fc4 = torch.nn.Linear(128, 32)
        self.fc5 = torch.nn.Linear(32, 1)

        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.5)

    def ts(self, input):
        """Return `input` as a long tensor on the device holding this module's weights.

        `torch.as_tensor` accepts both Python sequences and existing tensors,
        so tensors returned by `tokenize` are not copy-constructed again.
        """
        return torch.as_tensor(input, dtype=torch.long).to(self.fc1.weight.device)

    def remove_stopwords(self, text):
        """Drop whitespace-separated words whose lower-case form is a stop word."""
        filtered_text = [w for w in text.split() if w.lower() not in self.stopwords]
        return " ".join(filtered_text)

    def tokenize(self, input, max_length=512):
        """Encode `input` into token ids and an attention mask.

        The sequence is padded or truncated to exactly `max_length` tokens.

        Returns:
            Tuple `(input_ids, attention_mask)` of 1-D long tensors.
        """
        tokenized = self.tokenizer(
            input,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )
        return torch.tensor(tokenized['input_ids'], dtype=torch.long), torch.tensor(tokenized['attention_mask'], dtype=torch.long)

    def _embed_text(self, encoder, projection, text):
        """Encode one text field and return its projected 1-D feature vector."""
        ids, mask = self.tokenize(self.remove_stopwords(text))
        # tokenize() returns 1-D tensors; the encoder takes (batch, sequence)
        # inputs, so add the batch dimension explicitly.
        hidden_states = encoder(input_ids=self.ts(ids).unsqueeze(0),
                                attention_mask=self.ts(mask).unsqueeze(0))[0]
        first_token = hidden_states[:, 0]
        return self.dropout(self.relu(projection(first_token)))[0]

    def _classify(self, features):
        """Run the MLP head on a 1-D feature vector; return a 0-dim probability."""
        hidden = features
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = self.relu(layer(hidden))
        # A softmax over a single logit is always 1.0, and dropout on the
        # output rescales it outside [0, 1]; a plain sigmoid gives a value
        # that BCELoss can consume.
        return torch.sigmoid(self.fc5(hidden))[0]

    def forward(self, x, results, college_id):
        """Compute the acceptance probability for one post/college pair.

        Args:
            x: Mapping describing the applicant. Reads 'ethnicity', 'gender',
                'income_bracket', 'gpa', 'apib_number', 'apib_scores' and
                'standardized_test_scores' (each passed through int()),
                'major' and 'residence' (strings), and 'extracurriculars' and
                'awards' (iterables of strings joined with newlines).
            results: Mapping for one college result; 'in_state' and 'round'
                are read and passed through int().
            college_id: Integer identifier of the college.

        Returns:
            A 0-dim float tensor.
        """
        # NOTE(review): int() truncates fractional values such as a decimal
        # GPA -- confirm these fields are already integer-coded upstream.
        numerical_inputs = self.ts([int(x['ethnicity']),
                                    int(x['gender']),
                                    int(x['income_bracket']),
                                    int(x['gpa']),
                                    int(x['apib_number']),
                                    int(x['apib_scores']),
                                    int(x['standardized_test_scores']),
                                    int(results['in_state']),
                                    int(results['round']),
                                    college_id]).float()

        text_features = [
            self._embed_text(self.text1, self.pc1, x['major']),
            self._embed_text(self.text2, self.pc2, x['residence']),
            self._embed_text(self.text3, self.pc3, '\n'.join(x['extracurriculars'])),
            self._embed_text(self.text4, self.pc4, '\n'.join(x['awards'])),
        ]

        features = torch.cat([numerical_inputs, *text_features])
        return self._classify(features)

In [None]:
# ResultRegressor.__init__ takes the stop-word collection as a required
# argument; pass the NLTK set built earlier in the notebook.
model = ResultRegressor(nltk_stopwords).to(device)

In [None]:
# Training configuration: epoch budget, a wall-clock reference taken before
# training starts, the loss module and the optimizer over all model parameters.
n_epochs = 10
total_training_start_time = time.time()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Show the loss module's repr as this cell's output.
criterion

In [None]:
# Hold out the last 20% of posts for testing, keeping the dict's existing
# order (no shuffling); the first 80% are used for training.
post_items = list(data.items())
split_index = int(0.8 * len(data))

train_data = dict(post_items[:split_index])
test_data = dict(post_items[split_index:])

# Number of (post, college) training samples, used for progress reporting.
total_count = sum(map(lambda post: len(post['results']), train_data.values()))

In [None]:
# Train one (post, college) sample at a time and report per-epoch statistics.
for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    # Restart the sample counter every epoch so the "Data: i/total_count"
    # progress never runs past total_count.
    i = 0
    epoch_start_time = time.time()

    for post in train_data.values():
        for college in post['results']:
            i += 1

            college_id = colleges_list.index(college['school_name'])

            # Gradients accumulate across backward() calls; clear the previous
            # sample's gradients before computing new ones.
            optimizer.zero_grad()

            predicted = model(post, college, college_id)
            # BCELoss needs a floating-point target with the prediction's
            # dtype, on the prediction's device.
            target = torch.tensor(float(college['accepted']), dtype=predicted.dtype, device=predicted.device)

            loss = criterion(predicted, target)
            loss.backward()

            print(f'Predicted: {predicted}, Target: {target}, Loss: {loss.item()}, Data: {i}/{total_count}')

            optimizer.step()

            running_loss += loss.item()
            total += 1
            # A probability almost never equals the label exactly; compare
            # the two after thresholding at 0.5 instead.
            correct += int((predicted.item() >= 0.5) == (target.item() >= 0.5))

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_start_time
    # Average over the samples actually seen (not the number of posts), and
    # avoid a ZeroDivisionError when the training set is empty.
    mean_loss = running_loss / total if total else 0.0
    accuracy = correct / total if total else 0.0
    print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {mean_loss:.4f}, Accuracy: {accuracy:.4f}, Time: {epoch_time:.2f} seconds')