# In [6]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer


# In [7]:
# Module-level placeholder; bound to an empty dict here.
sample_json = dict()

# In [10]:
class ResultRegressor(torch.nn.Module):
    """Score one applicant/college pair from scalar and free-text features.

    Ten scalar features are concatenated with four 768-wide text embeddings
    (major, residence, extracurriculars, awards). Each text field has its own
    DistilBERT encoder followed by a linear projection. The concatenated
    vector goes through a five-layer MLP that ends in a single sigmoid unit.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

        # One independent encoder per text field.
        self.text1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text2 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text3 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text4 = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Per-field projection applied to the first-token hidden state.
        self.pc1 = torch.nn.Linear(768, 768)
        self.pc2 = torch.nn.Linear(768, 768)
        self.pc3 = torch.nn.Linear(768, 768)
        self.pc4 = torch.nn.Linear(768, 768)

        # Layer sizes must be ints: `/` would produce the float 1541.0, which
        # torch.nn.Linear rejects, so use floor division.
        in_features = 10 + 768 * 4
        half_features = in_features // 2
        self.fc1 = torch.nn.Linear(in_features, half_features)
        self.fc2 = torch.nn.Linear(half_features, 512)
        self.fc3 = torch.nn.Linear(512, 128)
        self.fc4 = torch.nn.Linear(128, 32)
        self.fc5 = torch.nn.Linear(32, 1)

        self.relu = torch.nn.ReLU()
        # Retained so existing references to `.softmax` keep working; it is not
        # used in forward() because softmax over one unit is constantly 1.0.
        self.softmax = torch.nn.Softmax(dim=-1)
        self.dropout = torch.nn.Dropout(0.5)

    def tokenize(self, input, max_length=3000):
        """Tokenize one string into padded id and attention-mask tensors.

        Args:
            input: Text to tokenize.
            max_length: Requested padded length. It is clamped to the
                tokenizer's ``model_max_length`` when that attribute exists,
                because the encoder cannot embed positions beyond its limit.

        Returns:
            Tuple ``(input_ids, attention_mask)`` taken from the tokenizer
            output, requested as PyTorch tensors.
        """
        model_limit = getattr(self.tokenizer, 'model_max_length', None)
        if model_limit is not None:
            max_length = min(max_length, model_limit)

        # Call the tokenizer itself rather than `.encode`: `.encode` returns a
        # bare list of ids, with no attention mask and no dict to index into.
        tokenized = self.tokenizer(
            input,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return tokenized['input_ids'], tokenized['attention_mask']

    def _embed_text(self, encoder, projection, text, max_length=None):
        """Encode `text` and return the projected first-token embedding."""
        if max_length is None:
            ids, mask = self.tokenize(text)
        else:
            ids, mask = self.tokenize(text, max_length=max_length)

        # Keep the inputs on the same device as the weights that consume them.
        device = projection.weight.device
        hidden_states = encoder(ids.to(device), attention_mask=mask.to(device))[0]
        first_token = hidden_states[:, 0]
        return self.dropout(self.relu(projection(first_token)))

    def forward(self, x, results, college_id):
        """Compute the prediction for a single applicant/college pair.

        Args:
            x: Mapping with the keys ``ethnicity``, ``gender``,
                ``income_bracket``, ``gpa``, ``apib_number``, ``apib_scores``,
                ``standardized_test_scores`` (each converted with ``int``),
                ``major`` and ``residence`` (strings), and
                ``extracurriculars`` and ``awards`` (iterables of strings,
                joined with newlines).
            results: Mapping with the keys ``in_state`` and ``round``.
            college_id: Numeric identifier, used directly as a feature.

        Returns:
            Tensor holding one sigmoid-activated value per input row.
        """
        device = self.fc1.weight.device

        # NOTE(review): int() truncates fractional values (e.g. a 3.7 GPA
        # becomes 3) -- confirm that these fields are integer-coded upstream.
        scalar_features = torch.tensor(
            [[
                int(x['ethnicity']),
                int(x['gender']),
                int(x['income_bracket']),
                int(x['gpa']),
                int(x['apib_number']),
                int(x['apib_scores']),
                int(x['standardized_test_scores']),
                int(results['in_state']),
                int(results['round']),
                college_id,
            ]],
            dtype=torch.float32,
            device=device,
        )

        major_vec = self._embed_text(self.text1, self.pc1, x['major'], max_length=280)
        residence_vec = self._embed_text(self.text2, self.pc2, x['residence'], max_length=280)
        extracurricular_vec = self._embed_text(
            self.text3, self.pc3, '\n'.join(x['extracurriculars']))
        awards_vec = self._embed_text(self.text4, self.pc4, '\n'.join(x['awards']))

        # Concatenate along the feature axis; `list += tensor` would instead
        # extend a Python list with the tensor's rows.
        features = torch.cat(
            [scalar_features, major_vec, residence_vec, extracurricular_vec, awards_vec],
            dim=1,
        )

        hidden = self.relu(self.fc1(features))
        hidden = self.relu(self.fc2(hidden))
        hidden = self.relu(self.fc3(hidden))
        hidden = self.relu(self.fc4(hidden))
        logit = self.fc5(hidden)

        # A single output unit needs sigmoid, not softmax, and no dropout is
        # applied to the prediction itself.
        return torch.sigmoid(logit)