In [53]:
import pandas as pd
import csv
import torch
import copy
import numpy as np
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [58]:
data_raw = pd.read_csv('standardized.csv')

In [3]:
TEXT = data.Field(
    sequential=True,
    fix_length=500,
    tokenize='spacy',
    pad_first=True,
    lower=True
)

LABEL = data.LabelField(dtype = torch.float,
                        use_vocab=False, 
                        sequential=False,
                        is_target=True)

full = data.TabularDataset('standardized.csv', 'csv', skip_header=True,
        fields=[
            ('docket', None),
            ('outcome', None),
            ('facts', TEXT),
            ('conclusion', None),
            ('target', LABEL)
        ])

TEXT.build_vocab(
    full,
    max_size=20000,
    min_freq=10,
    vectors=None
)

LABEL.build_vocab(full)

In [None]:
print(vars(full.examples[150]))

In [10]:
SEED = 189053
train_data, test_data, valid_data = full.split(split_ratio=[0.7, 0.15, 0.15], random_state = random.seed(SEED))

In [11]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 31921
Number of validation examples: 6840
Number of testing examples: 6841


In [12]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 17249
Unique tokens in LABEL vocabulary: 2


In [39]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = 'cpu',
    sort_key=lambda x: len(x.facts))

In [51]:
valid_iterator.dataset.fields

{'docket': None,
 'outcome': None,
 'facts': <torchtext.data.field.Field at 0x2db28de6348>,
 'conclusion': None,
 'target': <torchtext.data.field.LabelField at 0x2db28ae2f48>}

In [54]:
def binary_accuracy(preds, y):
    """
    Return accuracy per batch
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

class WordEmbAvg(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)  
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()                                 
        
    def forward(self, text):
        embeddings = self.embedding(text)
        embeddings_avg = embeddings.mean(0)
        output = self.linear1(embeddings_avg)
        final = self.linear2(self.relu(output))
        return final
    
class Training_module( ):

    def __init__(self, model):
        self.model = model
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.optimizer = optim.Adam(model.parameters(), lr=0.001)
        self.model.train()
    
    def train_epoch(self, iterator):
        '''
        Train the model for one epoch. For this repeat the following, 
        going through all training examples.
        1. Get the next batch of inputs from the iterator.
        2. Determine the predictions using a forward pass.
        3. Compute the loss.
        4. Compute gradients using a backward pass.
        5. Execute one step of the optimizer to update the model paramters.
        '''
        epoch_loss = 0
        epoch_acc = 0
    
        for batch in iterator:
            #batch.facts has the texts and batch.target has the labels.          
            self.optimizer.zero_grad()
            predictions = self.model(batch.facts).squeeze(1)
            loss = self.loss_fn(predictions, batch.target)                      
            accuracy = binary_accuracy(predictions, batch.target) 
                   
            loss.backward()
            self.optimizer.step()
                        
            #print(accuracy.item())
            epoch_loss += loss.item()
            epoch_acc += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def train_model(self, train_iterator, dev_iterator):
        """
        Train the model for multiple epochs, and after each evaluate on the
        development set.  Return the best performing model.
        """  
        dev_accs = [0.]
        for epoch in range(5):
            self.train_epoch(train_iterator)
            dev_acc = self.evaluate(dev_iterator)
            print(f"Epoch {epoch}: Dev Accuracy: {dev_acc[1]} Dev Loss:{dev_acc[0]}")
            if dev_acc[1] > max(dev_accs):
                best_model = copy.deepcopy(self)
            dev_accs.append(dev_acc[1])
        return best_model.model
                
    def evaluate(self, iterator):
        '''
        Evaluate the performance of the model on the given examples.
        '''
        epoch_loss = 0
        epoch_acc = 0
        
        self.model.eval()
    
        with torch.no_grad():
            for batch in iterator:
                predictions = self.model(batch.facts).squeeze(1)
                loss = self.loss_fn(predictions, batch.target)                      
                accuracy = binary_accuracy(predictions, batch.target) 

                epoch_loss += loss.item()
                epoch_acc += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
OUTPUT_DIM = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = WordEmbAvg(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)

In [55]:
model = model.to()
tm = Training_module(model)

#Training the model
best_model = tm.train_model(train_iterator, valid_iterator)

Epoch 0: Dev Accuracy: 0.8379714617105288 Dev Loss:0.3875883064258879
Epoch 1: Dev Accuracy: 0.8534921563674356 Dev Loss:0.3506992033151823
Epoch 2: Dev Accuracy: 0.8621078105730431 Dev Loss:0.3413011701168301
Epoch 3: Dev Accuracy: 0.867135347049927 Dev Loss:0.3254692949285017
Epoch 4: Dev Accuracy: 0.8697847127914429 Dev Loss:0.3188413398288121


In [56]:
tm.model = best_model
test_loss, test_acc = tm.evaluate(test_iterator)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.328 | Test Acc: 86.46%


In [61]:
a = tm.model.embedding.weight.data
result = list(torch.norm(a, p=2, dim=1).numpy())

word_ls = TEXT.vocab.itos
top_n = 10
top_neg = {}
top_pos = {}

max_pos = [result.index(x) for x in sorted(result, reverse=True)][:top_n]
max_neg = [result.index(x) for x in sorted(result)][:top_n]

for i in range(top_n):
    i_p = max_pos[i]
    i_n = max_neg[i]
    top_pos[word_ls[i_p]] = result[i_p]
    top_neg[word_ls[i_n]] = result[i_n]

In [62]:
top_neg

{'<pad>': 0.0,
 '12.25': 7.0776877,
 'thedwelling': 7.299462,
 'rounded.under': 7.502485,
 'area.1the': 7.5841427,
 '24.49': 7.677865,
 '14.05': 7.731292,
 'explained': 7.735195,
 '8,415': 7.749353,
 'dividing': 7.7812696}

In [63]:
top_pos

{'fourequity': 13.493361,
 '33.16%.in': 13.157002,
 'characteristics.three': 13.042526,
 'submit': 13.038526,
 '21.48': 12.993191,
 'estimating': 12.953707,
 'equitycomparables.ptab/jbv/7-19': 12.886232,
 'complete': 12.865942,
 'ofan': 12.856337,
 'establishing': 12.792905}