In [53]:
import pandas as pd
import csv
import torch
import copy
import numpy as np
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#based on homework 2 framework

In [58]:
data_raw = pd.read_csv('standardized.csv')

In [116]:
TEXT = data.Field(
    sequential=True,
    fix_length=500,
    tokenize='spacy',
    pad_first=True,
    lower=True
)

LABEL = data.LabelField(dtype = torch.float,
                        use_vocab=False, 
                        sequential=False,
                        is_target=True)

full = data.TabularDataset('standardized.csv', 'csv', skip_header=True,
        fields=[
            ('docket', None),
            ('outcome', None),
            ('facts', TEXT),
            ('conclusion', None),
            ('target', LABEL)
        ])

TEXT.build_vocab(
    full,
    max_size=20000,
    min_freq=100,
    vectors=None
)

LABEL.build_vocab(full)

In [117]:
print(vars(full.examples[150]))

{'facts': ['  ', 'the', 'subject', 'property', 'consists', 'of', 'a', '6,250', 'parcel', 'of', 'land', 'improved', 'with', 'a', '95-year', 'old', ',', 'one', '-', 'story', ',', 'commercial', 'building', 'containing', '3,178', 'square', 'feet', 'of', 'building', 'area', '.', ' ', 'the', 'property', 'is', 'located', 'in', 'chicago', ',', 'jefferson', 'township', ',', 'cook', 'county', '.', ' ', 'the', 'property', 'is', 'a', 'docket', 'no', ':', '09', '-', '24022.001-c-1', '   ', '2', 'of', '4', 'class', '5', 'property', 'under', 'the', 'cook', 'county', 'real', 'property', 'assessment', 'classification', 'ordinance', '.', '  ', 'the', 'appellant', 'contends', 'overvaluation', 'as', 'the', 'basis', 'of', 'the', 'appeal', '.', 'in', 'support', 'of', 'this', 'argument', 'the', 'appellant', 'submitted', 'an', 'appraisal', 'estimating', 'the', 'subject', 'property', 'had', 'a', 'market', 'value', 'of', '$', '238,000', 'as', 'of', 'january', '1', ',', '2009', '.', '  ', 'the', 'board', 'of', '

In [118]:
SEED = 189053
train_data, test_data, valid_data = full.split(split_ratio=[0.7, 0.15, 0.15], random_state = random.seed(SEED))

In [119]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 31921
Number of validation examples: 6840
Number of testing examples: 6841


In [120]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

#13114 unique @ min_freq 15
#8700 unique @ min_freq 25
#6186 unique @ min_freq 35
#4533 unique @ min_freq 50
#3352 unique @ min_freq 75
#2799 unique @ min_freq 100

Unique tokens in TEXT vocabulary: 2799
Unique tokens in LABEL vocabulary: 2


In [121]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = 'cpu',
    sort_key=lambda x: len(x.facts))

In [122]:
valid_iterator.dataset.fields

{'docket': None,
 'outcome': None,
 'facts': <torchtext.data.field.Field at 0x2dc1f7df388>,
 'conclusion': None,
 'target': <torchtext.data.field.LabelField at 0x2dc28679588>}

In [123]:
def binary_accuracy(preds, y):
    """
    Return accuracy per batch
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

class WordEmbAvg(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)  
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()                                 
        
    def forward(self, text):
        embeddings = self.embedding(text)
        embeddings_avg = embeddings.mean(0)
        output = self.linear1(embeddings_avg)
        final = self.linear2(self.relu(output))
        return final
    
class Training_module( ):

    def __init__(self, model):
        self.model = model
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.optimizer = optim.Adam(model.parameters(), lr=0.001)
        self.model.train()
    
    def train_epoch(self, iterator):
        '''
        Train the model for one epoch. For this repeat the following, 
        going through all training examples.
        1. Get the next batch of inputs from the iterator.
        2. Determine the predictions using a forward pass.
        3. Compute the loss.
        4. Compute gradients using a backward pass.
        5. Execute one step of the optimizer to update the model paramters.
        '''
        epoch_loss = 0
        epoch_acc = 0
    
        for batch in iterator:
            #batch.facts has the texts and batch.target has the labels.          
            self.optimizer.zero_grad()
            predictions = self.model(batch.facts).squeeze(1)
            loss = self.loss_fn(predictions, batch.target)                      
            accuracy = binary_accuracy(predictions, batch.target) 
                   
            loss.backward()
            self.optimizer.step()
                        
            #print(accuracy.item())
            epoch_loss += loss.item()
            epoch_acc += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def train_model(self, train_iterator, dev_iterator):
        """
        Train the model for multiple epochs, and after each evaluate on the
        development set.  Return the best performing model.
        """  
        dev_accs = [0.]
        for epoch in range(5):
            self.train_epoch(train_iterator)
            dev_acc = self.evaluate(dev_iterator)
            print(f"Epoch {epoch}: Dev Accuracy: {dev_acc[1]} Dev Loss:{dev_acc[0]}")
            if dev_acc[1] > max(dev_accs):
                best_model = copy.deepcopy(self)
            dev_accs.append(dev_acc[1])
        return best_model.model
                
    def evaluate(self, iterator):
        '''
        Evaluate the performance of the model on the given examples.
        '''
        epoch_loss = 0
        epoch_acc = 0
        
        self.model.eval()
    
        with torch.no_grad():
            for batch in iterator:
                predictions = self.model(batch.facts).squeeze(1)
                loss = self.loss_fn(predictions, batch.target)                      
                accuracy = binary_accuracy(predictions, batch.target) 

                epoch_loss += loss.item()
                epoch_acc += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
OUTPUT_DIM = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = WordEmbAvg(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)

In [124]:
model = model.to()
tm = Training_module(model)

#Training the model
best_model = tm.train_model(train_iterator, valid_iterator)

Epoch 0: Dev Accuracy: 0.830816087878753 Dev Loss:0.4032839011644649
Epoch 1: Dev Accuracy: 0.848819259171174 Dev Loss:0.35916098353461684
Epoch 2: Dev Accuracy: 0.8560372162088056 Dev Loss:0.3438979541169149
Epoch 3: Dev Accuracy: 0.859062082856615 Dev Loss:0.3351214023951058
Epoch 4: Dev Accuracy: 0.8627336448598131 Dev Loss:0.32859058538886987


In [125]:
tm.model = best_model
test_loss, test_acc = tm.evaluate(test_iterator)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.333 | Test Acc: 86.04%


In [126]:
a = tm.model.embedding.weight.data
result = list(torch.norm(a, p=2, dim=1).numpy())

word_ls = TEXT.vocab.itos
top_n = 25
top_neg = {}
top_pos = {}

max_pos = [result.index(x) for x in sorted(result, reverse=True)][:top_n]
max_neg = [result.index(x) for x in sorted(result)][:top_n]

for i in range(top_n):
    i_p = max_pos[i]
    i_n = max_neg[i]
    top_pos[word_ls[i_p]] = result[i_p]
    top_neg[word_ls[i_n]] = result[i_n]

In [127]:
top_neg

{'<pad>': 0.0,
 'landassessment': 7.8902884,
 'nos': 8.041609,
 '1971': 8.113046,
 'joliet': 8.1292515,
 'which': 8.129589,
 'condominiums': 8.170181,
 'percent': 8.186706,
 '24': 8.204296,
 'with': 8.207816,
 'regards': 8.239091,
 'propertyhas': 8.249024,
 '    ': 8.27115,
 'iii': 8.284628,
 '.15': 8.309838,
 '315,000': 8.311799,
 'sheet': 8.382522,
 'english': 8.401081,
 '23,000': 8.402858,
 'auto': 8.418486,
 '352': 8.433661,
 '58-year': 8.436627,
 'skokie': 8.441336,
 '1,232': 8.450788,
 '520': 8.46168}

In [128]:
top_pos

{'submit': 14.341018,
 'estimating': 13.694119,
 'valuation': 13.666112,
 'rebuttal': 13.294349,
 'fourequity': 13.2800455,
 'insupport': 13.203797,
 'limited': 12.991287,
 'frankfort': 12.96951,
 'purported': 12.940401,
 'correctly': 12.932102,
 'aurora': 12.825233,
 '105': 12.789168,
 'stipulate': 12.7762,
 'comparables1': 12.71342,
 'proposed': 12.635103,
 'but': 12.51256,
 'rendered': 12.4135895,
 'meadows': 12.37192,
 'applying': 12.351024,
 'comparables.in': 12.338044,
 'complete': 12.327074,
 'described': 12.323813,
 'matter': 12.322682,
 'madison': 12.310271,
 'effect': 12.308301}