In [1]:
import pandas as pd
import csv
import torch
import copy
import numpy as np
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#based on homework 2 framework

In [2]:
data_raw = pd.read_csv('standardized.csv')
decisions = pd.read_csv('decisions.csv')

In [120]:
TEXT = data.Field(
    sequential=True,
    fix_length=1000,
    tokenize='spacy',
    pad_first=True,
    lower=True
)

LABEL = data.LabelField(dtype = torch.float,
                        use_vocab=False, 
                        sequential=False,
                        is_target=True)

full = data.TabularDataset('standardized.csv', 'csv', skip_header=True,
        fields=[
            ('docket', None),
            ('outcome', None),
            ('facts', TEXT),
            ('conclusion', None),
            ('target', LABEL)
        ])

TEXT.build_vocab(
    full,
    max_size=20000,
    min_freq=150,
    vectors=None
)

LABEL.build_vocab(full)

In [128]:
print(vars(full.examples[38000]))

{'facts': ['  ', 'the', 'subject', 'property', 'consists', 'of', 'a', 'split', '-', 'level', 'dwelling', 'of', 'frame', 'exterior', 'construction', 'with', '925', 'square', 'feet', 'of', 'living', 'area', '.', ' ', 'the', 'dwelling', 'was', 'constructed', 'in', '1973', '.', ' ', 'features', 'of', 'the', 'home', 'include', 'a', 'finished', 'lower', 'level', 'and', 'a', '400', 'square', 'foot', 'garage', '.', ' ', 'the', 'property', 'has', 'a', '5,480', 'square', 'foot', 'site', 'and', 'is', 'located', 'in', 'round', 'lake', 'beach', ',', 'avon', 'township', ',', 'lake', 'county', '.', ' ', 'the', 'appellants', 'contend', 'overvaluation', 'as', 'the', 'basis', 'of', 'the', 'appeal', '.', ' ', 'in', 'support', 'of', 'this', 'argument', 'the', 'appellant', 'submitted', 'information', 'on', 'three', 'comparable', 'sales', 'located', 'from', '1.82', 'to', '2.42', 'miles', 'from', 'the', 'subject', '.', ' ', 'the', 'comparables', 'consist', 'of', 'dwellings', 'of', 'frame', 'exterior', 'const

In [123]:
SEED = 189053
train_data, test_data, valid_data = full.split(split_ratio=[0.7, 0.15, 0.15], random_state = random.seed(SEED))

In [124]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 31921
Number of validation examples: 6840
Number of testing examples: 6841


In [125]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

#13114 unique @ min_freq 15
#8700 unique @ min_freq 25
#6186 unique @ min_freq 35
#4533 unique @ min_freq 50
#3352 unique @ min_freq 75
#2799 unique @ min_freq 100
#2173 unique @ min_freq 100

Unique tokens in TEXT vocabulary: 2173
Unique tokens in LABEL vocabulary: 2


In [78]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = 'cpu',
    sort_key=lambda x: len(x.facts))

In [80]:
def binary_accuracy(preds, y):
    """
    Return accuracy per batch
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

class WordEmbAvg(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)  
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()                                 
        
    def forward(self, text):
        embeddings = self.embedding(text)
        embeddings_avg = embeddings.mean(0)
        output = self.linear1(embeddings_avg)
        final = self.linear2(self.relu(output))
        return final
    
class TrainingModule():

    def __init__(self, model):
        self.model = model
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.optimizer = optim.Adam(model.parameters(), lr=0.001)
        self.model.train()
    
    def train_epoch(self, iterator):
        epoch_loss = 0
        epoch_acc = 0
    
        for batch in iterator:
            #batch.facts has the texts and batch.target has the labels.          
            self.optimizer.zero_grad()
            predictions = self.model(batch.facts).squeeze(1)
            loss = self.loss_fn(predictions, batch.target)                      
            accuracy = binary_accuracy(predictions, batch.target) 
                   
            loss.backward()
            self.optimizer.step()
                        
            epoch_loss += loss.item()
            epoch_acc += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def train_model(self, train_iterator, dev_iterator):
        dev_accs = [0.]
        for epoch in range(5):
            self.train_epoch(train_iterator)
            dev_acc = self.evaluate(dev_iterator)
            print(f"Epoch {epoch}: Dev Accuracy: {dev_acc[1]} Dev Loss:{dev_acc[0]}")
            if dev_acc[1] > max(dev_accs):
                best_model = copy.deepcopy(self)
            dev_accs.append(dev_acc[1])
        return best_model.model
                
    def evaluate(self, iterator):
        epoch_loss = 0
        epoch_acc = 0
        
        self.model.eval()
    
        with torch.no_grad():
            for batch in iterator:
                predictions = self.model(batch.facts).squeeze(1)
                loss = self.loss_fn(predictions, batch.target)                      
                accuracy = binary_accuracy(predictions, batch.target) 

                epoch_loss += loss.item()
                epoch_acc += accuracy.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 50
OUTPUT_DIM = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = WordEmbAvg(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)

In [81]:
model = model.to()
tm = TrainingModule(model)

best_model = tm.train_model(train_iterator, valid_iterator)

Epoch 0: Dev Accuracy: 0.836552904030987 Dev Loss:0.3811733804852049
Epoch 1: Dev Accuracy: 0.8514060411497811 Dev Loss:0.34807823500900625
Epoch 2: Dev Accuracy: 0.856892523364486 Dev Loss:0.33633933606270316
Epoch 3: Dev Accuracy: 0.8658419557820971 Dev Loss:0.3261891924471499
Epoch 4: Dev Accuracy: 0.8694717956480579 Dev Loss:0.31996052183001955


In [82]:
tm.model = best_model
test_loss, test_acc = tm.evaluate(test_iterator)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.325 | Test Acc: 86.21%


In [129]:
a = tm.model.embedding.weight.data
result = list(torch.norm(a, p=2, dim=1).numpy())

word_ls = TEXT.vocab.itos
top_n = 20
top_neg = {}
top_pos = {}

max_pos = [result.index(x) for x in sorted(result, reverse=True)][:top_n]
max_neg = [result.index(x) for x in sorted(result)][:top_n]

for i in range(top_n):
    i_p = max_pos[i]
    i_n = max_neg[i]
    top_pos[word_ls[i_p]] = result[i_p]
    top_neg[word_ls[i_n]] = result[i_n]

In [130]:
top_neg

{'<pad>': 0.0,
 'front': 12.02051,
 'fullbasement': 12.048797,
 '125,000': 12.0652485,
 'types': 12.140268,
 '24': 12.229512,
 'pleadings': 12.2781,
 'estimates': 12.288643,
 '3,125': 12.36296,
 'parcels': 12.36658,
 'dwellingswere': 12.431138,
 'comps': 12.4603,
 '77': 12.502497,
 '13': 12.50694,
 'collection': 12.520863,
 'an': 12.545551,
 'subarea': 12.576534,
 'rooms': 12.58088,
 'public': 12.593324,
 'includes': 12.609003}

In [131]:
top_pos

{'limited': 19.90613,
 'foreclosure': 19.026604,
 'fourequity': 18.132935,
 'proposed': 17.663214,
 'valuation': 17.593716,
 'timely': 17.452204,
 'correctly': 17.370247,
 'bases': 17.180614,
 'townhome': 17.082584,
 'submit': 17.054323,
 'establishing': 16.894701,
 'e': 16.739061,
 'expense': 16.662413,
 'frankfort': 16.6024,
 '8.94': 16.593983,
 '1900': 16.55477,
 'aurora': 16.547735,
 'agreed': 16.519125,
 'substantive': 16.513819,
 'insupport': 16.50307}

# Alternate Evaluation Dimensions

In [86]:
joined = pd.merge(data_raw, decisions, left_on='docket', right_on='docket_name')
joined['year'] = joined.date.str.slice(0,4)

In [87]:
def eval_subset(joined, attr, val):
    bool_list = joined[attr] == val
    bool_list = bool_list.tolist()
    filtered_full = [i for (i, val) in zip(full.examples, bool_list) if val]

    mini_full = data.Dataset(
        filtered_full,
        fields=[('facts', TEXT),('target', LABEL)])

    filtered_iterator = data.Iterator(
        mini_full, 
        batch_size = BATCH_SIZE,
        device = 'cpu',
        sort_key=lambda x: len(x.facts))

    test_loss, test_acc = tm.evaluate(filtered_iterator)
    print("~~~~")
    print("Analyzing data where {} is {}".format(attr, val))
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
    print(f'Number of cases: {joined[bool_list].shape[0]}')
    print(f'Percent of cases with reductions: {joined[bool_list].target.mean()*100:.2f}%')

In [133]:
joined.target.mean()

0.4166429244559381

In [88]:
eval_subset(joined, 'county', 'Cook')

~~~~
Analyzing data where county is Cook
Test Loss: 0.286 | Test Acc: 88.38%
Number of cases: 36073
Percent of cases with reductions: 44.75%


In [89]:
eval_subset(joined, 'appellant', 'Mack Companies')

~~~~
Analyzing data where appellant is Mack Companies
Test Loss: 0.256 | Test Acc: 91.21%
Number of cases: 281
Percent of cases with reductions: 2.14%


In [90]:
eval_subset(joined, 'appellant', 'Inverclyde, LLC')

~~~~
Analyzing data where appellant is Inverclyde, LLC
Test Loss: 0.422 | Test Acc: 82.03%
Number of cases: 128
Percent of cases with reductions: 37.50%


In [91]:
eval_subset(joined, 'county', 'Lake')

~~~~
Analyzing data where county is Lake
Test Loss: 0.378 | Test Acc: 86.05%
Number of cases: 3487
Percent of cases with reductions: 14.11%


In [92]:
eval_subset(joined, 'reason_code', '2')

~~~~
Analyzing data where reason_code is 2
Test Loss: 0.310 | Test Acc: 87.28%
Number of cases: 42338
Percent of cases with reductions: 41.70%


In [93]:
eval_subset(joined, 'reason_code', '1')

~~~~
Analyzing data where reason_code is 1
Test Loss: 0.363 | Test Acc: 83.70%
Number of cases: 3195
Percent of cases with reductions: 40.85%


In [94]:
eval_subset(joined, 'prop_type', 'R')

~~~~
Analyzing data where prop_type is R
Test Loss: 0.318 | Test Acc: 86.92%
Number of cases: 41491
Percent of cases with reductions: 37.86%


In [95]:
eval_subset(joined, 'prop_type', 'C')

~~~~
Analyzing data where prop_type is C
Test Loss: 0.278 | Test Acc: 87.18%
Number of cases: 3230
Percent of cases with reductions: 79.13%


In [96]:
eval_subset(joined, 'prop_type', 'I')

~~~~
Analyzing data where prop_type is I
Test Loss: 0.238 | Test Acc: 91.26%
Number of cases: 847
Percent of cases with reductions: 83.12%


In [97]:
eval_subset(joined, 'valuation_class', 1)

~~~~
Analyzing data where valuation_class is 1
Test Loss: 0.313 | Test Acc: 87.07%
Number of cases: 45125
Percent of cases with reductions: 41.40%


In [98]:
eval_subset(joined, 'valuation_class', 2)

~~~~
Analyzing data where valuation_class is 2
Test Loss: 0.405 | Test Acc: 80.48%
Number of cases: 370
Percent of cases with reductions: 60.27%


In [99]:
eval_subset(joined, 'year', '2016')

~~~~
Analyzing data where year is 2016
Test Loss: 0.304 | Test Acc: 87.86%
Number of cases: 10473
Percent of cases with reductions: 50.55%


In [100]:
eval_subset(joined, 'year', '2017')

~~~~
Analyzing data where year is 2017
Test Loss: 0.318 | Test Acc: 86.71%
Number of cases: 8293
Percent of cases with reductions: 38.02%


In [101]:
eval_subset(joined, 'year', '2018')

~~~~
Analyzing data where year is 2018
Test Loss: 0.285 | Test Acc: 88.62%
Number of cases: 7399
Percent of cases with reductions: 30.42%


In [102]:
eval_subset(joined, 'year', '2019')

~~~~
Analyzing data where year is 2019
Test Loss: 0.323 | Test Acc: 87.12%
Number of cases: 7490
Percent of cases with reductions: 24.43%


In [103]:
eval_subset(joined, 'year', '2020')

~~~~
Analyzing data where year is 2020
Test Loss: 0.358 | Test Acc: 85.61%
Number of cases: 2721
Percent of cases with reductions: 22.79%
