In [3]:
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms, utils
import time
import copy
from torch.autograd import Variable
import math

# Process Data

In [4]:
# Convert strings to dictionaries to embed

class Factors():
    def __init__(self):
        self.tok2ind = {}
        self.ind2tok = {}
    
    def add(self, token):
        if token not in self.tok2ind:
            index = len(self.tok2ind)
            self.tok2ind[token] = index
            self.ind2tok[index] = token
    
    def __len__(self):
        return len(self.tok2ind)
    
    def get_index(self, word):
        if word in self.tok2ind:
            return self.tok2ind[word]
        return self.tok2ind[self.UNKNOWN]
    
    def get_word(self, i):
        return self.ind2tok[i]

    def sentence_to_indices(self, sentence):
        words = [x for x in sentence.split(' ')]
        indices = [self.get_index(w) for w in words]
        return indices
    
# boiler plate code for general sentences but works for our purposes too
def build_factors(examples):
    counts = Counter()
    for ex in examples:
        words = [w for w in ex.split(' ') if w.strip()]
        counts.update(words)
    
    word_list = [w for w in counts if counts[w] > 1]
    
    word_dict = Vocabulary()
    for w in word_list:
        word_dict.add(w)
    return word_dict


In [76]:
# Data Reader Class

class DataReader(Dataset):
    def __init__(self):
        
        # Read in dataframe with proper headers and col dtypes, and remove dups
        self.train_data = pd.read_csv("train.csv", nrows = 50000, index_col = 0).drop(["action_taken_name",
                                                                                      "agency_name",
                                                                                      "state_name"], axis = 1)
        self.test_data = pd.read_csv("test.csv", nrows = 1000, index_col = 0).drop(["action_taken_name",
                                                                                      "agency_name",
                                                                                      "state_name"], axis = 1)
        cats = ["state_abbr", "purchaser_type_name", "property_type_name", "preapproval_name",
                "owner_occupancy_name", "msamd_name", "loan_type_name", "loan_purpose_name",
                "lien_status_name", "hoepa_status_name", "denial_reason_name_1", "county_name",
                "co_applicant_sex_name", "co_applicant_race_name_1", "co_applicant_ethnicity_name",
                "applicant_sex_name", "applicant_race_name_1", "applicant_ethnicity_name", "agency_abbr",
                "approved"]
        
        # convert categorical strings to classes
        for cat in cats:
            self.train_data[cat] = pd.Categorical(self.train_data[cat]).codes
            self.test_data[cat] = pd.Categorical(self.test_data[cat]).codes

        self.train_x = np.asarray(self.train_data.iloc[:, 0:-1])
        self.train_y = np.asarray(self.train_data.iloc[:, -1])
        print("Read %d train samples" % len(self.train_y))


        self.test_x = np.asarray(self.test_data.iloc[:, 0:-1])
        self.test_y = np.asarray(self.test_data.iloc[:, -1])
        print("Read %d test samples" % len(self.test_y))

        
        # get meta
        self.num_classes = 2
        self.input_size = self.train_x.shape[1]
        self.indexes = list(range(len(self.train_y)))
        self.train_size = len(self.train_y)
        
    def inputSize(self):
        return self.input_size

    def init(self, batch_size):
        # shuffle
        self.batch_size = batch_size
        np.random.shuffle(self.indexes)
        return int(math.ceil(self.train_size / float(batch_size)))

    def get_batch(self, i):
        selected_idx = self.indexes[i*self.batch_size : (i+1)*self.batch_size]
        return self.train_x[selected_idx, :], self.train_y[selected_idx]

df = DataReader()
print(df.train_x[1])

Read 50000 train samples
Read 1000 test samples
[nan nan nan nan nan nan 51.0 nan 62.0 41 nan 7 0 0 2 -1 0 1 1 1 nan nan -1
 -1 0 6 3 nan 2017 1 4 1 4]


# Building Model

In [72]:
class FNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.non_linear = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        var_x = Variable(torch.from_numpy(x).float())
        logitis = self.fc2(self.non_linear(self.fc1(var_x)))
        return nn.functional.log_softmax(logitis, dim=1)

# Evaluating Model

In [73]:
def eval(data_x, data_y, model, loss_func, name):
    model.eval()
    log_py = model(data_x)
    y = Variable(torch.from_numpy(data_y).long())
    l = loss_func(log_py, y).item()
    
    pred = np.argmax(log_py.data.numpy(), axis=1)
    acc = np.mean(pred == data_y)
    print("%s loss %f and acc %f " % (name, l, acc))
    return l, acc

# Training

In [74]:
NUM_EPOCH = 20
BATCH_SIZE = 32
LR = 0.1
MOMENTUM = 0.0
L2_DECAY = 0.0

data_loader = DataReader()
model = FNN(data_loader.inputSize(), hidden_size=300, output_size=2)
nll_loss = nn.NLLLoss()
op = torch.optim.SGD(model.parameters(), lr=LR, 
                     momentum=MOMENTUM, weight_decay=L2_DECAY)

train_metric, test_metric = [], []
for i in range(NUM_EPOCH):
    print("Epoch %d" % i)
    num_batches = data_loader.init(batch_size=BATCH_SIZE)
    model.train()
    for b in range(num_batches):
        x, y = data_loader.get_batch(b)
        pred_y = model(x)
        y = Variable(torch.from_numpy(y).long())
        loss = nll_loss(pred_y, y)
        op.zero_grad()
        loss.backward()
        op.step()

    train_metric.append(eval(data_loader.train_x, data_loader.train_y, model, nll_loss, "TRAIN"))
    dev_metric.append(eval(data_loader.dev_x, data_loader.dev_y, model, nll_loss, "TEST"))

Read 50000 train samples
Read 1000 test samples
Epoch 0


RuntimeError: can't convert a given np.ndarray to a tensor - it has an invalid type. The only supported types are: double, float, int64, int32, and uint8.