In [1]:
import io

import boto3
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
import torch.onnx
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Retrieve the data

In [2]:
# Name of the S3 bucket the notebook reads from, and the client used to reach it.
data_bucket_name = 'datadan'
s3_client = boto3.client('s3')

In [3]:
# Fetch the CSV object from S3 and read its body fully into memory.
response = s3_client.get_object(
    Bucket=data_bucket_name,
    Key='practical-ai/section3/grad_school.csv',
)
response_body = response["Body"].read()

# Parse the downloaded bytes as a comma-delimited file whose first row
# holds the column names.
data = pd.read_csv(
    io.BytesIO(response_body),
    header=0,
    delimiter=",",
    low_memory=False,
)

# Preview the first few rows.
data.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


# Pre-process data

In [4]:
# scale the admissions features (gre, gpa, rank) with min-max scaling
feature_columns = ['gre', 'gpa', 'rank']

# Split BEFORE fitting the scaler so the scaling parameters are learned
# from training rows only (no information leaks from the test set), and
# fix the seed so the split is reproducible on Restart & Run All.
train_raw, test_raw = train_test_split(data, test_size=0.2, random_state=42)

# Keep the fitted scaler around: any new input sent to the trained model
# has to be transformed with these same parameters.
scaler = MinMaxScaler().fit(train_raw[feature_columns])

# Apply the train-fitted scaling to every row. Test rows can fall slightly
# outside [0, 1] if they exceed the range seen in training.
X = scaler.transform(data[feature_columns])
data_scaled = pd.DataFrame(X, columns=feature_columns, index=data.index).join(data['admit'])

# split the scaled data into the training and test sets chosen above
train = data_scaled.loc[train_raw.index]
test = data_scaled.loc[test_raw.index]
assert len(train) + len(test) == len(data_scaled)

train.head()

Unnamed: 0,gre,gpa,rank,admit
140,0.724138,0.95977,0.333333,0
237,0.448276,1.0,0.333333,0
274,0.517241,0.494253,0.333333,0
131,0.724138,0.304598,0.333333,0
73,0.62069,1.0,0.333333,0


In [5]:
# Row counts of the training and test splits, one per line.
print(len(train), len(test), sep='\n')

320
80


# Train and Export Logistic Regression Model

## Model Definition

In [6]:
class LogRegModel(torch.nn.Module):
    """Logistic regression: a single linear layer followed by a sigmoid.

    Args:
        input_dim - number of input features per sample
        output_dim - number of outputs per sample
    """

    def __init__(self, input_dim, output_dim):
        # initialise the torch.nn.Module machinery first
        super().__init__()

        # the only learnable part of the model: a linear map from
        # input_dim features to output_dim outputs
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Apply the linear layer to x, then the logistic (sigmoid) function."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [7]:
def log_reg_fit(x, y, learning_rate, epochs):
    """
    Train a logistic regression model with gradient descent in pytorch.

    Each epoch performs exactly one optimiser step computed on the whole
    data set (there is no mini-batching), using torch.optim.SGD and binary
    cross-entropy loss.

    Args:
        x - feature array, a numpy array (or array-like) of shape
            (n_samples,) or (n_samples, n_features)
        y - response array, a numpy array (or array-like) of shape
            (n_samples,) or (n_samples, n_outputs)
        learning_rate - learning rate used by the SGD optimiser
        epochs - number of update steps to run
    Returns:
        The trained LogRegModel
    """

    # accept lists and other array-likes as well as numpy arrays
    x = np.asarray(x)
    y = np.asarray(y)

    # prep the shapes of x and y for pytorch: a 1-D array is treated as a
    # single column, so both arrays end up 2-D
    if x.ndim == 1:
        x = x[:, np.newaxis]
    if y.ndim == 1:
        y = y[:, np.newaxis]

    # the number of features we expect as input and the number of
    # outputs we produce are the column counts of the 2-D arrays
    input_dimension = x.shape[1]
    output_dimension = y.shape[1]

    # initialize the model
    model = LogRegModel(input_dimension, output_dimension)

    # our error/loss function
    criterion = torch.nn.BCELoss()

    # define our SGD optimizer
    optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # The training data never changes between epochs, so convert it to
    # float tensors once instead of on every iteration.
    features = torch.from_numpy(x).float()
    labels = torch.from_numpy(y).float()

    for _ in range(epochs):

        # clear the gradients left over from the previous step
        optimiser.zero_grad()

        # calculate the predicted values; call the module itself rather
        # than .forward() so that any registered hooks are run
        predictions = model(features)

        # calculate our loss
        loss = criterion(predictions, labels)

        # back-propagate the loss and apply the gradient-based update
        # to the model's parameters
        loss.backward()
        optimiser.step()

    return model

## Model Training and Testing

In [8]:
# train our model; seed torch first so the random weight initialisation
# (and with it the reported accuracy) is repeatable for a given split
torch.manual_seed(42)
model = log_reg_fit(train[['gre','gpa','rank']].values, train['admit'].values, 0.1, 10000)

# make predictions on our test data; no gradients are needed for
# inference, so skip building the autograd graph
test_features = torch.from_numpy(test[['gre','gpa','rank']].values).float()
with torch.no_grad():
    raw_predictions = model(test_features)

# turn the predicted probabilities (first output column) into class
# labels: 1.0 when the probability exceeds 0.5, otherwise 0.0
predictions = (raw_predictions.numpy()[:, 0] > 0.50).astype(float).tolist()

# calculate our accuracy
acc = accuracy_score(test['admit'].values, predictions)
print('Accuracy: ', acc)

Accuracy:  0.6625


## Export ONNX model

In [9]:
# Create a dummy input for the model export, so that the exporter knows
# the shape of the expected input. One real row is enough: only the
# number of feature columns matters, because the batch dimension is
# declared dynamic below. Without dynamic_axes the exported graph would
# be fixed to the batch size of the dummy input.
dummy_input = torch.from_numpy(test[['gre','gpa','rank']].values[:1]).float()

# NOTE: the graph's input/output tensors are now explicitly named
# "input" and "output" (dynamic_axes refers to tensors by name).
torch.onnx.export(
    model,
    dummy_input,
    "log_reg.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
)