In [126]:
import torch
import numpy as np
import torchvision
from torch import nn
from skimage.transform import resize
from skimage.color import rgb2gray
import pandas as pd 
from torch.nn import Linear, ReLU, BCELoss, Conv2d, Module, Sigmoid
from torch.optim import Adam
from sklearn.metrics import f1_score, precision_score, recall_score
from tqdm import tqdm
import time

In [127]:
def process(df):
    
    #get rid of black images
    df.drop(df[df['Image'].map(np.max) == 0].index,inplace=True)
    
    #get rid of rgb images
    def func(img):
        if len(img.shape)>2:
            return True
        return False
    
    df.drop(df[df['Image'].map(func)].index,inplace=True)
    
    
    #process all the available images; to grayscale,60*60,normalize,proper dimension for pytorch 
    def processImage(px_data):
        
        px_data_scaled = px_data / px_data.max()
        px_data_scaled = resize(px_data, (60, 60), anti_aliasing=True)
        px_data_scaled = px_data_scaled[None,:,:]
        return px_data_scaled    
    
    
    df.loc[:,'Image']=df.apply(lambda x: processImage(x['Image']), axis=1)
    
    #make the labels into bool Alive is True
    df.loc[:,'label'] = df['label'].map(lambda x: x == 'Alive')
    
    return df
    
    

In [128]:
class Dataset(torch.utils.data.Dataset):
    """Pet images dataset."""
    
    @staticmethod
    def from_dataframe(df):
        """
        args:
            petimages : PetImages -- The PetImages object containing the images.
        kwargs:
            size : int -- the size of the canonicalized images.
        """
        data = torch.as_tensor(np.array(df['Image'].tolist(),dtype=np.float32))
        labels = np.array(df['label'],dtype=np.int8)[:,None]

        
        return Dataset(data, labels)

    
    def __init__(self, data, labels):
        # Don't change the constructor
        self.data = data
        self.labels = labels

    def __len__(self):
        """ Return the number of images in the dataset.
        """
        return self.data.shape[0]


    def __getitem__(self, idx):
        """ Return the element corresponding to idx.
        
        args:
            idx : int -- the index of the sample to return
            
        returns: Dict -- A dictionary with two elements; "label" and "image". "label" has the associated label
            and "image" is a (size, size, 3)
        """
        # Convert it to a regular python int.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        return {'label':self.labels[idx],'image':self.data[idx]}
    
def Dataset_load(df):
    return Dataset.from_dataframe(df)

In [129]:
def split(dataset):
    """ Split pet into train and test sets.
    
    args:
        pet : PetDataset -- the PetDataset instance to split.

    kwargs:
        train_count: The number of elements in the training set. The remainder should be in the test set.
    
    return: List[Dataset] -- the list of [train, test] datasets.
    """
    total = len(dataset)
    test = int(np.ceil(0.2*total))
    train = total-test

    
    return torch.utils.data.random_split(dataset,[train,test])

In [130]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 12, 5)
        self.relu1 = nn.ReLU()
        self.fc1   = nn.Linear(37632, 1)
        self.sigm1 = nn.Sigmoid()
    
    def forward(self, X, debug=False):
        if debug: print(f"Input Shape: {X.shape}")

        X = self.relu1(self.conv1(X))
        if debug: print(f"Conv1 Shape: {X.shape}")

        X = X.view(X.size(0), -1) # Flatten the shape
        if debug: print(f"Flattened Shape: {X.shape}")

        X = self.sigm1(self.fc1(X))
        if debug: print(f"Output Shape: {X.shape}")

        return X

def count_parameters(model):
    # Count all trainable parameters,
    # from https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/9
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [153]:
def training_loop(model, train_dataset, epochs=25, batch_size=500):
    """ Train the model on data
    """
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)

#     pass # TODO: Set up
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    

    start_time = time.perf_counter()
    for epoch in range(epochs):
        # If you add the training loss to this variable, it will be printed for you
        epoch_loss = 0.0
        
        for data in train_dataloader:
            output = model(data['image'])
            loss = criterion(output,data['label'].float())
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
            epoch_loss+= loss.item()
#         pass # TODO:Process all data for each epoch

        epoch += 1
        if epoch % 50 == 0:
            curr_time = time.perf_counter() - start_time
            print(f'[{curr_time:6.1f}/{curr_time/epoch*epochs:6.1f}] Epoch {epoch: <3d} loss: {epoch_loss / len(train_dataloader)} acc: {test_model(model, train_dataset)}')
    print("Done.")

In [154]:
def test_model(model, test_data, batch_size=500):
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)
    num_correct = 0
    num_alive = 0
    num_alive_correct = 0
    num_dead = 0
    num_dead_correct = 0
    for data in test_dataloader:
        y_pred = model(data["image"]).round()
        y_actual = data["label"].float()
        num_correct += (y_pred == y_actual).sum()
        for ya,yp in zip(y_actual,y_pred):
            if ya == 1.0:
                num_alive+=1
                if yp == 1.0:
                    num_alive_correct+=1
            else:
                num_dead+=1
                if yp ==0.0:
                    num_dead_correct+=1
    print(num_dead,num_alive)
    return num_correct.item() / len(test_data),(num_alive_correct/num_alive),(num_dead_correct/num_dead)

In [155]:
def trainModel(filename,epoch,batchsize):
    train_df = pd.read_pickle(filename)
    df_train = process(train_df)
    print(len(df_train))
    ds = Dataset_load(df_train)
    train_data, valid_data = tuple(split(ds))

    model = Model()

    training_loop(model, train_data,epochs = epoch,batch_size = batchsize)

    train_acc,alive_acc,dead_acc = test_model(model, train_data)
    print(f"Train accuracy: {train_acc} Alive accuracy: {alive_acc} Dead accuracy: {dead_acc}")

    test_acc,talive_acc,tdead_acc = test_model(model, valid_data)
    print(f"Test accuracy: {test_acc} Alive accuracy: {talive_acc} Dead accuracy: {tdead_acc}")
    
    return model,train_data,valid_data
    
myModel,_,_ = trainModel("train_csv.pkl",500,50)

1840
249 1223
[ 128.8/1288.5] Epoch 50  loss: 0.3350102995832761 acc: (0.8702445652173914, 0.9918233851185609, 0.27309236947791166)
249 1223
[ 259.4/1297.1] Epoch 100 loss: 0.2527327803273996 acc: (0.90625, 0.9869174161896974, 0.5100401606425703)
249 1223
[ 390.2/1300.5] Epoch 150 loss: 0.19161243960261345 acc: (0.9327445652173914, 0.9811937857726901, 0.6947791164658634)
249 1223
[ 520.3/1300.9] Epoch 200 loss: 0.1583006960650285 acc: (0.9578804347826086, 0.9901880621422731, 0.7991967871485943)
249 1223
[ 651.0/1302.0] Epoch 250 loss: 0.12499107060333094 acc: (0.970108695652174, 0.9893704006541292, 0.8755020080321285)
249 1223
[ 781.3/1302.2] Epoch 300 loss: 0.09667768155535063 acc: (0.9802989130434783, 0.9942763695829926, 0.9116465863453815)
249 1223
[ 911.7/1302.4] Epoch 350 loss: 0.07481369568655888 acc: (0.985733695652174, 0.9959116925592805, 0.9357429718875502)
249 1223
[1042.2/1302.7] Epoch 400 loss: 0.05849616297831138 acc: (0.9891304347826086, 0.9967293540474244, 0.951807228915

In [157]:
test_df = pd.read_pickle('test_data.pkl')
df_test = process(test_df)
print(len(df_test))
ds = Dataset_load(df_test)
test_model(myModel,ds)

465
24 441


(0.853763440860215, 0.9002267573696145, 0.0)