# Code to split the training data

It is easier to zip the content and upload it onto google drive instead of directly uploading the images.

In [12]:
%cd '/content/drive/MyDrive/data/fruits_360'

/content/drive/MyDrive/data/fruits_360


In [13]:
%ls -l

total 412667
drwx------   2 root root      4096 Sep 11 20:33 [0m[01;34mTest[0m/
-rw-------   1 root root 102891327 Sep 11 16:12 Test.zip
drwx------ 129 root root      4096 Sep 11 16:09 [01;34mTraining[0m/
-rw-------   1 root root   5202043 Sep 11 20:33 Training.csv
-rw-------   1 root root 313167483 Sep 11 16:04 Training.zip
-rw-------   1 root root   1300178 Sep 11 20:33 Validation.csv


In [14]:
# !unzip Test.zip
src = '/content/drive/MyDrive/data/fruits_360'
import os

filenames = os.listdir(src + "/Test")
len(filenames)

96

In [15]:
import os
import pandas as pd

main_dir = '/content/drive/MyDrive/data/fruits_360/'
model_dir = '/content/drive/MyDrive/models/'

In [16]:
#This piece of code was used to extract the class names for the image
#Would just need to split the refer to the correct folder to generate the right
#type of data. Example change field data_type to 'Train'

data_type = 'Test'
df = pd.DataFrame(columns=["filename", "filepath","class_name", "class"])
folder_names = os.listdir(main_dir + data_type)

class_names = []
idx = -1
for folder_name in folder_names:
  class_name = folder_name.replace(" ","_")
  filenames = os.listdir(main_dir + data_type + "/" + folder_name)

  if class_name not in class_names:
    class_names.append(class_name)
    idx += 1

  for filename in filenames:
    row = [filename, 
           main_dir + data_type + "/" + folder_name + "/" + filename,
           class_name,
           idx]

    df.loc[len(df)] = row

In [None]:
#For training and validation data

#Shuffle the training dataset
df = df.sample(frac=1).reset_index(drop=True)

#Split the training dataset into train and validation
train_df = df[:52208]
val_df = df[52208:]

train_df.to_csv(main_dir + "Training.csv", index=False)
val_df.to_csv(main_dir + "Validation.csv", index=False)

In [17]:
#For testing data
df.to_csv(main_dir + "Test.csv", index=False)

# Setting the model architecture 

In [1]:
#Hacked version of resnet with Gropu Normalization 
#Will need to rework this to a better solution
import sys
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
import pickle
import time
from PIL import Image
from sklearn.metrics import recall_score, precision_score
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
main_dir = '/content/drive/MyDrive/data/fruits_360/'
model_dir = '/content/drive/MyDrive/models/'

# sys.path.append('/content/drive/My Drive/models')
# from resnet import resnet50

In [30]:
net = models.resnet50()
# net.load_state_dict(torch.load(model_dir + "resnet50.pth"))

in_features = net.fc.in_features
net.fc = nn.Linear(in_features, 131)

net.load_state_dict(torch.load(model_dir + 'res50_fruit_ep2.pth' ))

ce_loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)

net = net.to(device)

In [None]:
# for name, parameters in net.named_parameters():
#   print(name + " " + str(parameters.requires_grad))

# Set the Dataset and Dataloaders

In [7]:

class FruitDataset(Dataset):

    def __init__(self, csv_file, root_dir, transforms=None):
        self.root_dir = root_dir
        self.csv_file = pd.read_csv(main_dir + csv_file)
        self.transforms = transforms
        self.c = 131

    def __len__(self):
        return len(self.csv_file)

    def __getitem__(self, idx):
        image_det = self.csv_file.iloc[idx]
        image = Image.open(image_det['filepath'])
        label = torch.tensor(image_det['class'])

        if self.transforms:
            image = self.transforms(image)

        return image, label


In [19]:
train_data_transforms = transforms.Compose([ transforms.Resize((224, 224)),
                                              transforms.RandomRotation((-30, 30)),
                                              transforms.RandomHorizontalFlip(),
                                              transforms.ToTensor(),
                                              transforms.Normalize(
                                                  mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                                              ])


test_data_transforms = transforms.Compose([ transforms.Resize((224, 224)),
                                              transforms.ToTensor(),
                                              transforms.Normalize(
                                                  mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                                              ])


train_data_set = FruitDataset('Training.csv', main_dir, train_data_transforms)
val_data_set = FruitDataset('Validation.csv', main_dir, train_data_transforms)
test_data_set = FruitDataset('Test.csv', main_dir, test_data_transforms)

batch_data_loader = {'Train' : DataLoader(train_data_set, batch_size=128, shuffle=True),
                     'Val' : DataLoader(val_data_set, batch_size=32),
                     'Test' : DataLoader(val_data_set, batch_size=32)}

data_sizes = {'Train' : len(train_data_set), 
              'Val' : len(val_data_set),
              'Test' : len(test_data_set)}

print(data_sizes)
print(device)

{'Train': 52208, 'Val': 13050, 'Test': 13695}
cuda


# Functions to train, test and provide evaluation results

In [28]:
def evaluate(phase, running_loss, running_corrects, targets, pred_labels):

  print(flush=True)

  epoch_loss = running_loss / data_sizes[phase]
  epoch_accuracy = running_corrects.double() / data_sizes[phase]
  recall = recall_score(targets, pred_labels, pos_label=0, average='micro')
  precision = precision_score(targets, pred_labels, pos_label=0, average='micro')

  print("{} Loss :{}, Recall : {}, Precision : {}"
        .format(phase, epoch_loss, recall, precision), flush=True)
      


#Training loop
def train_one_epoch(phase, model, train_dataloader):

    running_loss = 0.0
    running_corrects = 0.0
    pred_labels = []
    targets = []

    iterations = int(data_sizes[phase] / train_dataloader.batch_size)

    for batch_idx, batch_data in enumerate(train_dataloader):

        sys.stdout.write('\r')
        sys.stdout.write("{} Iteration :{}/{}"
                          .format(phase, batch_idx + 1, iterations))

        inputs = batch_data[0]
        # Pytorch autograd seems to work only with float labels not int values
        labels = batch_data[1]
        # labels = labels.type(torch.LongTensor)

        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()  # Don't want gradients to accumulate

        with torch.set_grad_enabled(phase == 'Train'):
            outputs = model(inputs)  # Outputs are probabilities

            loss = ce_loss(outputs, labels)  # Calculate loss
            y_pred_softmax = torch.log_softmax(outputs, dim=1)
            _, pred = torch.max(y_pred_softmax, dim=1)

            loss.backward()  # Calculate gradient for trainable each node
            optimizer.step()  # Update weights with gradient

        # Loss per batch is accumulated
        running_loss += loss.item() * inputs.size(0)
        #For accuracy
        running_corrects += torch.sum(pred == labels)

        pred_labels += pred.detach().cpu().numpy().tolist()
        targets +=  labels.detach().cpu().numpy().tolist()


    # evaluate(running_loss, running_corrects, targets, pred_labels)
    return model



def test_model(phase, model, dataloader):

  running_loss = 0.0
  running_corrects = 0.0
  pred_labels = []
  targets = []

  iterations = int(data_sizes[phase] / dataloader.batch_size)

  for batch_idx, batch_data in enumerate(dataloader):

      sys.stdout.write('\r')
      sys.stdout.write("{} Iteration :{}/{}"
                        .format(phase, batch_idx + 1, iterations))


      inputs = batch_data[0]
      # Pytorch autograd seems to work only with float labels not int values
      labels = batch_data[1]
      labels = labels.type(torch.LongTensor)

      inputs = inputs.to(device)
      labels = labels.to(device)
          

      with torch.set_grad_enabled(phase == 'Train'):
          outputs = model(inputs)  # Outputs are probabilities

          loss = ce_loss(outputs, labels)  # Calculate loss
          y_pred_softmax = torch.log_softmax(outputs, dim=1)
          _, pred = torch.max(y_pred_softmax, dim=1)


      # Loss per batch is accumulated
      running_loss += loss.item() * inputs.size(0)
      #For accuracy
      running_corrects += torch.sum(pred == labels)

      pred_labels += pred.detach().cpu().numpy().tolist()
      targets +=  labels.detach().cpu().numpy().tolist()


  evaluate(phase, running_loss, running_corrects, targets, pred_labels)



In [31]:
# Model was trained for three epochs on three days separately.

no_epochs = 1

for epoch in range(no_epochs):
    since = time.time()
    print("Epoch : {}/{}".format(epoch + 1, no_epochs), flush=True)

    for phase in ['Train', 'Val']:
        print(phase + " begins")
        if phase == 'Train':
            net.train()
            net = train_one_epoch(phase, net, batch_data_loader[phase])
        else:  # For validation
            net.eval()
            test_model(phase, net, batch_data_loader[phase])


    torch.save(net.state_dict(), model_dir + 'res50_fruit_ep3.pth')

    time_elapsed = time.time() - since
    print("Time elapsed in {}".format(time_elapsed), flush=True)

    print("-" * 10)

Epoch : 1/1
Train begins
Train Iteration :408/407Val begins
Val Iteration :408/407
Val Loss :0.020316148123122265, Recall : 1.0, Precision : 1.0




Time elapsed in 767.0186641216278
----------


In [33]:
phase = "Test"
net.eval()
test_model(phase, net, batch_data_loader[phase])

Test Iteration :408/427
Test Loss :0.019569517374473926, Recall : 0.9999233716475096, Precision : 0.9999233716475096


