<a href="https://colab.research.google.com/github/geraldmc/torch-draft-final_project/blob/main/load_deepweeds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## K-FOLDS

### Download the code from Github

In [1]:
import os

# Fetch and unpack the project archive unless "../main.zip" already exists.
# NOTE(review): the check looks one directory ABOVE the working directory,
# while wget saves main.zip INTO the working directory. The two only line up
# after the `%cd` below has moved into the unpacked project — confirm this is
# the intended re-run guard.
if os.path.isfile("../main.zip"):
  print ('Have already downloaded the project file, continuing...')
  print()
else:
  print ('Downloading file...')
  # Download the archive of the `main` branch, unpack it quietly, then make
  # the unpacked project root the working directory for all later cells.
  ! wget https://github.com/geraldmc/torch-draft-final_project/archive/refs/heads/main.zip
  ! unzip -qq main.zip
  %cd torch-draft-final_project-main

Downloading file...
--2022-03-29 22:06:09--  https://github.com/geraldmc/torch-draft-final_project/archive/refs/heads/main.zip
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/geraldmc/torch-draft-final_project/zip/refs/heads/main [following]
--2022-03-29 22:06:09--  https://codeload.github.com/geraldmc/torch-draft-final_project/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 52.193.111.178
Connecting to codeload.github.com (codeload.github.com)|52.193.111.178|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [ <=>                ] 526.76K  2.65MB/s    in 0.2s    

2022-03-29 22:06:11 (2.65 MB/s) - ‘main.zip’ saved [539403]

/content/torch-draft-final_project-main


### Import the project.

In [None]:
try:
  import conf.params as params
except ImportError as err:
  # Every later cell reads its settings from `params`; swallowing the failure
  # here would only postpone it to a less informative NameError further down.
  raise ImportError(
      "Could not import conf.params; run the download cell first so that the "
      "working directory is the unpacked project root."
  ) from err

### Download the dataset from Drive.

In [3]:
from google.colab import drive

# Mount Google Drive under /content/gdrive.
drive.mount('/content/gdrive')
print("Downloading DeepWeeds images to " + params.IMG_ZIP_FILE)
# Copy the image archive from params.GD_ZIP_IMG (presumably a path under the
# mounted Drive — confirm in conf/params) to params.IMG_ZIP_FILE.
!cp '{params.GD_ZIP_IMG}' '{params.IMG_ZIP_FILE}'
print()
# List the copied archive so its presence and size show up in the cell output.
!ls -lart {params.IMG_ZIP_FILE}

Mounted at /content/gdrive
Downloading DeepWeeds images to /content/torch-draft-final_project-main/data/images.zip

-rw------- 1 root root 491516047 Mar 29 22:07 /content/torch-draft-final_project-main/data/images.zip


### Unzip the data files.

In [None]:
from zipfile import ZipFile

# Announce the destination, then unpack the whole image archive into it.
print("Unzipping DeepWeeds images into " + params.IMG_DIRECTORY)

with ZipFile(params.IMG_ZIP_FILE, mode="r") as archive:
    archive.extractall(path=params.IMG_DIRECTORY)

# Report how many entries the image directory holds after extraction.
img_list = os.listdir(params.IMG_DIRECTORY)
print(len(img_list))

In [7]:
# Directory holding the label CSVs: labels.csv plus the per-fold
# train/val/test subset files listed below.
LABEL_PATH = os.path.join(params.DATA_PATH, 'labels')
!ls {LABEL_PATH}

labels.csv	  test_subset3.csv   train_subset2.csv	val_subset1.csv
test_subset0.csv  test_subset4.csv   train_subset3.csv	val_subset2.csv
test_subset1.csv  train_subset0.csv  train_subset4.csv	val_subset3.csv
test_subset2.csv  train_subset1.csv  val_subset0.csv	val_subset4.csv


In [8]:
import pandas as pd
# Load the master label table (columns: Filename, Label, Species).
label_df = pd.read_csv(os.path.join(LABEL_PATH, 'labels.csv'))
# End the cell on the frame itself so it is rendered as the cell output;
# a trailing `None` displays nothing.
label_df

Unnamed: 0,Filename,Label,Species
0,20160928-140314-0.jpg,0,Chinee apple
1,20160928-140337-0.jpg,0,Chinee apple
2,20160928-140731-0.jpg,0,Chinee apple
3,20160928-140747-0.jpg,0,Chinee apple
4,20160928-141107-0.jpg,0,Chinee apple
...,...,...,...
17504,20180322-133822-1.jpg,8,Negative
17505,20180322-133832-1.jpg,8,Negative
17506,20180322-133840-1.jpg,8,Negative
17507,20180322-133850-1.jpg,8,Negative


## To run k-folds

##### 0) Combine train, test, val files.

    1) Copy files to their respective directories (for ImageFolder).
    2) Instantiate data loaders for each k-fold.
    3) Init a new ResNet50 model for each k-fold.
    4) Get/set the parameters to be optimized/updated for each k-fold.
    5) Train the model for each k-fold. Save best model.
    6) Delete contents of the train/val directories.
    7) Repeat steps 1-6 for the next fold.

In [None]:
# 1) Combine train, test, val files. Random sample from the combined dataframes.

import glob 
import shutil

# Glob patterns matching the per-fold label CSVs of each split.
joined_val = os.path.join("data/", "labels/", "val*.csv")
joined_train = os.path.join("data/", "labels/", "train*.csv")
joined_test = os.path.join("data/", "labels/", "test*.csv")

# glob returns matches in an arbitrary, filesystem-dependent order; sort them
# so the row order of the combined frames is the same on every run.
val_files = sorted(glob.glob(joined_val))
train_files = sorted(glob.glob(joined_train))
test_files = sorted(glob.glob(joined_test))

# Stack the CSVs of each split into a single frame with a fresh 0..n-1 index.
train_df = pd.concat(map(pd.read_csv, train_files), ignore_index=True)
val_df = pd.concat(map(pd.read_csv, val_files), ignore_index=True)
test_df = pd.concat(map(pd.read_csv, test_files), ignore_index=True)

def sample_data(train_sample_no, val_sample_no, test_sample_no, random_state=None):
    """Draw a random subset of rows from each combined split frame.

    Args:
        train_sample_no: number of rows to draw from ``train_df``.
        val_sample_no: number of rows to draw from ``val_df``.
        test_sample_no: number of rows to draw from ``test_df``.
        random_state: optional seed forwarded to every ``DataFrame.sample``
            call so a fold can be reproduced. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        A ``(train, val, test)`` tuple of sampled DataFrames.
    """
    train = train_df.sample(n=train_sample_no, random_state=random_state)
    val = val_df.sample(n=val_sample_no, random_state=random_state)
    test = test_df.sample(n=test_sample_no, random_state=random_state)
    return train, val, test

def get_file_list():
    """Return the bare names of all files found under ``params.IMAGE_PATH``.

    The tree is walked recursively with ``os.walk``; only the file names are
    collected (without their directory), in the order the walk yields them.
    """
    return [
        name
        for _root, _subdirs, names in os.walk(params.IMAGE_PATH)
        for name in names
    ]

def copy_files(df, filepath):
  """Copy the images listed in ``df`` into per-label folders under ``filepath``.

  Each image is read from ``params.IMG_DIRECTORY`` and written to
  ``<filepath>/<label>/<filename>``.

  Args:
      df: frame with ``Filename`` and ``Label`` columns. When a filename occurs
          more than once, the label of its last occurrence is used.
      filepath: root directory that receives one sub-directory per label.

  Rows whose image is not present in ``params.IMG_DIRECTORY`` are skipped.
  """
  labels = dict(zip(df.Filename, df.Label))
  # Walk the frame's own filenames; the previous loop iterated a `files` name
  # that this function never defined.
  for f, label in labels.items():
      src = os.path.join(params.IMG_DIRECTORY, f)
      if not os.path.isfile(src):
          continue
      class_dir = os.path.join(filepath, str(label))
      # copyfile does not create missing parent directories.
      os.makedirs(class_dir, exist_ok=True)
      shutil.copyfile(src, os.path.join(class_dir, f))

In [None]:
# 2) Copy files to their respective directories, for ImageFolder.

# Draw this fold's random subsets: 10,505 training rows and 3,502 rows each
# for validation and test.
sample_train_df, sample_val_df, sample_test_df = sample_data(10505, 3502, 3502)

# Copy the sampled train/val images into their per-label folders.
copy_files(sample_train_df, params.IMG_TRAIN_PATH)
copy_files(sample_val_df, params.IMG_VAL_PATH)
# label_df is the full label table read from labels.csv (not split by fold).
copy_files(label_df, params.IMG_CLASSES)

In [None]:
# 3) Instantiate data loaders for one k-fold.

from data import transforms as tsf
import torch

from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision.datasets import ImageFolder

# ImageFolder datasets over the train and val image roots, both using the
# base transform, each wrapped in a shuffling loader of 32 images per batch.
train_data = ImageFolder(root=params.IMG_TRAIN_PATH, transform=tsf.base_transform)
train_loader = DataLoader(
    dataset=train_data, batch_size=32, shuffle=True, num_workers=2)

val_data = ImageFolder(root=params.IMG_VAL_PATH, transform=tsf.base_transform)
val_loader = DataLoader(
    dataset=val_data, batch_size=32, shuffle=True, num_workers=2)

print("Initializing Datasets and Dataloaders...")

# One ImageFolder per split, rooted at <DATA_PATH>/<split> and paired with the
# transform registered for that split.
image_datasets = {
    split: ImageFolder(
        root=os.path.join(params.DATA_PATH, split),
        transform=tsf.paired_transforms[split],
    )
    for split in ('train', 'val')
}

# Matching shuffling loaders, keyed by split name.
dataloaders_dict = {
    split: DataLoader(
        dataset,
        batch_size=params.BATCH_SIZE,
        shuffle=True,
        num_workers=2,
    )
    for split, dataset in image_datasets.items()
}

In [None]:
# 4) Init a new ResNet50 model.
import torch.nn as nn


def set_parameter_requires_grad(model, feature_extracting):
    """Freeze all of ``model``'s parameters when ``feature_extracting`` is truthy.

    With a falsy flag the model is left untouched. Returns ``None``.
    """
    if not feature_extracting:
        return
    for weight in model.parameters():
        weight.requires_grad = False

def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    """Build a ResNet50 whose final layer outputs ``num_classes`` scores.

    Args:
        model_name: architecture to build; only ``"resnet50"`` is supported.
        num_classes: output size of the replacement ``fc`` layer.
        feature_extract: when truthy, every loaded parameter is frozen before
            the new ``fc`` layer is attached.
        use_pretrained: forwarded to ``torch.hub.load`` as ``pretrained``.

    Returns:
        ``(model, input_size)`` where ``input_size`` is 224.

    Raises:
        ValueError: if ``model_name`` is not ``"resnet50"``.
    """
    if model_name != "resnet50":
        # Raise instead of calling exit(), so callers can handle the error.
        raise ValueError(
            "Invalid model name: {!r} (only 'resnet50' is supported)".format(model_name))

    # Honour the caller's choice rather than always requesting pretrained weights.
    model_ft = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50',
                              pretrained=use_pretrained)
    set_parameter_requires_grad(model_ft, feature_extract)

    # Swap the classifier head. The new layer is created after the freeze
    # above, so it is not affected by it.
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, num_classes)
    input_size = 224

    return model_ft, input_size


def init_model():
    """Create the ResNet50 used for one fold.

    Builds the model through ``initialize_model`` with ``params.NUM_CLASSES``
    outputs, feature extraction enabled and pretrained weights requested,
    then moves it to CUDA when CUDA is available.

    Returns:
        ``(model, input_size)`` exactly as produced by ``initialize_model``.
    """
    net, in_size = initialize_model(
        'resnet50', params.NUM_CLASSES,
        feature_extract=True, use_pretrained=True)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        net.to('cuda')

    return net, in_size

In [None]:
#5) Get/set the parameters to be optimized/updated for each k-fold.

import torch.optim as optim

def get_parameters(model, features):
    """Create the optimizer and LR scheduler for ``model``.

    Prints the name of every parameter whose ``requires_grad`` is set.

    Args:
        model: the network to optimise.
        features: when truthy, only parameters with ``requires_grad`` set are
            handed to the optimizer; otherwise all of ``model.parameters()``.

    Returns:
        ``(opt, sch)``: an Adam optimizer (lr=1e-3) and a ReduceLROnPlateau
        scheduler (patience=16, factor=0.5, min_lr=0.000003125) bound to it.
    """
    print("Params to learn:")
    trainable = [
        (name, weight)
        for name, weight in model.named_parameters()
        if weight.requires_grad
    ]
    for name, _ in trainable:
        print("\t", name)

    if features:
        params_to_update = [weight for _, weight in trainable]
    else:
        params_to_update = model.parameters()

    opt = optim.Adam(params_to_update, lr=1e-3)
    sch = optim.lr_scheduler.ReduceLROnPlateau(
        opt, patience=16, factor=0.5, min_lr=0.000003125)

    return opt, sch


In [None]:
import time
import copy


def train_model(model, dataloaders, criterion, optimizer, num_epochs):
    """Train ``model`` and keep the weights with the best validation accuracy.

    NOTE: an identical definition of ``train_model`` appears again in a later
    cell of this notebook and replaces this one once both cells have run.

    Args:
        model: network to train; its mode is switched between train and eval
            for the two phases of every epoch.
        dataloaders: mapping with ``'train'`` and ``'val'`` loaders that yield
            ``(inputs, labels)`` batches.
        criterion: loss callable applied as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        num_epochs: number of epochs to run.

    Returns:
        ``(model, val_acc_history, val_loss_history, train_acc_history,
        train_loss_history)``. ``model`` carries the best-validation weights;
        each history list has one entry per epoch.
    """
    since = time.time()

    # lists to store per-epoch loss and accuracy values
    val_acc_history, val_loss_history = [], []
    train_acc_history, train_loss_history = [], []

    # Snapshot of the best weights seen so far (initially the starting weights).
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(params.DEVICE)
                labels = labels.to(params.DEVICE)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # Predicted class = index of the largest output along dim 1.
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                # Weight the batch loss by batch size so the epoch figure
                # below is a per-sample average.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
                val_loss_history.append(epoch_loss)
            elif phase == 'train':
                train_acc_history.append(epoch_acc)
                train_loss_history.append(epoch_loss)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history, val_loss_history, train_acc_history, train_loss_history

### Encapsulate Training and Reporting

In [None]:
import time
import copy


def train_model(model, dataloaders, criterion, optimizer, num_epochs):
    """Run the train/val loop and restore the best-validation weights.

    Args:
        model: network to train.
        dataloaders: mapping with ``'train'`` and ``'val'`` loaders yielding
            ``(inputs, labels)`` batches.
        criterion: loss callable applied as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        num_epochs: number of epochs to run.

    Returns:
        ``(model, val_acc_history, val_loss_history, train_acc_history,
        train_loss_history)`` with one history entry per epoch.
    """
    started = time.time()

    # Per-phase, per-epoch accuracy and loss records.
    history = {
        'train': {'acc': [], 'loss': []},
        'val': {'acc': [], 'loss': []},
    }

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # A training pass followed by a validation pass.
        for phase in ('train', 'val'):
            is_training = phase == 'train'
            if is_training:
                model.train()
            else:
                model.eval()

            loader = dataloaders[phase]
            loss_sum = 0.0
            n_correct = 0

            for inputs, labels in loader:
                inputs = inputs.to(params.DEVICE)
                labels = labels.to(params.DEVICE)

                optimizer.zero_grad()

                # Gradients are only tracked while training.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    preds = torch.max(outputs, 1)[1]

                    if is_training:
                        loss.backward()
                        optimizer.step()

                # Batch loss is weighted by batch size for a per-sample mean.
                loss_sum += loss.item() * inputs.size(0)
                n_correct += torch.sum(preds == labels.data)

            n_samples = len(loader.dataset)
            epoch_loss = loss_sum / n_samples
            epoch_acc = n_correct.double() / n_samples

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # Remember the weights of the best validation epoch so far.
            if not is_training and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

            history[phase]['acc'].append(epoch_acc)
            history[phase]['loss'].append(epoch_loss)

        print()

    elapsed = time.time() - started
    print('Training complete in {:.0f}m {:.0f}s'.format(elapsed // 60, elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return (model,
            history['val']['acc'], history['val']['loss'],
            history['train']['acc'], history['train']['loss'])

## 6) Train the model.

In [None]:
def run_kfold(model, dataloaders, crtierion, optimizer, epochs):
    """Train ``model`` for one fold and return it with its per-epoch histories.

    Args:
        model: network to train.
        dataloaders: mapping with ``'train'`` and ``'val'`` loaders.
        crtierion: loss function to optimise. The parameter name (misspelling
            included) is kept so keyword callers keep working. ``None`` selects
            ``nn.CrossEntropyLoss()``.
        optimizer: optimizer passed through to ``train_model``.
        epochs: number of epochs to train for.

    Returns:
        ``(model, val_acc, val_loss, train_acc, train_loss)`` as produced by
        ``train_model``.
    """
    # Use the loss the caller supplied; it used to be silently replaced.
    # The fallback is multi-class cross-entropy (not a binary loss).
    criterion = crtierion if crtierion is not None else nn.CrossEntropyLoss()
    model, val_acc, val_loss, train_acc, train_loss = train_model(
                                        model, dataloaders,
                                        criterion, optimizer,
                                        num_epochs=epochs)

    return model, val_acc, val_loss, train_acc, train_loss

### 7) Delete files from their respective directories, start again on the next fold.

In [None]:
import os

def delete_class_files(path):
  """Delete every regular file directly inside ``path``.

  Sub-directories and their contents are left untouched.

  Args:
      path: directory to clear, with or without a trailing separator.
  """
  for file_name in os.listdir(path):
      # Join with a separator; plain concatenation built a wrong path whenever
      # `path` had no trailing slash, so nothing was removed.
      file = os.path.join(path, file_name)
      if os.path.isfile(file):
          os.remove(file)

### Print Stats

In [None]:
# Convert tensor objects to lists
val_acc_record = [val_acc[x].item() for x in range(len(val_acc))]
val_loss_record = [val_loss[x] for x in range(len(val_loss))]
train_acc_record = [train_acc[x].item() for x in range(len(train_acc))]
train_loss_record = [train_loss[x] for x in range(len(train_loss))]

# Accuracy plots
plt.figure(figsize=(8, 6))
plt.plot(train_acc_record, color='green', label='train acc')
plt.plot(val_acc_record, color='blue', label='val acc')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
#plt.savefig(f"../outputs/{acc_plot_name}.png")
plt.show()

# Loss plots
plt.figure(figsize=(8, 6))
plt.plot(train_loss_record, color='orange', label='train loss')
plt.plot(val_loss_record, color='red', label='val loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
#plt.savefig(f"../outputs/{loss_plot_name}.png")
plt.show()

# Train acc versus loss
plt.figure(figsize=(8, 6))
plt.plot(train_acc_record, color='blue', label='train acc')
plt.plot(train_loss_record, color='green', label='train loss')
plt.xlabel('Epochs')
plt.ylabel('Accuracy/Loss')
plt.legend()
#plt.savefig(f"../outputs/{acc_plot_name}.png")
plt.show()

# Val acc versus loss
plt.figure(figsize=(8, 6))
plt.plot(val_acc_record, color='red', label='val acc')
plt.plot(val_loss_record, color='orange', label='val loss')
plt.xlabel('Epochs')
plt.ylabel('Accuracy/Loss')
plt.legend()
#plt.savefig(f"../outputs/{acc_plot_name}.png")
plt.show()

### Functions to checkpoint and save/load models.

In [None]:
def checkpoint(epoch, nn, opt, LOSS, MODEL_PATH ):
    """Save a training checkpoint to ``MODEL_PATH``.

    Args:
        epoch: epoch number to record.
        nn: model whose ``state_dict`` is stored (this parameter shadows the
            ``torch.nn`` alias inside the function).
        opt: optimizer whose ``state_dict`` is stored.
        LOSS: loss value to record.
        MODEL_PATH: destination passed to ``torch.save``.
    """
    # Use the function's own arguments; the body used to reference the
    # undefined names `EPOCH` and `optimizer`.
    torch.save({
                'epoch': epoch,
                'model_state_dict': nn.state_dict(),
                'optimizer_state_dict': opt.state_dict(),
                'loss': LOSS,
                }, MODEL_PATH)

def print_states(m, opt):
    """Print the entries of the model's and the optimizer's state_dict.

    For the model, each entry's name and tensor size is printed; for the
    optimizer, each key and its value. Returns ``None``.
    """
    print("Model's state_dict:")
    for key, tensor in m.state_dict().items():
        print(key, "\t", tensor.size())

    print("Optimizer's state_dict:")
    for key, value in opt.state_dict().items():
        print(key, "\t", value)

def save_model(m):
    """Write ``m``'s state_dict to ``model.pth`` inside ``params.OUTPUT_PATH``."""
    weights = m.state_dict()
    target = os.path.join(params.OUTPUT_PATH, 'model.pth')
    torch.save(weights, target)

def load_model(m=None):
    """Load the weights written by ``save_model`` into a model and return it.

    Args:
        m: model to receive the weights. When omitted, the notebook-level
            ``model`` variable is used, as before.

    Returns:
        The model that received the weights.
    """
    if m is None:
        m = model
    # Read the same file save_model writes; OUTPUT_PATH itself is the
    # directory that file lives in.
    weights_file = os.path.join(params.OUTPUT_PATH, 'model.pth')
    m.load_state_dict(torch.load(weights_file))
    return m

In [None]:
''' GUTTER ------ 
'''