<a href="https://colab.research.google.com/github/geraldmc/torch-draft-final_project/blob/main/load_deepweeds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os 
import time
from datetime import datetime
import glob 
import shutil
import copy
from zipfile import ZipFile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torch.utils.data import RandomSampler, random_split
from torch.utils.data import SubsetRandomSampler, WeightedRandomSampler
from torchvision.datasets import ImageFolder

### Download the code from GitHub

In [1]:
import os

if os.path.isfile("../main.zip"):
  print ('Have already downloaded the project file, continuing...')
  print()
else:
  print ('Downloading file...')
  ! wget https://github.com/geraldmc/torch-draft-final_project/archive/refs/heads/main.zip
  ! unzip -qq main.zip
  %cd torch-draft-final_project-main

Downloading file...
--2022-03-29 22:06:09--  https://github.com/geraldmc/torch-draft-final_project/archive/refs/heads/main.zip
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/geraldmc/torch-draft-final_project/zip/refs/heads/main [following]
--2022-03-29 22:06:09--  https://codeload.github.com/geraldmc/torch-draft-final_project/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 52.193.111.178
Connecting to codeload.github.com (codeload.github.com)|52.193.111.178|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [ <=>                ] 526.76K  2.65MB/s    in 0.2s    

2022-03-29 22:06:11 (2.65 MB/s) - ‘main.zip’ saved [539403]

/content/torch-draft-final_project-main


In [None]:
# Project-local modules; they only resolve once the working directory is the
# unzipped project root (see the download cell).
try:
  import conf.params as params
  from data import transforms as tsf
except ImportError as err:
  # Keep the notebook running, but say why later cells that use `params` or
  # `tsf` will fail instead of hiding the import error.
  print(f"WARNING: could not import project modules ({err}). "
        "Run the download cell first so the working directory is the project root.")

### 1) Combine all train, test, and val files, then randomly sample from the combined dataframes.

In [None]:
import glob 

# Glob patterns for the per-split label CSVs under data/labels/.
joined_val = os.path.join("data/", "labels/", "val*.csv")
joined_train = os.path.join("data/", "labels/", "train*.csv")
joined_test = os.path.join("data/", "labels/", "test*.csv")

# glob.glob() returns matches in a filesystem-dependent order; sort so the
# row order of the concatenated frames is the same on every machine and run.
val_files = sorted(glob.glob(joined_val))
train_files = sorted(glob.glob(joined_train))
test_files = sorted(glob.glob(joined_test))

# One frame per split, built from every matching CSV with a fresh 0..n-1 index.
train_df = pd.concat([pd.read_csv(csv_path) for csv_path in train_files], ignore_index=True)
val_df = pd.concat([pd.read_csv(csv_path) for csv_path in val_files], ignore_index=True)
test_df = pd.concat([pd.read_csv(csv_path) for csv_path in test_files], ignore_index=True)

# In the paper, each fold contains 10,505 samples from the total
def sample_data(train_sample_no, val_sample_no, test_sample_no, random_state=None):
    """Draw random row samples from the module-level split dataframes.

    Parameters
    ----------
    train_sample_no, val_sample_no, test_sample_no : int
        Number of rows to draw (without replacement) from ``train_df``,
        ``val_df`` and ``test_df`` respectively.
    random_state : optional
        Passed through to ``DataFrame.sample`` for each split. The default
        ``None`` keeps the original unseeded behaviour; pass an int to make
        the draw reproducible.

    Returns
    -------
    tuple of DataFrame
        ``(train, val, test)`` samples, in that order.

    Raises
    ------
    ValueError
        Raised by pandas when a requested size exceeds the rows available.
    """
    train = train_df.sample(n=train_sample_no, random_state=random_state)
    val = val_df.sample(n=val_sample_no, random_state=random_state)
    test = test_df.sample(n=test_sample_no, random_state=random_state)
    return train, val, test

### 2) Copy files to their respective directories, for ImageFolder.

In [None]:
import shutil

# Base names of every file found anywhere beneath params.IMAGE_PATH
# (directory components are discarded).
files = [
    name
    for _root, _subdirs, names in os.walk(params.IMAGE_PATH)
    for name in names
]

def copy_files(df, filepath):
  """Copy labelled images into per-label sub-directories of ``filepath``.

  For every name in the module-level ``files`` list that also appears in
  ``df.Filename``, the file is copied from ``params.IMG_DIRECTORY`` to
  ``<filepath>/<label>/<name>``. Names with no row in ``df`` are skipped.

  Parameters
  ----------
  df : DataFrame
      Must expose ``Filename`` and ``Label`` columns.
  filepath : str
      Root directory that receives one sub-directory per label.
  """
  labels = dict(zip(df.Filename, df.Label))
  for f in files:
      # Files that are not part of this split are ignored.
      if f not in labels:
          continue
      src = os.path.join(params.IMG_DIRECTORY, f)
      dst_dir = os.path.join(filepath, str(labels[f]))
      # copyfile() does not create missing parent directories.
      os.makedirs(dst_dir, exist_ok=True)
      shutil.copyfile(src, os.path.join(dst_dir, f))

# Sample sizes are the same as in the paper; reuse sample_data() rather than
# repeating the three .sample() calls.
sample_train_df, sample_val_df, sample_test_df = sample_data(10505, 3502, 3502)

# `label_df` was referenced here without ever being defined in this notebook,
# which raises NameError on a fresh kernel. Build it from every labelled row
# that was loaded, one row per file.
# NOTE(review): assumes the union of the train/val/test CSVs covers all
# images and that each CSV has a `Filename` column — confirm against the
# original labels file if one exists.
label_df = pd.concat(
    [train_df, val_df, test_df], ignore_index=True
).drop_duplicates(subset="Filename")

copy_files(sample_train_df, params.IMG_TRAIN_PATH)
copy_files(sample_val_df, params.IMG_VAL_PATH)
copy_files(label_df, params.IMG_CLASSES) # this holds all unsegregated files.

### 3) Instantiate the data loaders for this k-fold.

In [None]:
from data import transforms as tsf
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision.datasets import ImageFolder

# Transforms applied to each image folder, in the order the resulting
# datasets are concatenated.
_AUGMENT_TRANSFORMS = (
    tsf.base_transform,
    tsf.data_translate,
    tsf.data_grayscale,
    tsf.data_rotate,
    tsf.data_jitter_hue,
)


def _make_augmented_loader(root, shuffle):
    """Build a DataLoader over ``root`` read once per augmentation transform.

    The same image folder is wrapped in one ``ImageFolder`` per entry of
    ``_AUGMENT_TRANSFORMS`` and the results are concatenated, so the loader
    yields five views of every image.

    Parameters
    ----------
    root : str
        Directory laid out for ``ImageFolder`` (one sub-directory per class).
    shuffle : bool
        Whether the loader reshuffles the data each epoch.
    """
    dataset = ConcatDataset(
        [ImageFolder(root, transform=transform) for transform in _AUGMENT_TRANSFORMS]
    )
    return DataLoader(
        dataset,
        batch_size=32,
        shuffle=shuffle,
        num_workers=2,
        # Pinned host memory is only requested when a GPU is present.
        pin_memory=torch.cuda.is_available(),
    )


# Each training dataset contains 8382 x 5 images.
train_loader = _make_augmented_loader(params.IMG_TRAIN_PATH, shuffle=True)

# Each validation dataset contains 3251 x 5 images.
val_loader = _make_augmented_loader(params.IMG_VAL_PATH, shuffle=False)

dataloaders_augment = {}
dataloaders_augment['train'] = train_loader
dataloaders_augment['val'] = val_loader

In [None]:
import os

def delete_class_files(path):
  """Delete every regular file directly inside ``path``.

  Sub-directories and their contents are left untouched.

  Parameters
  ----------
  path : str
      Directory to clean, with or without a trailing path separator.
  """
  for file_name in os.listdir(path):
      # os.path.join inserts the separator when `path` lacks one; plain
      # string concatenation produced a non-existent path in that case.
      file = os.path.join(path, file_name)
      if os.path.isfile(file):
          os.remove(file)