<a href="https://colab.research.google.com/github/geraldmc/torch-draft-final_project/blob/main/concat_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os 
import time
from datetime import datetime
import glob 
import shutil
import copy
from zipfile import ZipFile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torch.utils.data import RandomSampler, random_split
from torch.utils.data import SubsetRandomSampler, WeightedRandomSampler
from torchvision.datasets import ImageFolder

### Download the code from GitHub

In [2]:
import os

# Fetch and unpack the project repository unless its archive is already there.
# The check looks one directory up because a successful first run ends with
# `%cd` into the unpacked project, which leaves main.zip in the parent directory.
if os.path.isfile("../main.zip"):
  print ('Have already downloaded the project file, continuing...')
  print()
else:
  print ('Downloading file...')
  # wget saves main.zip into the current directory; unzip -qq unpacks it quietly.
  ! wget https://github.com/geraldmc/torch-draft-final_project/archive/refs/heads/main.zip
  ! unzip -qq main.zip
  # Later cells use paths relative to the project root (e.g. data/labels/).
  %cd torch-draft-final_project-main

Downloading file...
--2022-04-23 15:45:54--  https://github.com/geraldmc/torch-draft-final_project/archive/refs/heads/main.zip
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/geraldmc/torch-draft-final_project/zip/refs/heads/main [following]
--2022-04-23 15:45:54--  https://codeload.github.com/geraldmc/torch-draft-final_project/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 140.82.114.10
Connecting to codeload.github.com (codeload.github.com)|140.82.114.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [  <=>               ]   3.01M  9.62MB/s    in 0.3s    

2022-04-23 15:45:55 (9.62 MB/s) - ‘main.zip’ saved [3161009]

/content/torch-draft-final_project-main


In [3]:
# Project configuration module (paths such as IMG_ZIP_FILE, IMG_DIRECTORY,
# DATA_PATH). It is importable only when the working directory is the
# unpacked project root, i.e. after the download cell has run.
try:
  import conf.params as params
except ImportError as import_error:
  # Do not swallow the failure silently: the following cells all read `params`
  # and would otherwise fail with an unrelated-looking NameError.
  print("[WARN] Could not import conf.params (" + str(import_error) + "). "
        "Run the download cell first so the working directory is the project root.")

In [4]:
from google.colab import drive

# Mount Google Drive so the image archives stored there can be copied locally.
drive.mount('/content/gdrive')
print()
# Copy the DeepWeeds archive from params.GD_ZIP_IMG to params.IMG_ZIP_FILE,
# then list the copy to confirm it arrived and show its size.
print("Downloading DeepWeeds images to " + params.IMG_ZIP_FILE)
!cp '{params.GD_ZIP_IMG}' '{params.IMG_ZIP_FILE}'
print()
!ls -lart {params.IMG_ZIP_FILE}

print()
# Same two steps for the GAN archive (params.GD_ZIP_GAN -> params.GAN_ZIP_FILE).
print("Downloading GAN images to " + params.GAN_ZIP_FILE)
!cp '{params.GD_ZIP_GAN}' '{params.GAN_ZIP_FILE}'
print()
!ls -lart {params.GAN_ZIP_FILE}

Mounted at /content/gdrive

Downloading DeepWeeds images to data/images.zip

-rw------- 1 root root 491516047 Apr 23 15:46 data/images.zip

Downloading GAN images to data/gans.zip

-rw------- 1 root root 53284865 Apr 23 15:46 data/gans.zip


In [5]:
# Unpack the DeepWeeds archive into the image directory and report how many
# entries that directory holds afterwards.
images_target = params.IMG_DIRECTORY
print("[INFO] Unzipping DeepWeeds images into " + images_target)
with ZipFile(params.IMG_ZIP_FILE, mode="r") as images_archive:
    images_archive.extractall(path=images_target)
img_list = os.listdir(images_target)
print(len(img_list))

print()

# Unpack the GAN archive into the data directory and report the number of
# entries found under gans/train/0.
gans_target = params.DATA_PATH
print("[INFO] Unzipping GAN image dirs into " + gans_target)
with ZipFile(params.GAN_ZIP_FILE, mode="r") as gans_archive:
    gans_archive.extractall(path=gans_target)
gan_dir_list = os.listdir(gans_target + '/gans/train/0')
print(len(gan_dir_list))

[INFO] Unzipping DeepWeeds images into data/images
17510

[INFO] Unzipping GAN image dirs into data
750


### 1) Combine the train, val, and test label files into one DataFrame per split.

In [6]:
import glob 

# For each split: build the glob pattern under data/labels/, collect the
# matching CSV files, and stack them into one frame with a fresh 0..n-1 index.
joined_train = os.path.join("data/", "labels/", "train*.csv")
train_files = glob.glob(joined_train)
train_df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in train_files], ignore_index=True
)

joined_val = os.path.join("data/", "labels/", "val*.csv")
val_files = glob.glob(joined_val)
val_df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in val_files], ignore_index=True
)

joined_test = os.path.join("data/", "labels/", "test*.csv")
test_files = glob.glob(joined_test)
test_df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in test_files], ignore_index=True
)

# In the paper, each fold contains 10,505 samples from the total
def sample_data(train_sample_no, val_sample_no, test_sample_no, random_state=None):
    """Draw a random subset of rows from each combined label frame.

    Reads the notebook-level frames `train_df`, `val_df` and `test_df`.

    Args:
        train_sample_no: number of rows to draw from `train_df`.
        val_sample_no: number of rows to draw from `val_df`.
        test_sample_no: number of rows to draw from `test_df`.
        random_state: optional seed forwarded to `DataFrame.sample` for all
            three draws. The default (None) keeps the original unseeded,
            non-reproducible behaviour.

    Returns:
        A `(train, val, test)` tuple of sampled DataFrames.

    Raises:
        ValueError: raised by pandas when a requested count exceeds the
            number of rows available (sampling is without replacement).
    """
    train = train_df.sample(n=train_sample_no, random_state=random_state)
    val = val_df.sample(n=val_sample_no, random_state=random_state)
    test = test_df.sample(n=test_sample_no, random_state=random_state)
    return train, val, test

### 2) Copy files to their respective directories.

In [7]:
import shutil

# Bare file names of everything found beneath params.IMAGE_PATH, including
# sub-directories; the directory part of each path is discarded.
files = [
    name
    for _root, _subdirs, names in os.walk(params.IMAGE_PATH)
    for name in names
]

def copy_files(df, filepath):
  """Copy the images listed in `df` into per-label sub-directories of `filepath`.

  Walks the notebook-level `files` list and, for every name that also appears
  in `df.Filename`, copies `params.IMG_DIRECTORY/<name>` to
  `filepath/<label>/<name>`. Names without a label in `df` are skipped.

  Args:
    df: object exposing `Filename` and `Label` sequences of equal length. If a
      file name occurs more than once, the last label wins.
    filepath: destination root; the label sub-directory is created on demand.
  """
  labels = dict(zip(df.Filename, df.Label))
  for f in files:
    # Membership test instead of catching KeyError, so that errors raised by
    # the copy itself can never be mistaken for "file has no label".
    if f not in labels:
      continue
    src = os.path.join(params.IMG_DIRECTORY, f)
    label_dir = os.path.join(filepath, str(labels[f]))
    # copyfile does not create missing parent directories.
    os.makedirs(label_dir, exist_ok=True)
    shutil.copyfile(src, os.path.join(label_dir, f))

# Fixed seed so the sampled rows (and therefore the files copied below) are
# the same on every Restart & Run All.
SAMPLE_SEED = 42

# sample number same as paper.
sample_train_df = train_df.sample(n=10505, random_state=SAMPLE_SEED)
sample_val_df = val_df.sample(n=3502, random_state=SAMPLE_SEED)
sample_test_df = test_df.sample(n=3502, random_state=SAMPLE_SEED)

# Only the train and val samples are materialised on disk; the test sample is
# kept as a DataFrame.
# NOTE(review): copy_files keys its lookup by Filename, so rows that repeat a
# file name collapse to one copied image -- presumably why the folders end up
# holding fewer images than the sample sizes above; confirm.
copy_files(sample_train_df, params.IMG_TRAIN_PATH)
copy_files(sample_val_df, params.IMG_VAL_PATH)

In [8]:
# Install imgaug straight from its GitHub repository, replacing the release
# that is preinstalled in the runtime.
# NOTE(review): the install is unpinned, so the version may change between
# runs; consider pinning a tag or commit.
!pip install git+https://github.com/aleju/imgaug

Collecting git+https://github.com/aleju/imgaug
  Cloning https://github.com/aleju/imgaug to /tmp/pip-req-build-ug3k5ryq
  Running command git clone -q https://github.com/aleju/imgaug /tmp/pip-req-build-ug3k5ryq
Building wheels for collected packages: imgaug
  Building wheel for imgaug (setup.py) ... [?25l[?25hdone
  Created wheel for imgaug: filename=imgaug-0.4.0-py3-none-any.whl size=971122 sha256=822f823586c4626cff22e71a7b74ea78d43035b3c141f4dcd7d69a842bb95190
  Stored in directory: /tmp/pip-ephem-wheel-cache-gzze4xf6/wheels/24/09/69/f6547987407c2e85f9923e8e1189167fec80074ee5c6fe6ebd
Successfully built imgaug
Installing collected packages: imgaug
  Attempting uninstall: imgaug
    Found existing installation: imgaug 0.2.9
    Uninstalling imgaug-0.2.9:
      Successfully uninstalled imgaug-0.2.9
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumenta

In [9]:
from imgaug import augmenters as iaa
import imgaug as ia

In [10]:
class ImgAugTransform:
  """Callable wrapper that runs an imgaug pipeline on a single image."""

  def __init__(self):
    # Stages are created in the order they are applied.
    resize = iaa.Resize((224, 224))
    blur = iaa.Sometimes(0.25, iaa.GaussianBlur(sigma=(0, 3.0)))
    flip = iaa.Fliplr(0.5)
    rotate = iaa.Affine(rotate=(-20, 20), mode='symmetric')
    pixel_dropout = iaa.Dropout(p=(0, 0.1))
    coarse_dropout = iaa.CoarseDropout(0.1, size_percent=0.5)
    # One of the two dropout variants, applied to a quarter of the images.
    dropout = iaa.Sometimes(0.25, iaa.OneOf([pixel_dropout, coarse_dropout]))
    hue_saturation = iaa.AddToHueAndSaturation(value=(-10, 10), per_channel=True)

    self.aug = iaa.Sequential(
        [resize, blur, flip, rotate, dropout, hue_saturation]
    )

  def __call__(self, img):
    """Convert `img` to a NumPy array and return its augmented version."""
    pixels = np.array(img)
    return self.aug.augment_image(pixels)

# Single augmenter instance; it is passed as `transform` to every ImageFolder
# created for the train and val loaders.
transforms = ImgAugTransform()

### 3) Instantiate the data loaders.

In [11]:
#from data import transforms as tsf
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision.datasets import ImageFolder

# How many times each split's ImageFolder is repeated inside its ConcatDataset.
# All replicas read the same files through the same random `transforms`
# pipeline, so the concatenation is that many randomly-augmented passes over
# the split.
N_AUGMENTED_COPIES = 5


def _make_replicated_loader(root, shuffle):
  """Build a DataLoader over `N_AUGMENTED_COPIES` ImageFolder views of `root`.

  Args:
    root: directory containing one sub-directory per class.
    shuffle: whether the loader reshuffles the concatenated dataset each epoch.

  Returns:
    A DataLoader with batch size 1 over a ConcatDataset of the replicas.
  """
  replicas = [
      ImageFolder(root, transform=transforms)
      for _ in range(N_AUGMENTED_COPIES)
  ]
  return DataLoader(
      ConcatDataset(replicas),
      batch_size=1,
      shuffle=shuffle, num_workers=2,
      pin_memory=torch.cuda.is_available())


# Dataset length = (files currently in the split directory) x N_AUGMENTED_COPIES;
# the exact figures are printed below.
train_loader = _make_replicated_loader(params.IMG_TRAIN_PATH, shuffle=True)
val_loader = _make_replicated_loader(params.IMG_VAL_PATH, shuffle=False)

dataloaders_augment = {}
dataloaders_augment['train'] = train_loader
dataloaders_augment['val'] = val_loader

print("Cumulative length of the train dataloaders:", dataloaders_augment['train'].dataset.cumulative_sizes)
print("Cumulative length of the val dataloaders:", dataloaders_augment['val'].dataset.cumulative_sizes)

Cumulative length of the train dataloaders: [8416, 16832, 25248, 33664, 42080]
Cumulative length of the val dataloaders: [3257, 6514, 9771, 13028, 16285]


In [12]:
# Show the individual ImageFolder datasets that make up the val ConcatDataset.
val_loader.dataset.datasets

[Dataset ImageFolder
     Number of datapoints: 3257
     Root location: data/val
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>,
 Dataset ImageFolder
     Number of datapoints: 3257
     Root location: data/val
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>,
 Dataset ImageFolder
     Number of datapoints: 3257
     Root location: data/val
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>,
 Dataset ImageFolder
     Number of datapoints: 3257
     Root location: data/val
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>,
 Dataset ImageFolder
     Number of datapoints: 3257
     Root location: data/val
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>]

In [13]:
# Show the individual ImageFolder datasets that make up the train ConcatDataset.
train_loader.dataset.datasets

[Dataset ImageFolder
     Number of datapoints: 8416
     Root location: data/train
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>,
 Dataset ImageFolder
     Number of datapoints: 8416
     Root location: data/train
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>,
 Dataset ImageFolder
     Number of datapoints: 8416
     Root location: data/train
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>,
 Dataset ImageFolder
     Number of datapoints: 8416
     Root location: data/train
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>,
 Dataset ImageFolder
     Number of datapoints: 8416
     Root location: data/train
     StandardTransform
 Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>]

In [14]:
# Pick one ImageFolder (index 1) out of the train ConcatDataset and display it.
single_dataset = dataloaders_augment['train'].dataset.datasets[1]
single_dataset

Dataset ImageFolder
    Number of datapoints: 8416
    Root location: data/train
    StandardTransform
Transform: <__main__.ImgAugTransform object at 0x7f53b09e4b10>

In [15]:
# Loader over the single ImageFolder selected above (not the concatenation),
# used to count labels.
train_loader_single = DataLoader(single_dataset, 
    batch_size=1, shuffle=True, # note that batch size is 1, to make count work. 
    num_workers=2)

In [16]:
# Reverse of ImageFolder's class_to_idx: integer class index -> class name.
idx2class = {v: k for k, v in single_dataset.class_to_idx.items()}

def get_class_distribution_loader(dataloader_obj, dataset_obj):
    """Count how many samples of each class a loader yields.

    The index-to-name mapping is taken from `dataset_obj` itself, so the
    result does not depend on any notebook-level lookup table.

    Args:
        dataloader_obj: iterable of `(inputs, targets)` batches. `targets`
            holds integer class indices and may contain any number of items
            (scalar, batch of 1, or larger batch).
        dataset_obj: object with a `class_to_idx` dict mapping class name to
            integer index.

    Returns:
        Dict mapping every class name in `class_to_idx` to its sample count;
        classes the loader never yields stay at 0.

    Raises:
        KeyError: if the loader yields an index missing from `class_to_idx`.
    """
    index_to_class = {idx: name for name, idx in dataset_obj.class_to_idx.items()}
    count_dict = {name: 0 for name in dataset_obj.class_to_idx}

    for _, targets in dataloader_obj:
        # Flatten so a batch of any size is counted one label at a time.
        for y_idx in torch.as_tensor(targets).reshape(-1).tolist():
            count_dict[index_to_class[y_idx]] += 1

    return count_dict

In [17]:
# Per-class sample counts for the single training ImageFolder; this iterates
# the whole loader once.
get_class_distribution_loader(train_loader_single, single_dataset)

{'0': 515,
 '1': 543,
 '2': 519,
 '3': 508,
 '4': 507,
 '5': 479,
 '6': 500,
 '7': 481,
 '8': 4364}

In [18]:
import os

def delete_train_val_files(path):
  """Remove every regular file located one level below `path`.

  Each sub-directory of `path` is emptied of its files. The sub-directories
  themselves, anything nested deeper, and files lying directly in `path`
  are left in place.

  Args:
    path: directory whose immediate sub-directories should be emptied.
  """
  for sub_dir in sorted(os.listdir(path)):
    class_dir = os.path.join(path, sub_dir)
    if not os.path.isdir(class_dir):
      # A stray file directly under `path`; listing it would raise
      # NotADirectoryError.
      continue
    for file_name in os.listdir(class_dir):
      file = os.path.join(class_dir, file_name)
      if os.path.isfile(file):
        os.remove(file)

### Delete the copied train and val images from the Colab file system.

In [19]:
# Empty the class sub-directories of the train and val image folders.
delete_train_val_files(params.IMG_TRAIN_PATH)
delete_train_val_files(params.IMG_VAL_PATH)

