<a href="https://colab.research.google.com/github/geraldmc/torch-draft-final_project/blob/main/concat_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os 
import time
from datetime import datetime
import glob 
import shutil
import copy
from zipfile import ZipFile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torch.utils.data import RandomSampler, random_split
from torch.utils.data import SubsetRandomSampler, WeightedRandomSampler
from torchvision.datasets import ImageFolder

# Look at cells 8, 9, and 10: they build the augmented data loaders and display the datasets behind them.

### Download the code from GitHub

In [2]:
import os

# Fetch the project sources unless a previous run already did so.
# The download branch below saves main.zip in the current directory and then
# changes into the unpacked project folder, so on a re-run of this cell the
# archive is found one level up -- hence the "../main.zip" check.
if os.path.isfile("../main.zip"):
  print ('Have already downloaded the project file, continuing...')
  print()
else:
  print ('Downloading file...')
  # Download the main-branch archive, unpack it quietly (-qq), and make the
  # unpacked project folder the working directory for the following cells.
  ! wget https://github.com/geraldmc/torch-draft-final_project/archive/refs/heads/main.zip
  ! unzip -qq main.zip
  %cd torch-draft-final_project-main

Downloading file...
--2022-04-22 21:58:06--  https://github.com/geraldmc/torch-draft-final_project/archive/refs/heads/main.zip
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/geraldmc/torch-draft-final_project/zip/refs/heads/main [following]
--2022-04-22 21:58:06--  https://codeload.github.com/geraldmc/torch-draft-final_project/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 192.30.255.120
Connecting to codeload.github.com (codeload.github.com)|192.30.255.120|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [  <=>               ]   2.59M  8.69MB/s    in 0.3s    

2022-04-22 21:58:06 (8.69 MB/s) - ‘main.zip’ saved [2721058]

/content/torch-draft-final_project-main


In [3]:
# Project-local modules: `params` supplies the path constants and `tsf` the
# image transforms that the following cells use.
try:
  import conf.params as params
  from data import transforms as tsf
except ImportError as err:
  # Stay best-effort (no re-raise), but make the failure visible: later cells
  # reference `params` directly, so a silent miss would only resurface there
  # as a confusing NameError.
  print("[WARN] Could not import project modules (conf.params / data.transforms):", err)

In [4]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the source paths copied below
# (params.GD_ZIP_IMG / params.GD_ZIP_GAN) presumably point into this mount.
drive.mount('/content/gdrive')
print()
# Copy the DeepWeeds image archive to params.IMG_ZIP_FILE, then list the copy
# to confirm its size and timestamp.
print("Downloading DeepWeeds images to " + params.IMG_ZIP_FILE)
!cp '{params.GD_ZIP_IMG}' '{params.IMG_ZIP_FILE}'
print()
!ls -lart {params.IMG_ZIP_FILE}

print()
# Same two steps for the GAN image archive.
print("Downloading GAN images to " + params.GAN_ZIP_FILE)
!cp '{params.GD_ZIP_GAN}' '{params.GAN_ZIP_FILE}'
print()
!ls -lart {params.GAN_ZIP_FILE}

Mounted at /content/gdrive

Downloading DeepWeeds images to data/images.zip

-rw------- 1 root root 491516047 Apr 22 21:58 data/images.zip

Downloading GAN images to data/gans.zip

-rw------- 1 root root 53284865 Apr 22 21:58 data/gans.zip


In [5]:
# --- DeepWeeds image archive -------------------------------------------------
print("[INFO] Unzipping DeepWeeds images into " +  params.IMG_DIRECTORY)

with ZipFile(params.IMG_ZIP_FILE, mode="r") as images_archive:
    images_archive.extractall(path=params.IMG_DIRECTORY)

# Report how many entries the image directory now holds.
img_list = os.listdir(params.IMG_DIRECTORY)
print(len(img_list))

print()
# --- GAN image archive -------------------------------------------------------
print("[INFO] Unzipping GAN image dirs into " + params.DATA_PATH)

with ZipFile(params.GAN_ZIP_FILE, mode="r") as gan_archive:
    gan_archive.extractall(path=params.DATA_PATH)

# Report how many entries sit under gans/train/0 after extraction.
gan_dir_list = os.listdir(params.DATA_PATH+'/gans/train/0')
print(len(gan_dir_list))

[INFO] Unzipping DeepWeeds images into data/images
17510

[INFO] Unzipping GAN image dirs into data
750


### 1) Combine the train, val, and test label CSVs into one DataFrame per split.

In [6]:
import glob 

# Glob patterns matching the label CSVs of each split.
joined_val = os.path.join("data/", "labels/", "val*.csv")
joined_train = os.path.join("data/", "labels/", "train*.csv")
joined_test = os.path.join("data/", "labels/", "test*.csv")

# glob returns matches in a filesystem-dependent order; sort them so the row
# order of the combined frames is the same on every run and machine.
val_files = sorted(glob.glob(joined_val))
train_files = sorted(glob.glob(joined_train))
test_files = sorted(glob.glob(joined_test))

# Read every matching CSV and stack the rows into a single frame per split,
# discarding the per-file indices in favour of one fresh 0..n-1 index.
train_df = pd.concat(map(pd.read_csv, train_files), ignore_index=True)
val_df = pd.concat(map(pd.read_csv, val_files), ignore_index=True)
test_df = pd.concat(map(pd.read_csv, test_files), ignore_index=True)

# In the paper, each fold contains 10,505 samples from the total
def sample_data(train_sample_no, val_sample_no, test_sample_no, random_state=None):
    """Draw a random subset of rows from each combined label frame.

    Reads the module-level ``train_df``, ``val_df`` and ``test_df`` frames and
    samples them, in that order, with ``DataFrame.sample``.

    Args:
        train_sample_no: number of rows to draw from ``train_df``.
        val_sample_no: number of rows to draw from ``val_df``.
        test_sample_no: number of rows to draw from ``test_df``.
        random_state: optional seed forwarded to every ``sample`` call so the
            draw can be reproduced. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        Tuple ``(train, val, test)`` of the sampled DataFrames.
    """
    train = train_df.sample(n=train_sample_no, random_state=random_state)
    val = val_df.sample(n=val_sample_no, random_state=random_state)
    test = test_df.sample(n=test_sample_no, random_state=random_state)
    return train, val, test

### 2) Copy files to their respective directories.

In [7]:
import shutil

# Flat list of every file name found anywhere beneath params.IMAGE_PATH
# (bare names only, without the directory they were found in).
files = [
    name
    for _root, _subdirs, names in os.walk(params.IMAGE_PATH)
    for name in names
]

def copy_files(df, filepath):
  """Copy the images listed in ``df`` into per-label folders under ``filepath``.

  Every ``Filename`` in ``df`` that also appears in the module-level ``files``
  list is copied from ``params.IMG_DIRECTORY`` to
  ``<filepath>/<Label>/<Filename>``. Rows whose file name is not in ``files``
  are skipped silently, as before.

  Args:
    df: DataFrame with ``Filename`` and ``Label`` columns. Repeated file names
      collapse to a single entry (the last label seen is used).
    filepath: destination root; one sub-directory per label is used and is
      created on demand if it does not exist yet.
  """
  labels = dict(zip(df.Filename, df.Label))
  # Set membership replaces the old "try every image name, swallow KeyError"
  # scan over the full image list.
  available = set(files)
  for name, label in labels.items():
    if name not in available:
      continue
    label_dir = os.path.join(filepath, str(label))
    # copyfile does not create parent directories; make sure the target exists.
    os.makedirs(label_dir, exist_ok=True)
    src = os.path.join(params.IMG_DIRECTORY, name)
    dst = os.path.join(label_dir, name)
    shutil.copyfile(src, dst)

# Sample sizes match the paper; drawn through the sample_data helper, which
# samples train, val and test in that order.
sample_train_df, sample_val_df, sample_test_df = sample_data(10505, 3502, 3502)

# Only the train and val samples are copied to disk here.
for split_df, split_root in (
    (sample_train_df, params.IMG_TRAIN_PATH),
    (sample_val_df, params.IMG_VAL_PATH),
):
  copy_files(split_df, split_root)

### 3) Instantiate the data loaders.

In [8]:
from data import transforms as tsf
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision.datasets import ImageFolder

# Every split is loaded five times, once per transform below, and the five
# views are concatenated. The number of images per view depends on the random
# sample copied to disk (the run recorded in this notebook shows 8343 train
# and 3256 val images per view).
_AUGMENT_TRANSFORMS = (
    tsf.base_transform,
    tsf.data_translate,
    tsf.data_grayscale,
    tsf.data_rotate,
    tsf.data_jitter_hue,
)


def _make_augmented_loader(root, shuffle, batch_size=32, num_workers=2):
    """Build a DataLoader over ``root`` seen through every augmentation transform.

    Args:
        root: image folder laid out for ``ImageFolder`` (one sub-directory per label).
        shuffle: whether the loader shuffles the concatenated dataset.
        batch_size: samples per batch.
        num_workers: number of loader worker processes.

    Returns:
        A ``DataLoader`` whose ``dataset`` is a ``ConcatDataset`` holding one
        ``ImageFolder`` per entry of ``_AUGMENT_TRANSFORMS``, in that order.
    """
    views = [ImageFolder(root, transform=transform)
             for transform in _AUGMENT_TRANSFORMS]
    return DataLoader(
        ConcatDataset(views),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        # Pinned host memory is only requested when a CUDA device is available.
        pin_memory=torch.cuda.is_available())


# Training batches are shuffled; validation batches keep dataset order.
train_loader = _make_augmented_loader(params.IMG_TRAIN_PATH, shuffle=True)
val_loader = _make_augmented_loader(params.IMG_VAL_PATH, shuffle=False)

# Both loaders, keyed by split name.
dataloaders_augment = {
    'train': train_loader,
    'val': val_loader,
}

# cumulative_sizes holds the running total of samples after each concatenated
# dataset, so the last entry is the overall length of the split.
train_sizes = dataloaders_augment['train'].dataset.cumulative_sizes
val_sizes = dataloaders_augment['val'].dataset.cumulative_sizes
print("Cumulative length of the train dataloaders:", train_sizes)
print("Cumulative length of the val dataloaders:", val_sizes)

Cumulative length of the train dataloaders: [8343, 16686, 25029, 33372, 41715]
Cumulative length of the val dataloaders: [3256, 6512, 9768, 13024, 16280]


In [9]:
# Display the individual ImageFolder datasets (one per transform) that are
# concatenated behind the validation loader.
val_loader.dataset.datasets

[Dataset ImageFolder
     Number of datapoints: 3256
     Root location: data/val
     StandardTransform
 Transform: Compose(
                Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
                ToTensor()
                Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
            ), Dataset ImageFolder
     Number of datapoints: 3256
     Root location: data/val
     StandardTransform
 Transform: Compose(
                Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
                RandomAffine(degrees=[-15.0, 15.0], translate=(0.1, 0.1))
                ToTensor()
                Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
            ), Dataset ImageFolder
     Number of datapoints: 3256
     Root location: data/val
     StandardTransform
 Transform: Compose(
                Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
                Gra

In [10]:
# Display the individual ImageFolder datasets (one per transform) that are
# concatenated behind the training loader.
train_loader.dataset.datasets

[Dataset ImageFolder
     Number of datapoints: 8343
     Root location: data/train
     StandardTransform
 Transform: Compose(
                Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
                ToTensor()
                Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
            ), Dataset ImageFolder
     Number of datapoints: 8343
     Root location: data/train
     StandardTransform
 Transform: Compose(
                Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
                RandomAffine(degrees=[-15.0, 15.0], translate=(0.1, 0.1))
                ToTensor()
                Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
            ), Dataset ImageFolder
     Number of datapoints: 8343
     Root location: data/train
     StandardTransform
 Transform: Compose(
                Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
             

In [12]:
import os

def delete_train_val_files(path):
  """Remove every regular file stored directly inside the sub-directories of ``path``.

  Only one level is visited: the sub-directories themselves, anything nested
  deeper inside them, and plain files sitting directly in ``path`` are left
  untouched.

  Args:
    path: root folder whose immediate sub-directories are emptied of files.

  Raises:
    FileNotFoundError: if ``path`` does not exist (raised by ``os.listdir``).
  """
  for sub_dir in sorted(os.listdir(path)):
    sub_path = os.path.join(path, sub_dir)
    # A stray top-level file (e.g. a placeholder or OS metadata file) used to
    # make the inner listdir raise NotADirectoryError; skip such entries.
    if not os.path.isdir(sub_path):
      continue
    for file_name in os.listdir(sub_path):
      file = os.path.join(sub_path, file_name)
      if os.path.isfile(file):
        os.remove(file)

### Delete the copied train and val image files from the Colab file system.

In [13]:
# Empty the per-label folders of the train split first, then the val split.
for split_root in (params.IMG_TRAIN_PATH, params.IMG_VAL_PATH):
  delete_train_val_files(split_root)

