In [5]:
import os
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.metrics import confusion_matrix
from tqdm import tqdm
import urllib.request as req
# Select the 'fivethirtyeight' matplotlib style sheet for the plots in this notebook.
plt.style.use('fivethirtyeight')

## Download data

In [6]:
# URL of the zipped dataset that the download step retrieves.
data_URL = "https://download.pytorch.org/tutorial/hymenoptera_data.zip"

In [7]:
# create a directory
def create_dirs(dir_path):
    """Create ``dir_path`` (including missing parents) and report it.

    An already existing directory is not an error (``exist_ok=True``); the
    confirmation message is printed in either case.

    Args:
        dir_path: Path of the directory to create.

    Returns:
        None.
    """
    os.makedirs(name=dir_path, exist_ok=True)
    message = f"{dir_path} directory created"
    print(message)
    
# Top-level folder under which the downloaded archive and the extracted
# files are placed; create it up front so later steps can write into it.
ROOT_DATA_DIR = "hymenoptera_data"
create_dirs(ROOT_DATA_DIR)

hymenoptera_data directory created


In [9]:
# Download the dataset archive into ROOT_DATA_DIR unless it is already there.
data_zip_file = "data.zip"
data_zip_path = os.path.join(ROOT_DATA_DIR, data_zip_file)

# Test the full destination path, not the bare file name: the archive is
# saved under ROOT_DATA_DIR, so checking only "data.zip" would look in the
# current working directory and never find the file that was downloaded.
if not os.path.isfile(data_zip_path):
    print("downloading data...")
    # urlretrieve returns the local file name and the HTTP response headers.
    filename, headers = req.urlretrieve(data_URL, data_zip_path)
    print(f"filename: {filename} created with info \n{headers}")
else:
    print("file is already present")

downloading data...
filename: hymenoptera_data\data.zip created with info 
Content-Type: application/zip
Content-Length: 47286322
Connection: close
Date: Fri, 11 Mar 2022 11:26:26 GMT
Last-Modified: Wed, 15 Mar 2017 18:46:00 GMT
ETag: "5f8c32a6554f6acb4d649776e7735e48"
x-amz-version-id: null
Accept-Ranges: bytes
Server: AmazonS3
X-Cache: Miss from cloudfront
Via: 1.1 ecfda1b7359bd66eb2625616364a7174.cloudfront.net (CloudFront)
X-Amz-Cf-Pop: BLR50-C1
X-Amz-Cf-Id: lKA-dKlvyudX3FR72PKWIKQpcq0yjmmbjpebl7WM_IThzeU7NRM-Hg==




## Unzip data

In [15]:
from zipfile import ZipFile

# Extract the downloaded archive into ROOT_DATA_DIR/unzip_data_dir.
unzip_data_dirname = "unzip_data_dir"
unzip_data_dir = os.path.join(ROOT_DATA_DIR, unzip_data_dirname)

# An existing but empty directory counts as "not extracted yet": the
# directory is created before the archive is opened, so if opening the
# archive fails the empty directory must not make later runs skip extraction.
if not os.path.isdir(unzip_data_dir) or not os.listdir(unzip_data_dir):
    os.makedirs(unzip_data_dir, exist_ok=True)
    with ZipFile(data_zip_path) as f:
        f.extractall(unzip_data_dir)
else:
    print("data already extracted")

## Create data loaders

In [24]:
from pathlib import Path

In [25]:
# Both splits live side by side inside the extracted dataset folder.
_dataset_root = Path("hymenoptera_data/unzip_data_dir/hymenoptera_data")
train_path = _dataset_root / "train"
test_path = _dataset_root / "val"  # the "val" folder is used as the test split

In [26]:
# Target size handed to transforms.Resize in both the train and test pipelines.
img_size = (224, 224)

In [28]:
# Normalisation statistics passed to transforms.Normalize: three entries,
# all 0.5, for both the mean and the standard deviation.
mean = torch.tensor([0.5] * 3)
std = torch.tensor([0.5] * 3)

### Transformations

In [30]:
def _make_pipeline(*augmentations):
    """Build a Compose of: resize -> given augmentations -> tensor -> normalize."""
    steps = [
        transforms.Resize(img_size),
        *augmentations,
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
    return transforms.Compose(steps)


# Training images additionally get a random rotation of up to 20 degrees;
# test images go through the same pipeline without any augmentation.
train_transforms = _make_pipeline(transforms.RandomRotation(degrees=20))
test_transforms = _make_pipeline()

In [32]:
# Wrap each split folder in an ImageFolder dataset with its matching
# transform pipeline (augmented for train, plain for test).
train_data = datasets.ImageFolder(train_path, transform=train_transforms)
test_data = datasets.ImageFolder(test_path, transform=test_transforms)

In [33]:
# Show the class-name -> integer-label mapping exposed by the training dataset.
train_data.class_to_idx

{'ants': 0, 'bees': 1}

In [35]:
# Keep the class-name -> index mapping under its own name, then display it.
label_map = train_data.class_to_idx
label_map

{'ants': 0, 'bees': 1}

In [36]:
# Display the dataset summary (number of datapoints, root location, transforms).
train_data

Dataset ImageFolder
    Number of datapoints: 244
    Root location: hymenoptera_data\unzip_data_dir\hymenoptera_data\train
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
               RandomRotation(degrees=[-20.0, 20.0], interpolation=nearest, expand=False, fill=0)
               ToTensor()
               Normalize(mean=tensor([0.5000, 0.5000, 0.5000]), std=tensor([0.5000, 0.5000, 0.5000]))
           )

In [37]:
# Number of samples per batch, shared by both loaders.
batch_size = 64

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

In [38]:
# Pull a single batch from the training loader for inspection.
data = next(iter(train_loader))

In [39]:
# Number of items in the batch object (it is unpacked into images and labels next).
len(data)

2

In [40]:
# Split the batch into its image tensor and its label tensor.
images, labels = data

In [41]:
# Shape of the image batch; expected to be (batch_size, channels, height, width).
images.shape

torch.Size([64, 3, 224, 224])

In [42]:
# Shape of the label batch; expected to hold one label per image in the batch.
labels.shape

torch.Size([64])