# Day 3: Data augmentation

The purpose of this notebook is to train a model with data augmentation. I will reuse the best parameters from the previous notebook and add data augmentation to improve the model.
* Run a sweep over different data augmentation types
* Make a separate training file, but do preprocessing, visualization and post-processing in this file
* Make a separate file for model training
* Make a utility library for common functions and install it with pip

Imports:

In [3]:
import MyUtils.Util.Misc as util


In [None]:
import keras
import numpy as np
import tensorflow as tf
import os
import wandb
import IPython.display as display
import gc
import pandas as pd
import kaggle
import torch
import zipfile
from sklearn.model_selection import train_test_split
from omegaconf import OmegaConf


Check GPU:

In [2]:
# Fail fast when no CUDA device is visible. An explicit raise is used instead
# of `assert` so the check still runs when Python is started with -O.
if not torch.cuda.is_available():
    raise RuntimeError("No GPU available")

# Availability was verified above, so a CPU fallback would be unreachable;
# bind directly to the first CUDA device.
device = torch.device("cuda:0")
print(device)

cuda:0


## Configuration

In [3]:
# Load the run configuration through the project utility helper. The recorded
# output of this cell shows the resulting keys: project/user metadata, paths
# for the preprocessed arrays and checkpoints, and training hyperparameters
# (epochs, learning_rate, batch_size, patience, ...).
# NOTE(review): the arguments presumably name the config directory (".") and
# the config file stem ("config") -- confirm against MyUtils.
cfg = util.load_and_override_config(".", "config")

cfg {'project_name': 'Day3', 'username': 'frizzerdk', 'is_sweep': False, 'preprocessed_data_path': './data/preprocessed', 'x_train_path': '${preprocessed_data_path}/x_train.npy', 'y_train_path': '${preprocessed_data_path}/y_train.npy', 'x_test_path': '${preprocessed_data_path}/x_test.npy', 'x_val_path': '${preprocessed_data_path}/x_val.npy', 'y_val_path': '${preprocessed_data_path}/y_val.npy', 'checkpoint_path': './checkpoints', 'best_model_path': '${checkpoint_path}/best_model.keras', 'num_classes': 10, 'input_shape': [28, 28, 1], 'activation': 'relu', 'epochs': 50, 'param_scale': 1.0, 'dropout_rate': 0.5, 'learning_rate': 0.001, 'batch_size': 128, 'patience': 20}


## Get data ready

Download the data and unzip it

In [4]:
# Directory (relative to the notebook's working directory) that receives the
# raw Kaggle competition files.
data_dir = "MyDataset/mnist/raw"
competition_name = "digit-recognizer"

# Create the target directory; a no-op when it already exists.
os.makedirs(data_dir, exist_ok=True)

# Download straight into data_dir via the CLI's -p/--path option. This avoids
# changing the kernel's working directory, so a failure here can no longer
# leave every later relative path in the notebook pointing at the wrong place.
os.system(f"kaggle competitions download -c {competition_name} -p {data_dir}")

# Extract the archive next to itself. If the download failed and no archive
# exists, ZipFile raises FileNotFoundError here.
archive_path = os.path.join(data_dir, f"{competition_name}.zip")
with zipfile.ZipFile(archive_path, "r") as zip_ref:
    zip_ref.extractall(data_dir)

digit-recognizer.zip: Skipping, found more recently modified local copy (use --force to force download)


Load data

In [5]:
# Read the raw competition CSVs into DataFrames.
train_df = pd.read_csv('MyDataset/mnist/raw/train.csv')
test_df = pd.read_csv('MyDataset/mnist/raw/test.csv')

# Training set: the "label" column is the target, every other column is a
# feature. Both are converted to NumPy arrays.
y_train = train_df["label"].values
x_train = train_df.drop(columns=["label"]).values

# Test set: all columns are used as features.
x_test = test_df.values

Inspect data

In [6]:
# Preview the first rows of the training frame; as the cell's last expression
# it is rendered with the DataFrame's rich display.
train_df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Prepare data

In [7]:
# Hold out 10% of the labelled data for validation; the fixed random_state
# keeps the split reproducible.
# NOTE: this cell rebinds x_train/y_train, so re-running it without re-running
# the load cell would split the already-split training set again.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.1, random_state=2)

# Scale images to the [0, 1] range. The validation split must get the same
# float32 cast and scaling as the training and test sets; otherwise validation
# metrics are computed on raw 0-255 pixel values.
x_train = x_train.astype("float32") / 255
x_val = x_val.astype("float32") / 255
x_test = x_test.astype("float32") / 255

# Make sure images have shape (28, 28, 1)
x_train = x_train.reshape(-1, 28, 28, 1)
x_val = x_val.reshape(-1, 28, 28, 1)
x_test = x_test.reshape(-1, 28, 28, 1)

print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

x_train shape: (37800, 28, 28, 1)
y_train shape: (37800,)
37800 train samples
28000 test samples


Save preprocessed data for training

In [8]:
# Ensure the output directory for the preprocessed arrays exists.
os.makedirs(cfg.preprocessed_data_path, exist_ok=True)

# Persist each array to its configured .npy path so the training script can
# load it independently of this notebook.
for _target_path, _array in (
    (cfg.x_train_path, x_train),
    (cfg.y_train_path, y_train),
    (cfg.x_val_path, x_val),
    (cfg.y_val_path, y_val),
    (cfg.x_test_path, x_test),
):
    np.save(_target_path, _array)

# Drop the loop variables first (they still reference the last array), then
# every in-memory copy of the data, and ask the collector to reclaim memory.
del _target_path, _array
del x_train, y_train, x_val, y_val, x_test, train_df, test_df
gc.collect()

192

## Run training Script

In [1]:
# Launch the training scripts in a detached tmux session, one pane per script,
# so they keep running independently of the notebook kernel.
# The session name and the scripts to start are defined here; listing the same
# script twice starts two concurrent instances of it.
tmux_session_name = "devenv_session"
scripts = ["train.py", "train.py"]

# List existing sessions (stderr silenced so "no server running" is not shown);
# IPython captures the command's output lines into a list.
# NOTE(review): the check below is a substring match on each output line, so a
# session whose name merely contains tmux_session_name also counts as existing.
existing_sessions = !tmux ls 2> /dev/null
session_exists = any(tmux_session_name in session for session in existing_sessions)

# Kill a leftover session so each run of this cell starts from a clean state
if session_exists:
    print(f"Session '{tmux_session_name}' already exists. Killing the session.")
    !tmux kill-session -t {tmux_session_name}

# Start a new detached (-d) session whose first window is named "main"
print(f"Creating a new session: {tmux_session_name}")
!tmux new-session -d -s {tmux_session_name} -n main

# Name of the window targeted by all following commands; must match the
# -n argument given to new-session above
first_window_name = "main"

# Start the first script in pane 0 by typing the command followed by Enter
print(f"Running script {scripts[0]} in the first pane")
!tmux send-keys -t {tmux_session_name}:{first_window_name}.0 "python3 {scripts[0]}" Enter

# For each remaining script, split off a new pane and start the script there.
# NOTE(review): addressing the new pane as ".{i}" assumes panes are numbered
# from 0 in creation order (tmux default pane-base-index) -- confirm.
for i, script in enumerate(scripts[1:], start=1):
    print(f"Creating pane {i} for script {script}")
    # Split the window to add a new pane
    !tmux split-window -t {tmux_session_name}:{first_window_name}
    # Send the command to execute the script in the new pane
    !tmux send-keys -t {tmux_session_name}:{first_window_name}.{i} "python3 {script}" Enter

# Make the main pane take half of the window width in main-* layouts
!tmux setw -t {tmux_session_name} main-pane-width 50%

# Apply the main-vertical layout: one main pane on the left, the remaining
# panes stacked on the right (this is not an even split of all panes)
!tmux select-layout -t {tmux_session_name}:{first_window_name} main-vertical
# Disabled: would enable mouse support in tmux
#!:set -g mouse on


# Optional: attach to the session from a real terminal (kept commented out
# because attaching is interactive and does not work from a notebook cell)
#tmux attach-session -t devenv_session


Creating a new session: devenv_session


Running script train.py in the first pane
Creating pane 1 for script train.py


In [3]:
# Environment sanity check: print which python3 and pip executables the shell
# resolves, then verify that the utility package is importable.
!which python3
!which pip
# NOTE(review): the first import cell of this notebook uses
# `MyUtils.Util.Misc`, whereas this line imports `MyUtils.Misc` -- confirm
# which module path is the intended one.
import MyUtils.Misc 


/bin/python3


/bin/pip


## Evaluate trained models

In [10]:
y_val = np.load(cfg.y_val_path)