# Setup Jupyter notebook

In [1]:
from pathlib import Path
import sys

# Make the project importable from this notebook: the directory three levels
# above the current working directory is put on sys.path (once).
working_directory = Path.cwd().resolve()
notebook_directory_parent = working_directory.parent.parent.parent
project_root_entry = str(notebook_directory_parent)
if project_root_entry not in sys.path:
    sys.path.append(project_root_entry)

# Setup to use Python libraries/modules

In [2]:
from CuISOX.DataWrangling.Kaggle.DigitRecognizer import ProcessDigitsData
from CuISOX.utilities.configure_paths import DataPaths
from CuISOX.utilities.DataIO.KagglePaths import KagglePaths

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [4]:
# Collect every known Kaggle data file path, then keep the DigitRecognizer entry.
kaggle_paths = KagglePaths()
kaggle_data_file_paths = kaggle_paths.get_all_data_file_paths()
digit_paths = kaggle_data_file_paths["DigitRecognizer"]

# data_paths is used further down to build full paths via data_paths.Kaggle().
data_paths = DataPaths()

# Load Data
See [Long-Short Term Memory with Pytorch](https://www.kaggle.com/code/kanncaa1/long-short-term-memory-with-pytorch)

Let's first locate where our data is.

In [5]:
# Paths under the DigitRecognizer entry that contain "train"; the first match
# is joined onto the Kaggle data directory.
training_data_paths = DataPaths.get_path_with_substring(digit_paths, "train")
kaggle_data_directory = data_paths.Kaggle()
training_data_path = kaggle_data_directory / training_data_paths[0]
print(training_data_path)

/cuBlackDream/Data/Kaggle/DigitRecognizer/digit-recognizer/train.csv


We can also find the test data.

In [6]:
# Show everything that was discovered, for reference.
print(kaggle_data_file_paths)

# Same lookup as for the training file, this time matching "test".
testing_data_paths = DataPaths.get_path_with_substring(digit_paths, "test")
kaggle_data_directory = data_paths.Kaggle()
testing_data_path = kaggle_data_directory / testing_data_paths[0]
print(testing_data_path)

{'DigitRecognizer': [PosixPath('DigitRecognizer/digit-recognizer/sample_submission.csv'), PosixPath('DigitRecognizer/digit-recognizer/test.csv'), PosixPath('DigitRecognizer/digit-recognizer/train.csv')]}
/cuBlackDream/Data/Kaggle/DigitRecognizer/digit-recognizer/test.csv


## Loading **Training** Data

In [None]:
# ProcessDigitsData wraps up all of the steps that follow this cell. The later
# cells repeat those steps one at a time to show them explicitly, so either
# use this module or step through the cells below.

process_digits_data = ProcessDigitsData()
process_digits_data.parse_csv(training_data_path)
process_digits_data.load_data()

training_sample_count = len(process_digits_data.training_loader.dataset)
test_sample_count = len(process_digits_data.test_loader.dataset)
print(training_sample_count)
print(test_sample_count)

# Sanity-check the sizes of the two splits.
assert training_sample_count == 33600
assert test_sample_count == 8400

From here on, each step is shown explicitly; together, these steps do the same thing as using ProcessDigitsData.

In [None]:
# Read the whole training CSV as float32, then report its columns and dimensions.
train = pd.read_csv(training_data_path, dtype=np.float32)
print(list(train.columns))
print(train.size)
print(train.shape)

Split the data into features (pixels) and labels (digits from 0 to 9).
See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In this case, y is categorical data: each y is a non-negative integer taking one of the values 0, 1, ..., 9.
X has D = 784 features, which we normalize by dividing by 255. Each row is one data sample, and there are N = 42000 samples.

In [None]:
# Targets are the "label" column; every other column is a feature.
y_targets_numpy = train["label"].values
is_feature_column = train.columns != "label"
X_features_numpy = train.loc[:, is_feature_column].values / 255  # normalization

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_arrays = train_test_split(
    X_features_numpy, y_targets_numpy, test_size=0.2, random_state=42
)
X_features_train, X_features_test, y_targets_train, y_targets_test = split_arrays

# Report the type, then the shape, then the length of each of the four arrays.
for describe in (type, np.shape, len):
    for array in split_arrays:
        print(describe(array))

Create the feature and target tensors for the training set. We need a variable to accumulate gradients; therefore, we first create the tensors, and then we'll create the variable.

In [None]:
# Training set: features keep their numpy dtype, targets are cast to long.
X_features_training = torch.from_numpy(X_features_train)
y_targets_training = torch.from_numpy(y_targets_train).long()

# Test set: same conversion.
X_features_testing = torch.from_numpy(X_features_test)
y_targets_testing = torch.from_numpy(y_targets_test).long()

print(type(X_features_training), type(y_targets_training))

# For each split, report shape, size() and length of features and targets.
tensor_pairs = (
    (X_features_training, y_targets_training),
    (X_features_testing, y_targets_testing),
)
for features, targets in tensor_pairs:
    print(features.shape)
    print(targets.shape)
    print(features.size())
    print(targets.size())
    print(len(features))
    print(len(targets))

## Model Parameters and Sizes, Configuration

Batch size, epoch, and iteration

Suppose $B \equiv$ batch size.

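Take the total number of samples $N$ (here, the size of the training split) and divide by $B$ to get the number of batches per epoch, $N / B$. Given $N_{\text{iters}} \equiv$ total number of iterations, with each iteration processing 1 batch, we can get the total number of epochs: $N_{\text{epochs}} = \lfloor N_{\text{iters}} / (N / B) \rfloor$.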

In [None]:
batch_size = 100
n_iters = 6000

# One iteration processes one batch, so epochs = iterations / batches per epoch,
# truncated to a whole number.
batches_per_epoch = len(X_features_training) / batch_size
num_epochs = int(n_iters / batches_per_epoch)
print("Epoch Number: ", num_epochs)

In [None]:
# Pair each feature tensor with its target tensor as a PyTorch dataset.
training = TensorDataset(X_features_training, y_targets_training)
testing = TensorDataset(X_features_testing, y_targets_testing)

# Both loaders use the same batch size and keep the rows in their stored order.
loader_options = {"batch_size": batch_size, "shuffle": False}
training_loader = DataLoader(training, **loader_options)
test_loader = DataLoader(testing, **loader_options)

In [None]:
# Show a single sample as a 28 x 28 image, titled with its label.
sample_index = 42
sample_image = X_features_numpy[sample_index].reshape(28, 28)
plt.imshow(sample_image)
plt.title(str(y_targets_numpy[sample_index]))
plt.axis("off")
plt.show()

## Exploring Training Data

In [None]:
# Number the batches coming out of the training loader built by ProcessDigitsData.
training_batches = process_digits_data.training_loader
training_data_iterator = enumerate(training_batches)

In [None]:
# Pull the next (index, (images, labels)) item from the iterator. Re-running
# this cell advances the iterator, and it raises StopIteration once exhausted.
i, (image_batch, batch_labels) = next(training_data_iterator)

In [None]:
# Batch index first, then type, shape and size() for the images and the labels.
print(i)
batch_tensors = (image_batch, batch_labels)
print(*(type(tensor) for tensor in batch_tensors), sep="\n")
print(*(tensor.shape for tensor in batch_tensors), sep="\n")
print(*(tensor.size() for tensor in batch_tensors), sep="\n")

`.view()` returns a new tensor with the same data as the original tensor but with a different *shape*. See [torch.tensor.view](https://pytorch.org/docs/stable/generated/torch.Tensor.view.html)

In [None]:
# View the batch as (batch, 28, 28); -1 lets torch infer the batch dimension.
# requires_grad_() works in place and returns the same tensor.
image_batch_reshaped = image_batch.view(-1, 28, 28)
image_batch_transformed = image_batch_reshaped.requires_grad_()
print(type(image_batch_transformed))
print(image_batch_transformed.shape)
print(image_batch_transformed.size())

# Exploring Testing data

In [8]:
print(testing_data_path)

# Parse the test CSV with the no-split variant, then wrap the result in a loader.
# NOTE(review): 100 is presumably the batch size - confirm against load_data_no_split.
test_X = ProcessDigitsData.parse_csv_no_split(testing_data_path)
test_data_loader = ProcessDigitsData.load_data_no_split(test_X, 100)

/cuBlackDream/Data/Kaggle/DigitRecognizer/digit-recognizer/test.csv


In [None]:
# Miscellaneous work
df = pd.read_csv(testing_data_path)
df.columns