In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# later cells can read the dataset folders stored under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# List the current working directory (shell command run through the `!` prefix).
!ls

drive  sample_data


In [18]:
import multiprocessing
import os
import shutil

# Source folders on the mounted Drive, each paired with its local destination
gdrive_train_path, local_train_path = (
    '/content/drive/Othercomputers/My MacBook Pro/train',
    '/content/oct_train',
)
gdrive_test_path, local_test_path = (
    '/content/drive/Othercomputers/My MacBook Pro/test',
    '/content/oct_test',
)

def copy_data(src, dst):
    """Replace the directory tree at ``dst`` with a fresh copy of ``src``.

    The source is validated *before* the destination is removed, so a missing
    or unmounted source can no longer wipe out a previously copied dataset.

    Args:
        src: Path of the directory tree to copy.
        dst: Path of the destination directory. Any existing tree at this
            path is deleted first so the result is a clean copy.

    Raises:
        FileNotFoundError: If ``src`` does not exist.
        NotADirectoryError: If ``src`` exists but is not a directory.
    """
    if not os.path.exists(src):
        raise FileNotFoundError(f"Source directory does not exist: {src}")
    if not os.path.isdir(src):
        raise NotADirectoryError(f"Source is not a directory: {src}")
    if os.path.exists(dst):
        shutil.rmtree(dst)  # Remove existing directory to ensure clean copy
    shutil.copytree(src, dst)

# Copy the train and test folders concurrently in worker processes.
# Only two jobs are submitted, so two workers are enough.
with multiprocessing.Pool(processes=2) as pool:
    copy_jobs = [
        pool.apply_async(copy_data, args=(gdrive_train_path, local_train_path)),
        pool.apply_async(copy_data, args=(gdrive_test_path, local_test_path)),
    ]

    # No further work will be submitted
    pool.close()

    # .get() blocks until the job finishes and re-raises any exception that
    # happened in the worker; without it a failed copy would go unnoticed.
    for copy_job in copy_jobs:
        copy_job.get()

    # Wait for the worker processes to exit
    pool.join()

# Verify the copied data by counting every file below each local folder
def _count_files(root_dir):
    """Return the total number of files found anywhere under ``root_dir``."""
    return sum(len(file_names) for _, _, file_names in os.walk(root_dir))

train_file_count = _count_files(local_train_path)
test_file_count = _count_files(local_test_path)

print(f"Number of files in local train path: {train_file_count}")
print(f"Number of files in local test path: {test_file_count}")


Number of files in local train path: 11652
Number of files in local test path: 2867


In [19]:
# List the working directory again; the copied oct_train / oct_test folders should now appear.
!ls

drive	  oct_train    X_test.pickle   Y_test.pickle
oct_test  sample_data  X_train.pickle  Y_train.pickle


In [20]:
import os
import numpy as np
import pickle
import cv2
from tqdm import tqdm
import random

# Define the paths for your dataset
# Each directory is read as <DIR>/<category>/<image file> by the loader functions below.
TRAIN_DIR = '/content/oct_train'  # Update with your actual path
TEST_DIR = '/content/oct_test'    # Update with your actual path
# Every image is resized to IMG_SIZE x IMG_SIZE pixels when it is loaded.
IMG_SIZE = 128  # Define your desired image size
# Sub-folder names to load. A sample's label is the folder's INDEX in this list,
# not the folder name itself.
# NOTE(review): with this ordering folder "1" is stored as label 0 and folder "0"
# as label 1 — confirm this inversion is intended.
CATEGORIES = ["1", "0"]  # Update with your actual categories

# Function to create training data
def create_training_data():
    """Load, resize and label every readable image under ``TRAIN_DIR``.

    Each entry of ``CATEGORIES`` names a sub-folder of ``TRAIN_DIR``; the
    label assigned to an image is the index of its folder in ``CATEGORIES``.
    Loading is best-effort: files that cannot be decoded or resized are
    skipped, but they are counted and reported instead of being dropped
    silently.

    Returns:
        A shuffled list of ``[image, class_num]`` pairs, where ``image`` is
        the array returned by ``cv2.resize`` at ``IMG_SIZE`` x ``IMG_SIZE``.
    """
    training_data = []
    skipped_files = []
    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(TRAIN_DIR, category)
        for img in tqdm(os.listdir(path)):
            img_path = os.path.join(path, img)
            try:
                img_array = cv2.imread(img_path, cv2.IMREAD_COLOR)
                # cv2.imread signals an unreadable / non-image file by
                # returning None rather than raising.
                if img_array is None:
                    skipped_files.append(img_path)
                    continue
                new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
            except Exception:
                # Stay best-effort, but remember what was skipped.
                skipped_files.append(img_path)
                continue
            training_data.append([new_array, class_num])
    if skipped_files:
        print(f"Skipped {len(skipped_files)} unreadable file(s) under {TRAIN_DIR}, "
              f"e.g. {skipped_files[:5]}")
    random.shuffle(training_data)
    return training_data

# Build the training set, then split it into an image array and a label list
training_data = create_training_data()
X_train = np.reshape(
    np.array([image for image, _ in training_data]),
    (-1, IMG_SIZE, IMG_SIZE, 3),
)
Y_train = [label for _, label in training_data]

# Persist images and labels to the working directory
with open("X_train.pickle", "wb") as pickle_out:
    pickle.dump(X_train, pickle_out)
with open("Y_train.pickle", "wb") as pickle_out:
    pickle.dump(Y_train, pickle_out)

# Repeat the same steps for the test data if needed
def create_test_data():
    """Load, resize and label every readable image under ``TEST_DIR``.

    Each entry of ``CATEGORIES`` names a sub-folder of ``TEST_DIR``; the
    label assigned to an image is the index of its folder in ``CATEGORIES``.
    Loading is best-effort: files that cannot be decoded or resized are
    skipped, but they are counted and reported instead of being dropped
    silently.

    Returns:
        A shuffled list of ``[image, class_num]`` pairs, where ``image`` is
        the array returned by ``cv2.resize`` at ``IMG_SIZE`` x ``IMG_SIZE``.
    """
    test_data = []
    skipped_files = []
    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(TEST_DIR, category)
        for img in tqdm(os.listdir(path)):
            img_path = os.path.join(path, img)
            try:
                img_array = cv2.imread(img_path, cv2.IMREAD_COLOR)
                # cv2.imread signals an unreadable / non-image file by
                # returning None rather than raising.
                if img_array is None:
                    skipped_files.append(img_path)
                    continue
                new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
            except Exception:
                # Stay best-effort, but remember what was skipped.
                skipped_files.append(img_path)
                continue
            test_data.append([new_array, class_num])
    if skipped_files:
        print(f"Skipped {len(skipped_files)} unreadable file(s) under {TEST_DIR}, "
              f"e.g. {skipped_files[:5]}")
    random.shuffle(test_data)
    return test_data

# Build the test set, then split it into an image array and a label list
test_data = create_test_data()
X_test = np.reshape(
    np.array([image for image, _ in test_data]),
    (-1, IMG_SIZE, IMG_SIZE, 3),
)
Y_test = [label for _, label in test_data]

# Persist images and labels to the working directory
with open("X_test.pickle", "wb") as pickle_out:
    pickle.dump(X_test, pickle_out)
with open("Y_test.pickle", "wb") as pickle_out:
    pickle.dump(Y_test, pickle_out)


100%|██████████| 10041/10041 [00:31<00:00, 317.35it/s]
100%|██████████| 1611/1611 [00:06<00:00, 259.75it/s]
100%|██████████| 2253/2253 [00:06<00:00, 352.77it/s]
100%|██████████| 614/614 [00:02<00:00, 218.37it/s]


In [21]:
# List the working directory to check that the four *.pickle files are present.
!ls

drive	  oct_train    X_test.pickle   Y_test.pickle
oct_test  sample_data  X_train.pickle  Y_train.pickle


In [23]:
import pickle

# Read the pickled training and test data back from the working directory
_reloaded = []
for _pickle_name in ("X_train.pickle", "Y_train.pickle", "X_test.pickle", "Y_test.pickle"):
    with open(_pickle_name, "rb") as pickle_in:
        _reloaded.append(pickle.load(pickle_in))
X_train, Y_train, X_test, Y_test = _reloaded

# Report how many images / labels each dataset holds
print(f"Number of images in X_train: {len(X_train)}")
print(f"Number of labels in Y_train: {len(Y_train)}")
print(f"Number of images in X_test: {len(X_test)}")
print(f"Number of labels in Y_test: {len(Y_test)}")


Number of images in X_train: 11649
Number of labels in Y_train: 11649
Number of images in X_test: 2867
Number of labels in Y_test: 2867


In [24]:

# Define the path to save pickle files in Google Drive
gdrive_path = '/content/drive/My Drive/pickle_files'

# Create the directory (and any missing parents). exist_ok=True makes this a
# single call with no separate exists() check that could race with creation.
os.makedirs(gdrive_path, exist_ok=True)

# Save training and test data to Google Drive under the same file names
# that are used for the local copies.
for pickle_name, payload in (
    ("X_train.pickle", X_train),
    ("Y_train.pickle", Y_train),
    ("X_test.pickle", X_test),
    ("Y_test.pickle", Y_test),
):
    with open(os.path.join(gdrive_path, pickle_name), "wb") as pickle_out:
        pickle.dump(payload, pickle_out)

print(f"Pickle files saved to {gdrive_path}")


Pickle files saved to /content/drive/My Drive/pickle_files
