In [1]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
import tempfile

import io
from io import BytesIO
from urllib.parse import urlparse
import boto3

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from PIL import ImageFile, Image
import warnings

import logging
from botocore.exceptions import ClientError

import time

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from keras_tuner import HyperModel, RandomSearch
from sklearn import ensemble
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

2024-08-06 01:43:35.226360: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-06 01:43:35.231689: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-06 01:43:35.245980: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-06 01:43:35.272305: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-06 01:43:35.279336: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Initialize the S3 client used by every later cell.
s3_bucket = '207-images-labels'
# NOTE(review): `key` is not defined or imported in any visible cell; it must
# expose `.access` and `.secret` before this cell runs. Consider relying on
# boto3's default credential chain (environment variables / ~/.aws) instead.
session = boto3.Session(
    aws_access_key_id=key.access,
    aws_secret_access_key=key.secret,
)
client = session.client('s3')
# list_s3() and load_s3_memmap() look the client up under the global name
# `s3`, which was never assigned; bind it here so a fresh-kernel
# "Restart & Run All" works. `client` is kept for any existing references.
s3 = client

In [8]:
def list_s3(prefix, client=None, bucket=None):
    """Return every object key stored under ``prefix`` in an S3 bucket.

    Pages through ``list_objects_v2`` until the listing is no longer
    truncated. The key equal to ``prefix`` itself (the "folder" placeholder
    object) is excluded from the result.

    Args:
        prefix: Key prefix to list.
        client: Object exposing ``list_objects_v2`` (e.g. a boto3 S3 client).
            Defaults to the notebook-level global ``s3``.
        bucket: Bucket name. Defaults to the notebook-level global
            ``bucket_name``.

    Returns:
        list[str]: Matching keys, in the order the service returned them.

    Raises:
        RuntimeError: If a page is flagged as truncated but carries no
            continuation token (the listing could not be completed).
    """
    # Fall back to the notebook globals so existing one-argument calls work.
    client = s3 if client is None else client
    bucket = bucket_name if bucket is None else bucket

    files = []
    request = {'Bucket': bucket, 'Prefix': prefix}
    while True:
        response = client.list_objects_v2(**request)
        files.extend(
            item['Key']
            for item in response.get('Contents', [])
            if item['Key'] != prefix
        )

        if not response.get('IsTruncated'):
            return files

        token = response.get('NextContinuationToken')
        if not token:
            # Without a token the next request would restart from page one
            # and loop forever; fail loudly instead.
            raise RuntimeError(
                f"Truncated listing for prefix {prefix!r} has no continuation token"
            )
        request['ContinuationToken'] = token

In [5]:
# Bucket read by list_s3() / load_s3_memmap() (same value as `s3_bucket`),
# plus the default image/label key prefixes (the training split).
bucket_name = '207-images-labels'
images_prefix, labels_prefix = 'train_400/pre1/images/', 'train_400/pre1/labels/'

In [6]:
# S3 key prefixes for the train and test splits (images and labels).
train_images_prefix = 'train_400/pre1/images/'
train_labels_prefix = 'train_400/pre1/labels/'
# Fixed copy-paste slip: this previously pointed at the labels folder, so
# "test images" were actually label files. The path mirrors the train
# layout -- confirm the prefix exists in the bucket.
test_images_prefix = 'test_400/pre1/images/'
test_labels_prefix = 'test_400/pre1/labels/'

In [11]:
# Enumerate the object keys under the training image and label prefixes,
# then report how many of each were found.
train_image_files, train_label_files = map(
    list_s3, (train_images_prefix, train_labels_prefix)
)
print(
    f'Total image files: {len(train_image_files)}',
    f'Total label files: {len(train_label_files)}',
    sep='\n',
)

Total image files: 3865
Total label files: 3865


In [22]:
# Enumerate the object keys under the test image and label prefixes,
# then report how many of each were found.
test_image_files, test_label_files = map(
    list_s3, (test_images_prefix, test_labels_prefix)
)
print(
    f'Total image files: {len(test_image_files)}',
    f'Total label files: {len(test_label_files)}',
    sep='\n',
)

Total image files: 300
Total label files: 300


In [15]:
# Preview only the first few keys instead of dumping the whole list into the
# notebook output. Keys are in the order S3 returned them, which is
# lexicographic rather than numeric (e.g. batch_1279, batch_127, batch_1280).
train_label_files[:10]

['train_400/pre1/labels/img_numpy_train_400_pixels_batch_1269_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1270_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1271_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1272_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1273_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1274_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1275_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1276_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1277_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1278_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1279_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_127_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_batch_1280_lbls.npy',
 'train_400/pre1/labels/img_numpy_train_400_pixels_b

In [13]:
def load_s3_memmap(key, client=None, bucket=None):
    """Download a ``.npy`` object from S3 and open it as a read-only memmap.

    The object body is written to a temporary file, opened with
    ``np.load(..., mmap_mode='r')``, and the temporary file is then removed.
    Removal now happens in a ``finally`` block, so the file is no longer
    leaked when the download or ``np.load`` fails.

    Args:
        key: S3 object key of the ``.npy`` file.
        client: Object exposing ``get_object`` (e.g. a boto3 S3 client).
            Defaults to the notebook-level global ``s3``.
        bucket: Bucket name. Defaults to the notebook-level global
            ``bucket_name``.

    Returns:
        The memory-mapped array, or ``None`` if anything went wrong (the
        error is printed, not raised -- best-effort behaviour is preserved).
    """
    temp_filename = None
    try:
        # Resolve the defaults inside the try block so a missing global is
        # reported through the same print-and-return-None path as before.
        client = s3 if client is None else client
        bucket = bucket_name if bucket is None else bucket

        print(f"Loading file from S3: {key}")
        response = client.get_object(Bucket=bucket, Key=key)

        # Write the body straight to a temporary file (no intermediate
        # BytesIO copy); delete=False because np.load reopens it by name.
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            temp_filename = tmp_file.name
            tmp_file.write(response['Body'].read())

        # Use numpy memmap to load the array
        data = np.load(temp_filename, mmap_mode='r')
        print(f"Loaded file: {key}, shape: {data.shape}")
        return data
    except Exception as e:
        print(f"Error loading file {key}: {e}")
        return None
    finally:
        # Always clean up the temporary file. On POSIX, unlinking a file that
        # is still memory-mapped is fine; on platforms that refuse (e.g.
        # Windows), keep the loaded data and warn instead of failing.
        if temp_filename is not None:
            try:
                os.remove(temp_filename)
            except OSError as cleanup_error:
                warnings.warn(
                    f"Could not remove temporary file {temp_filename}: {cleanup_error}"
                )

In [33]:
# Running total of all label values across the training label files;
# the cumulative total is printed after each file.
sum_X = 0
for label_key in train_label_files:
    batch_labels = load_s3_memmap(label_key)
    sum_X = sum_X + np.sum(batch_labels)
    print(sum_X)

Loading file from S3: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1269_lbls.npy
Loaded file: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1269_lbls.npy, shape: (20,)
10
Loading file from S3: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1270_lbls.npy
Loaded file: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1270_lbls.npy, shape: (20,)
20
Loading file from S3: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1271_lbls.npy
Loaded file: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1271_lbls.npy, shape: (20,)
32
Loading file from S3: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1272_lbls.npy
Loaded file: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1272_lbls.npy, shape: (20,)
40
Loading file from S3: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1273_lbls.npy
Loaded file: train_400/pre1/labels/img_numpy_train_400_pixels_batch_1273_lbls.npy, shape: (20,)
40
Loading file from S3: train_400/pre1/labels/img_nu

In [19]:
# Scratch arithmetic. NOTE(review): 32732 is consistent with the final sum_X
# (it reproduces the dummy_classifer result shown for the training set); why
# it is halved is not shown in this notebook -- confirm before relying on it.
32732 / 2

16366.0

In [20]:
# Scratch arithmetic. NOTE(review): 38650 looks like 10 x the 3865 training
# files and 16366 is the halved label sum (32732 / 2); presumably this is the
# count of the other class -- confirm.
38650 - 16366

22284

In [27]:
# Running total of all label values across the test label files;
# the cumulative total is printed after each file.
sum_test = 0
for label_key in test_label_files:
    batch_labels = load_s3_memmap(label_key)
    sum_test = sum_test + np.sum(batch_labels)
    print(sum_test)

Loading file from S3: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1000_lbls.npy
Loaded file: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1000_lbls.npy, shape: (20,)
8
Loading file from S3: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1001_lbls.npy
Loaded file: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1001_lbls.npy, shape: (20,)
22
Loading file from S3: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1002_lbls.npy
Loaded file: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1002_lbls.npy, shape: (20,)
32
Loading file from S3: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1003_lbls.npy
Loaded file: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1003_lbls.npy, shape: (20,)
40
Loading file from S3: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1004_lbls.npy
Loaded file: test_400/pre1/labels/img_numpy_train_400_pixels_batch_1004_lbls.npy, shape: (20,)
48
Loading file from S3: test_400/pre1/labels/img_numpy_train_40

In [25]:
# Scratch arithmetic. NOTE(review): 2556 is consistent with the final sum_test
# (it reproduces the 0.574 dummy_classifer result for the test set); why it is
# halved is not shown in this notebook -- confirm before relying on it.
2556 / 2

1278.0

In [26]:
# Scratch arithmetic. NOTE(review): 3000 looks like 10 x the 300 test files
# and 1278 is the halved label sum (2556 / 2); presumably this is the count
# of the other class -- confirm.
3000 - 1278

1722

In [2]:
def dummy_classifer(total_X, sum_X, labels_per_file=20):
    """Return the accuracy of a baseline that always predicts label 0.

    Computes ``(N - sum_X) / N`` where ``N = total_X * labels_per_file``.
    This equals the majority-class accuracy only while fewer than half of the
    labels are non-zero.

    Args:
        total_X: Number of label files (batches).
        sum_X: Sum of all label values across those files (the number of
            positive labels, assuming labels are 0/1 -- confirm).
        labels_per_file: Number of labels stored in each file. Defaults to
            20, matching the ``(20,)`` shape printed when the label files
            are loaded.

    Returns:
        float: Fraction of labels not counted in ``sum_X``.

    Raises:
        ZeroDivisionError: If ``total_X * labels_per_file`` is 0 (with plain
            Python numbers).
    """
    total_labels = total_X * labels_per_file
    return (total_labels - sum_X) / total_labels

In [49]:
# Always-predict-0 baseline accuracy on the training labels.
dummy_classifer(total_X=len(train_image_files), sum_X=sum_X)

0.5765588615782665

In [50]:
# Always-predict-0 baseline accuracy on the test labels.
dummy_classifer(total_X=len(test_image_files), sum_X=sum_test)

0.574

In [9]:
# NOTE(review): hard-coded totals with no visible source. `num_test_aug` is
# passed to dummy_classifer as the label sum (sum_X), so it is presumably a
# sum of label values for an augmented set, not a file count -- confirm.
num_total_files, num_test_aug = 2070, 17480

In [10]:
# Always-predict-0 baseline accuracy for the hard-coded totals.
dummy_classifer(total_X=num_total_files, sum_X=num_test_aug)

0.5777777777777777