In [None]:
# This cell sits above the notebook's main import cell, so it imports the two
# names it needs itself; otherwise "Restart Kernel -> Run All" fails here
# with a NameError.
import numpy as np
from sklearn.model_selection import train_test_split

# Load the image arrays -- make sure these point to the correct directory.
# (Straight quotes are required; typographic quotes are a SyntaxError.)
cat_data = np.load('/cat_images.npy')
dog_data = np.load('/dog_images.npy')

# Our images are 540x499 pixels (rows x cols); the model's Input layer
# is built from these two values.
img_rows, img_cols = 540, 499

# Create the label vector (1 = Cat, 0 = Dog), ordered cats first, then dogs.
y = np.concatenate((np.ones(cat_data.shape[0]), np.zeros(dog_data.shape[0])))

# Stack the images in the same cats-then-dogs order so rows line up with y.
pet_data = np.concatenate((cat_data, dog_data))

# Train-test split: hold out 25% for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    pet_data, y, test_size=0.25, random_state=42
)

# Normalize image data. Casting to float32 first keeps the result in float32
# instead of float64, halving memory for these large images.
# Assumes 8-bit pixel values in [0, 255] -- TODO confirm against the .npy files.
# NOTE: x_train / x_test are overwritten in place below, so re-running this
# cell without re-running the split would scale and expand them a second time.
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# Add a trailing channel axis so each image is (rows, cols, 1) for Keras.
x_train = x_train[..., np.newaxis]
x_test = x_test[..., np.newaxis]

In [None]:
# Import necessary Keras/TensorFlow modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# keras/tensorflow imports
import keras
from keras.models import Model
from keras.layers import Dense, Input, Conv2D, MaxPooling2D, Flatten
from keras.optimizers import Adam
from keras.losses import binary_crossentropy

In [None]:
# Build a CNN with 2 convolutional layers and 2 dense hidden layers.
# Architecture (as actually built below):
# - Conv2D(N_FILTERS_CONV1 filters, 3x3 kernel, 'same' padding, ReLU)
# - MaxPooling2D(2x2)
# - Conv2D(N_FILTERS_CONV2 filters, 3x3 kernel, 'same' padding, ReLU)
# - MaxPooling2D(2x2)
# - Flatten
# - Dense(128, ReLU)
# - Dense(64, ReLU)
# - Dense(1, Sigmoid) for binary classification
#
# NOTE(review): an earlier version of this header advertised 32 and 64 filters
# and an unpadded second convolution, but the code has always used a single
# filter per convolution with 'same' padding. The filter counts are kept at 1
# here (behaviour unchanged) and exposed as constants so they can be tuned.
# Raising them multiplies the Flatten -> Dense(128) weight count accordingly.
N_FILTERS_CONV1 = 1
N_FILTERS_CONV2 = 1

# One grayscale channel per image
inpx = Input(shape=(img_rows, img_cols, 1))

# First convolutional layer with 'same' padding and ReLU activation
conv_layer1 = Conv2D(
    N_FILTERS_CONV1, kernel_size=(3, 3), strides=1, padding='same', activation='relu'
)(inpx)
pool_layer1 = MaxPooling2D(pool_size=(2, 2))(conv_layer1)

# Second convolutional layer with 'same' padding and ReLU activation
conv_layer2 = Conv2D(
    N_FILTERS_CONV2, kernel_size=(3, 3), strides=1, padding='same', activation='relu'
)(pool_layer1)
pool_layer2 = MaxPooling2D(pool_size=(2, 2))(conv_layer2)

# Flatten the pooled feature maps into a vector for the dense layers
flat_layer = Flatten()(pool_layer2)

# First hidden layer with ReLU activation
hid_layer1 = Dense(128, activation='relu')(flat_layer)

# Second hidden layer with ReLU activation
hid_layer2 = Dense(64, activation='relu')(hid_layer1)

# Output layer: a single sigmoid unit (label 1 = Cat, 0 = Dog)
out_layer = Dense(1, activation='sigmoid')(hid_layer2)

# Create the model
model = Model(inputs=inpx, outputs=out_layer)

# Display model architecture
model.summary()

In [None]:
# Configure training: Adam optimizer (learning rate 0.001), binary
# cross-entropy loss for the single sigmoid output, and accuracy as the
# reported metric.
adam_optimizer = Adam(learning_rate=0.001)
model.compile(
    loss=binary_crossentropy,
    optimizer=adam_optimizer,
    metrics=['accuracy'],
)

In [None]:
# Fit the model on the training set. Settings:
# - batch_size=32: samples per gradient update
# - epochs=20: full passes over the training data
# - validation_split=0.2: fraction of the training data held out for validation
# - verbose=1: show per-epoch progress
fit_options = dict(
    batch_size=32,
    epochs=20,
    validation_split=0.2,
    verbose=1,
)
history = model.fit(x_train, y_train, **fit_options)

In [None]:
# Score the trained model on the held-out test set.
# `score` holds the loss first, then the accuracy metric set at compile time.
score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)

In [None]:
# Generate predictions and display the confusion matrix.
# The sigmoid output is thresholded at 0.5: >= 0.5 -> class 1 (Cat), else 0 (Dog).
preds = model.predict(x_test)
y_pred = (preds.reshape(-1) >= 0.5).astype(int)
y_true = y_test.astype(int)

# Create the confusion matrix. pd.crosstab only emits rows/columns for classes
# that actually occur, so if the model predicts a single class the table is
# not 2x2 and positional lookups would either raise or read the wrong cell.
# Reindexing on both axes guarantees a full 2x2 table with zeros for any
# missing combination.
class_labels = [0, 1]
tbl = pd.crosstab(
    y_pred, y_true, rownames=['Predicted'], colnames=['Actual']
).reindex(index=class_labels, columns=class_labels, fill_value=0)

print("\nConfusion Matrix:")
print(tbl)

# Label-based lookups: tbl.loc[predicted, actual], with 1 (Cat) as the
# positive class.
print(f"\nTrue Negatives: {tbl.loc[0, 0]}")
print(f"False Positives: {tbl.loc[1, 0]}")
print(f"False Negatives: {tbl.loc[0, 1]}")
print(f"True Positives: {tbl.loc[1, 1]}")

In [None]:
# Plot training history: loss on the left panel, accuracy on the right,
# each showing the training and validation curves per epoch.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

panels = (
    ('loss', 'Loss'),
    ('accuracy', 'Accuracy'),
)
for ax, (metric_key, metric_name) in zip(axes, panels):
    # Training curve first, then validation, so legend order is stable
    ax.plot(history.history[metric_key], label=f'Training {metric_name}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_name)
    ax.set_title(f'Model {metric_name}')
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
from PIL import Image, ImageOps

def convert_image_to_grayscale_with_padding(image_path, target_width=None, target_height=None):
    """
    Load an image, convert it to grayscale, and optionally fit it to a target size.

    When a target size is requested, the image is passed through
    ``ImageOps.pad``, which resizes it to fit inside the target box while
    keeping its aspect ratio and then fills the remaining area with black
    (0). With no target size, the grayscale image is returned at its original
    dimensions.

    Args:
        image_path (str): Path to the input image.
        target_width (int, optional): Target width in pixels. If None (or 0),
            the image's original width is used.
        target_height (int, optional): Target height in pixels. If None (or 0),
            the image's original height is used.

    Returns:
        np.ndarray: Grayscale image as a 2-D array of shape (height, width).
    """
    with Image.open(image_path) as img:
        # 'L' = single-channel 8-bit grayscale
        grayscale_img = img.convert('L')

        if target_width or target_height:
            # Fill in whichever dimension was omitted from the image itself,
            # so that passing only one of the two behaves as documented
            # instead of silently skipping the padding step.
            target_size = (
                target_width or grayscale_img.width,
                target_height or grayscale_img.height,
            )
            padded_img = ImageOps.pad(grayscale_img, target_size, color=0)
        else:
            padded_img = grayscale_img

        # Materialize the pixels while the source file is still open
        grayscale_array = np.array(padded_img)

    return grayscale_array

# Load a sample photo at width=499, height=540, i.e. the same 540x499
# (rows x cols) size as the training images.
test = convert_image_to_grayscale_with_padding(
    'Lafayette.jpg',
    target_width=499,
    target_height=540,
)

### Question 3 from the HW

Links: 
https://www.openslr.org/83/
https://huggingface.co/datasets/ylacombe/english_dialects 


Description of the problem of interest: As someone who has spent time studying abroad in England and has visited various parts of the UK, I've always wondered how to tell apart the different accents around the Kingdom, and more specifically the regional ones within England itself. There are quite a few that I often mix up, so I think using a CNN to classify UK-specific accents from a speech dataset would be interesting and doable. All of the audio is spoken in English, but in varying accents.

In terms of ethical concerns, the main one would be the use of people's voices if they have not consented to being recorded. The dataset I linked contains speech recorded by volunteers, but using audio that was recorded without consent would be an ethical issue. I don't think there would be any ethical issues with using the results of the model.

My intuition for why a CNN is more appropriate than the other models and techniques we've covered in this class is that a CNN can treat spectrograms (time x frequency representations of audio) like images, since they have a 2D structure. Similar to image data, the audio data has spatial relationships that simpler models wouldn't be able to learn or use. We also need deep learning in this use case because the patterns in the data will be complex, requiring multi-layer networks; simpler models are not able to pick up on such complex relationships in the data. Finally, convolutional layers can break the data down into a lower-dimensional space, which helps computational efficiency and helps avoid having too many features (the scary curse of dimensionality).