In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("samuelcortinhas/cats-and-dogs-image-classification")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/samuelcortinhas/cats-and-dogs-image-classification?dataset_version_number=4...


100%|██████████| 64.4M/64.4M [00:00<00:00, 91.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/samuelcortinhas/cats-and-dogs-image-classification/versions/4


In [7]:
# === Cell 1: Import libraries ===
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import joblib

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [8]:
# Define paths (adjust these paths as necessary)
DATASET_PATH = '/root/.cache/kagglehub/datasets/samuelcortinhas/cats-and-dogs-image-classification/versions/4'
TRAIN_PATH = os.path.join(DATASET_PATH, 'train')
TEST_PATH = os.path.join(DATASET_PATH, 'test')

In [9]:
# Define image parameters
IMG_WIDTH, IMG_HEIGHT = 128, 128  # You can adjust as needed
CHANNELS = 3  # Use 1 for grayscale or 3 for RGB

In [10]:
# === Cell 2: Data Loading and Preprocessing for scikit-learn models ===
def load_images(path, img_size=(IMG_WIDTH, IMG_HEIGHT), flatten=True, color=True):
    images = []
    labels = []
    classes = ['cats', 'dogs']
    for label in classes:
        folder = os.path.join(path, label)
        for img_path in glob(os.path.join(folder, '*.jpeg')) + glob(os.path.join(folder, '*.jpg')) + glob(os.path.join(folder, '*.png')):
            img = cv2.imread(img_path)
            if img is None:
                continue
            if color:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img = cv2.resize(img, img_size)
            if flatten:
                img = img.flatten()
            images.append(img)
            labels.append(label)
    return np.array(images), np.array(labels)


In [11]:
# Load training data for traditional ML models (using grayscale for simplicity)
X, y = load_images(TRAIN_PATH, flatten=True, color=False)
print("Loaded training images for traditional models:", X.shape)

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Scale pixel values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.astype('float32'))

# Split into train/validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)


Loaded training images for traditional models: (557, 16384)


In [12]:
# === Cell 3: Train scikit-learn models ===
# SVM
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_val)
print("SVM Accuracy:", accuracy_score(y_val, y_pred))

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_val)
print("Random Forest Accuracy:", accuracy_score(y_val, y_pred))

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_val)
print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred))

# K-means Clustering (unsupervised – note: this won’t directly predict labels)
kmeans_model = KMeans(n_clusters=2, random_state=42)
kmeans_model.fit(X_train)
# For demonstration, we assign clusters to labels (this is heuristic)
print("K-means clustering centers:", kmeans_model.cluster_centers_)

SVM Accuracy: 0.5625
Random Forest Accuracy: 0.5982142857142857
Logistic Regression Accuracy: 0.5714285714285714
K-means clustering centers: [[ 0.7147157   0.7126295   0.71017164 ...  0.74235755  0.7440248
   0.74322426]
 [-0.48604226 -0.4907742  -0.49254894 ... -0.50438845 -0.5145112
  -0.5062493 ]]


In [13]:
# Save the scikit-learn models
os.makedirs('/content/models', exist_ok=True)
joblib.dump(svm_model, '/content/models/svm_model.pkl')
joblib.dump(rf_model, '/content/models/rf_model.pkl')
joblib.dump(lr_model, '/content/models/lr_model.pkl')
joblib.dump(kmeans_model, '/content/models/kmeans_model.pkl')
joblib.dump(scaler, '/content/models/scaler.pkl')
joblib.dump(le, '/content/models/label_encoder.pkl')

['/content/models/label_encoder.pkl']

In [14]:
# === Cell 4: Data Loading and Preprocessing for CNN ===
# Use ImageDataGenerator for CNN – assuming images are in color.
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_generator = train_datagen.flow_from_directory(
    TRAIN_PATH,
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=32,
    class_mode='categorical',
    subset='training'
)

validation_generator = train_datagen.flow_from_directory(
    TRAIN_PATH,
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=32,
    class_mode='categorical',
    subset='validation'
)

Found 447 images belonging to 2 classes.
Found 110 images belonging to 2 classes.


In [15]:
# === Cell 5: Build and Train CNN Model ===
cnn_model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, CHANNELS)),
    MaxPooling2D(pool_size=(2,2)),
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
cnn_model.fit(train_generator, epochs=100, validation_data=validation_generator)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


Epoch 1/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1s/step - accuracy: 0.5657 - loss: 1.1616 - val_accuracy: 0.5000 - val_loss: 0.6821
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.5168 - loss: 0.6702 - val_accuracy: 0.5091 - val_loss: 0.6912
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.6357 - loss: 0.6633 - val_accuracy: 0.5818 - val_loss: 0.6964
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 1s/step - accuracy: 0.7127 - loss: 0.5822 - val_accuracy: 0.5818 - val_loss: 0.6730
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.8028 - loss: 0.5020 - val_accuracy: 0.5545 - val_loss: 0.6935
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy: 0.8396 - loss: 0.3941 - val_accuracy: 0.6636 - val_loss: 0.6547
Epoch 7/100
[1m14/14[0m [32m━━━

<keras.src.callbacks.history.History at 0x7846076298d0>

In [16]:
# Evaluate CNN on validation set
cnn_loss, cnn_acc = cnn_model.evaluate(validation_generator)
print("CNN Accuracy:", cnn_acc)

# Save CNN model
cnn_model.save('/content/models/cnn_model.h5')

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 241ms/step - accuracy: 0.7670 - loss: 1.6217




CNN Accuracy: 0.7090908885002136
