## PCAM Dataset Classification using K Folds autotuning
21BAI1007 - Goutham Krishnan

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from keras.optimizers import RMSprop, SGD, Adam
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import cv2

### Loading the data - 21BAI1007

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [3]:
!kaggle competitions download -c histopathologic-cancer-detection

Downloading histopathologic-cancer-detection.zip to /content
100% 6.31G/6.31G [04:53<00:00, 25.4MB/s]
100% 6.31G/6.31G [04:53<00:00, 23.1MB/s]


In [4]:
import zipfile
zip_ref = zipfile.ZipFile('/content/histopathologic-cancer-detection.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [5]:
df=pd.read_csv('/content/train_labels.csv')
df

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0
...,...,...
220020,53e9aa9d46e720bf3c6a7528d1fca3ba6e2e49f6,0
220021,d4b854fe38b07fe2831ad73892b3cec877689576,1
220022,3d046cead1a2a5cbe00b2b4847cfb7ba7cf5fe75,0
220023,f129691c13433f66e1e0671ff1fe80944816f5a2,0


### Preprocessing the data - 21BAI1007

In [6]:
df["path"] = df["id"].apply(lambda x: os.path.join("/content/train", str(x) + ".tif"))

In [7]:
df["label"] = df["label"].astype(str)
df_0 = df[df["label"] == "0"].sample(10000, random_state=42)
df_1 = df[df["label"] == "1"].sample(10000, random_state=42)
df_subset = pd.concat([df_0, df_1], ignore_index=True)

train_file_paths, test_file_paths, train_labels, test_labels = train_test_split(df_subset["path"], df_subset["label"], test_size=0.2, random_state=42)

In [8]:
import shutil

train_dir = "train_data"
if os.path.exists(train_dir):
    shutil.rmtree(train_dir)
os.makedirs(train_dir)
os.makedirs(os.path.join(train_dir, "0"))
os.makedirs(os.path.join(train_dir, "1"))
for file_path, label in zip(train_file_paths, train_labels):
    name = file_path.split("/")[-1]
    if label == "0":
        shutil.copy2(file_path, os.path.join(train_dir, "0", name))
    else:
        shutil.copy2(file_path, os.path.join(train_dir, "1", name))

In [9]:
test_dir = "test_data"
if os.path.exists(test_dir):
    shutil.rmtree(test_dir)
os.makedirs(test_dir)
os.makedirs(os.path.join(test_dir, "0"))
os.makedirs(os.path.join(test_dir, "1"))
for file_path, label in zip(test_file_paths, test_labels):
    name = file_path.split("/")[-1]
    if label == "0":
        shutil.copy2(file_path, os.path.join(test_dir, "0", name))
    else:
        shutil.copy2(file_path, os.path.join(test_dir, "1", name))

In [10]:
label_0_path_train = '/content/train_data/0'
label_1_path_train = '/content/train_data/1'

In [11]:
def load_images(file_paths, label):
    images = []
    labels = []
    for file_path in file_paths:
        image = cv2.imread(file_path, cv2.IMREAD_UNCHANGED)
        image = cv2.resize(image, (64, 64))
        images.append(image)
        labels.append(label)
    return np.array(images), np.array(labels)

In [12]:
label_0_file_paths = [os.path.join(label_0_path_train, file) for file in os.listdir(label_0_path_train)]
X_label_0, Y_label_0 = load_images(label_0_file_paths, 0)

label_1_file_paths = [os.path.join(label_1_path_train, file) for file in os.listdir(label_1_path_train)]
X_label_1, Y_label_1 = load_images(label_1_file_paths, 1)

In [13]:
x = np.concatenate((X_label_0, X_label_1), axis=0)
y = np.concatenate((Y_label_0, Y_label_1), axis=0)

### Creating the model - 21BAI1007

In [14]:
seed = 7
np.random.seed(seed)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
cvscores = []

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x.reshape(x.shape[0], -1)).reshape(x.shape)

In [16]:
for train, test in kfold.split(x, y):

  model = Sequential()

  model.add(Conv2D(filters=16, kernel_size=(3,3)))
  model.add(Conv2D(filters=16, kernel_size=(3,3)))
  model.add(MaxPooling2D(pool_size=(2,2)))

  model.add(Conv2D(filters=32, kernel_size=(3,3)))
  model.add(Conv2D(filters=32, kernel_size=(3,3)))
  model.add(Flatten())
  model.add(Dense(1, activation='sigmoid'))

  model.build(input_shape=(32, 64, 64, 3))

  model.compile(loss='binary_crossentropy', metrics=['accuracy'])

  model.fit(x[train],y[train], steps_per_epoch=687,epochs = 5,validation_data = (x[test],y[test]),validation_steps=171,verbose=1)

  scores = model.evaluate(x[test], y[test], verbose = 0)
  print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
  cvscores.append(scores[1] * 100)

Epoch 1/5



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



accuracy: 70.81%
Epoch 1/5



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



accuracy: 71.78%
Epoch 1/5



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



accuracy: 71.78%
Epoch 1/5



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



accuracy: 70.78%
Epoch 1/5



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



accuracy: 69.31%


In [19]:
import numpy as np
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

70.89% (+/- 0.90%)
