## PCam dataset classification - 21BAI1007

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [3]:
!kaggle competitions download -c histopathologic-cancer-detection

Downloading histopathologic-cancer-detection.zip to /content
100% 6.30G/6.31G [01:03<00:00, 196MB/s]
100% 6.31G/6.31G [01:03<00:00, 107MB/s]


### Loading Dataset - 21BAI1007

In [4]:
import zipfile
zip_ref = zipfile.ZipFile('/content/histopathologic-cancer-detection.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [8]:
import pandas as pd
import numpy as np

import tensorflow as tf
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation

import cv2
import matplotlib.pyplot as plt
%matplotlib inline
import os

In [15]:
test_path = '../content/test/'
train_path = '../content/train/'
train_data = pd.read_csv('../content/train_labels.csv')

Labels <br>
0 = no tumor
1 = tumor

In [11]:
# No of images in each folder
print(len(os.listdir('../content/train')))
print(len(os.listdir('../content/test')))

220025
57458


In [16]:
train_data.info()
print("")
print(train_data.head())
print("")
print(train_data.describe())
print("")
print(len(os.listdir(test_path)))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220025 entries, 0 to 220024
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      220025 non-null  object
 1   label   220025 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.4+ MB

                                         id  label
0  f38a6374c348f90b587e046aac6079959adf3835      0
1  c18f2d887b7ae4f6742ee445113fa1aef383ed77      1
2  755db6279dae599ebb4d39a9123cce439965282d      0
3  bc3f0c64fb968ff4a8bd33af6971ecae77c75e08      0
4  068aba587a4950175d04c680d38943fd488d6a9d      0

               label
count  220025.000000
mean        0.405031
std         0.490899
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000

57458


### Preprocessing and imageGeneration - 21BAI1007

In [17]:
train_data["id"] = train_data["id"].apply(lambda x: x + ".tif")
train_data["label"] = train_data["label"].astype(str)

In [18]:
datagen = ImageDataGenerator(rescale=1./255., validation_split=0.2)

In [19]:
train_generator = datagen.flow_from_dataframe(
    dataframe=train_data,
    directory=train_path,
    x_col="id",
    y_col="label",
    subset="training",
    batch_size=256,
    seed=13,
    class_mode="binary",
    target_size=(64,64),
    shuffle=True)

Found 176020 validated image filenames belonging to 2 classes.


In [20]:
valid_generator = datagen.flow_from_dataframe(
    dataframe=train_data,
    directory=train_path,
    x_col="id",
    y_col="label",
    subset="validation",
    batch_size=256,
    seed=13,
    class_mode="binary",
    target_size=(64,64),
    shuffle=True)

Found 44005 validated image filenames belonging to 2 classes.


### Creating the model - 21BAI1007

In [22]:
model = Sequential()

model.add(Conv2D(filters=16, kernel_size=(3,3)))
model.add(Conv2D(filters=16, kernel_size=(3,3)))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(filters=32, kernel_size=(3,3)))
model.add(Conv2D(filters=32, kernel_size=(3,3)))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.build(input_shape=(32, 64, 64, 3))

model.compile(loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (32, 62, 62, 16)          448       
                                                                 
 conv2d_5 (Conv2D)           (32, 60, 60, 16)          2320      
                                                                 
 max_pooling2d_1 (MaxPoolin  (32, 30, 30, 16)          0         
 g2D)                                                            
                                                                 
 conv2d_6 (Conv2D)           (32, 28, 28, 32)          4640      
                                                                 
 conv2d_7 (Conv2D)           (32, 26, 26, 32)          9248      
                                                                 
 flatten_1 (Flatten)         (32, 21632)               0         
                                                      

In [25]:
model.fit(train_generator,steps_per_epoch=687,epochs = 5,validation_data = valid_generator,validation_steps=171,verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7838fd7c36d0>

In [27]:
test_data = pd.DataFrame({'id':os.listdir(test_path)})
test_data.head()

Unnamed: 0,id
0,c44ad7c19871ab1d338b55a0d5634e2cfa886a44.tif
1,0a3230028b14c2079b2e11216ea1e54552c1bfe2.tif
2,bc709dd33f5388cba19f3eee31bfceaaad5b4580.tif
3,542688e2352257d33b51887c18fcd4250d5bfec7.tif
4,653777df3c7c8d6611233c22dae5977b00061c63.tif


### Prediciting the test dataset - 21BAI1007

In [28]:
datagen_test = ImageDataGenerator(rescale=1./255.)

test_generator = datagen_test.flow_from_dataframe(
    dataframe=test_data,
    directory=test_path,
    x_col='id',
    y_col=None,
    target_size=(64,64),
    batch_size=1,
    shuffle=False,
    class_mode=None)

Found 57458 validated image filenames.


In [30]:
results = model.predict(test_generator, verbose=1)



In [32]:
results

array([[0.08220385],
       [0.51981044],
       [0.32235596],
       ...,
       [0.5719396 ],
       [0.07966693],
       [0.38588288]], dtype=float32)

In [33]:
results = np.transpose(results)[0]

In [35]:
answer = list(map(lambda x: 0 if x < 0.5 else 1, results))

In [36]:
answer

[0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
