<a href="https://colab.research.google.com/github/grfa5712/CSPB3202-HW5/blob/main/HW5_submission5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Remember to switch hardware accelerator to T4 GPU in Colab
#Install the kaggle package quietly (-q); the `kaggle` command is used further down to download the competition data
!pip install -q kaggle

In [None]:
#Before proceeding, generate a token from Kaggle and save to local drive; use files.upload to navigate to and upload kaggle.json file from local drive to Colab
from google.colab import files
# Bind the return value instead of leaving the call as the cell's last expression:
# a bare `files.upload()` makes the notebook display the returned value, which can
# expose the contents of the uploaded kaggle.json in the saved output.
uploadedFiles = files.upload()

In [None]:
# Create the ~/.kaggle directory (-p: no error if it already exists)
!mkdir -p ~/.kaggle

In [None]:
# Move the uploaded kaggle.json from the working directory into ~/.kaggle
!mv kaggle.json ~/.kaggle/

In [None]:
# Restrict the token file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the histopathologic-cancer-detection competition files with the kaggle CLI
!kaggle competitions download -c histopathologic-cancer-detection
# zipfile is used in the next cell to unpack the downloaded archive
import zipfile

In [None]:
# Unpack the downloaded archive into ./histopathologic_dataset.
# Paths are relative to the working directory so they agree with the loading cell,
# which reads from './histopathologic_dataset', instead of assuming the notebook
# always runs from /content.
with zipfile.ZipFile('./histopathologic-cancer-detection.zip', 'r') as zip_ref:
  zip_ref.extractall('./histopathologic_dataset')

In [None]:
#Import libraries and tools
import numpy as np
import pandas as pd
import os
import random
from sklearn.utils import shuffle
import shutil
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, load_model
from keras.layers import RandomFlip, RandomZoom, RandomRotation
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D, BatchNormalization
from keras.layers import Dense, Flatten, Dropout
from keras.optimizers import SGD, Adam, RMSprop

# Use seaborn's "darkgrid" style for the plots in this notebook
sns.set_style("darkgrid")

In [None]:
#Locate the image folders and load the two .csv files from the extracted dataset folder
datasetDir = './histopathologic_dataset'
testPath = f'{datasetDir}/test'
trainPath = f'{datasetDir}/train'
trainData = pd.read_csv(f'{datasetDir}/train_labels.csv')
sampleSubmission = pd.read_csv(f'{datasetDir}/sample_submission.csv')


In [None]:
#Preview the first five rows of trainData (id and label)
trainData.head(n=5)

In [None]:
#Use info to check for null values (prints per-column non-null counts and dtypes)
trainData.info()

In [None]:
#Bar chart of the number of training images per label
labelCountAx = sns.countplot(x=trainData['label'])
labelCountAx.set(xlabel='Labels (0 = No Tumor, 1 = Tumor)', ylabel= 'Count', title='Counts by Label')

In [None]:
#Tabulate how many rows carry each label and print the table
countsByLabel = pd.DataFrame({'Counts by Label': trainData['label'].value_counts()})
print(countsByLabel)

In [None]:
#Create pie chart visualization based on labels
labels_count = trainData.label.value_counts()

%matplotlib inline
plt.pie(labels_count, labels=['No Tumor', 'Tumor'], startangle=180,
        autopct='%1.1f', shadow=True)
plt.figure(figsize=(16,16))
plt.show()

In [None]:
#Display samples from both no tumor and tumor sets to show difficulty in differentiating by eye

imageCount = 4

# Build each file path from trainPath rather than repeating the folder literal.
# splitext() drops an extension the id may already carry, so this cell still produces
# valid paths if it is re-run after ".tif" has been appended to the id column.
trainData["path"] = trainData["id"].apply(
    lambda x: os.path.join(trainPath, os.path.splitext(str(x))[0] + ".tif"))

# Compare labels as strings so the selection works whether the label column still
# holds integers or has already been cast to str.
imageLabel0 = trainData[trainData["label"].astype(str) == "0"]
imageLabel1 = trainData[trainData["label"].astype(str) == "1"]

# Row 0 of the grid shows "No Tumor" samples, row 1 shows "Tumor" samples.
for row, (samples, title) in enumerate([(imageLabel0, "No Tumor"), (imageLabel1, "Tumor")]):
    for i in range(imageCount):
        image = plt.imread(samples["path"].iloc[i])

        plt.subplot(2, imageCount, row * imageCount + i + 1)
        plt.imshow(image)
        plt.axis('off')
        plt.title(title)


plt.show()

In [None]:
#Create constant for batch size to use in model below
#(passed as batch_size to both the training and the validation generator)
BATCH_SIZE = 64

In [None]:
#Prepare data for training, select random sample from both 0 and 1 labels, and then shuffle
def append_tif(string):
  """Return ``string`` with a ".tif" extension, adding it only when it is missing.

  Names that already end in ".tif" are returned unchanged, so applying the
  function more than once (e.g. when the cell is re-run on the same column)
  does not produce "name.tif.tif".
  """
  if string.endswith(".tif"):
    return string
  return string+".tif"

# File names need the .tif extension and labels must be strings for the
# binary class_mode generators created below.
trainData = trainData.assign(
    id=trainData['id'].map(append_tif),
    label=trainData['label'].astype(str),
)

In [None]:
# Draw 20000 random rows per class (fixed seed), stack them, then shuffle the rows
trainData0, trainData1 = (
    trainData[trainData['label']==classLabel].sample(20000,random_state=42)
    for classLabel in ('0', '1')
)
balancedData = pd.concat([trainData0, trainData1], axis=0).reset_index(drop=True)
trainData = shuffle(balancedData, random_state=42)
trainData['label'].value_counts()

In [None]:
#Multiply pixel values by 1/255 and reserve 20% of the rows for the validation subset
datagen = ImageDataGenerator(
    validation_split=0.2,
    rescale=1./255.,
)

In [None]:
#Generate training data: batches of 96x96 images with binary labels from the 'training' subset
trainFlowArgs = {
    'dataframe': trainData,
    'directory': trainPath,
    'x_col': 'id',
    'y_col': 'label',
    'target_size': (96,96),
    'class_mode': 'binary',
    'batch_size': BATCH_SIZE,
    'seed': 42,
    'subset': 'training',
}
trainGenerator = datagen.flow_from_dataframe(**trainFlowArgs)

In [None]:
#Generate validation data: same settings as the training generator, but the 'validation' subset
validFlowArgs = {
    'dataframe': trainData,
    'directory': trainPath,
    'x_col': 'id',
    'y_col': 'label',
    'target_size': (96,96),
    'class_mode': 'binary',
    'batch_size': BATCH_SIZE,
    'seed': 42,
    'subset': 'validation',
}
validGenerator = datagen.flow_from_dataframe(**validFlowArgs)

In [None]:
#create model
# Architecture: two identical stages (conv, conv, max-pool, dropout, batch-norm),
# then one more conv layer, flatten, and a single sigmoid unit for the binary label.
model = Sequential()

for stage in range(2):
    model.add(Conv2D(filters=32, kernel_size=(3,3), activation='relu'))
    model.add(Conv2D(filters=32, kernel_size=(3,3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Dropout(0.1))
    model.add(BatchNormalization())

model.add(Conv2D(filters=32, kernel_size=(3,3), activation='relu'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Leave the batch dimension unspecified (None): the model is trained with
# BATCH_SIZE-sized batches but the test generator feeds it batches of 1.
model.build(input_shape=(None, 96, 96, 3))

# Give the metric an explicit name. Unnamed metrics are auto-numbered when the cell
# is executed again ('auc_1', 'auc_2', ...), which would break the
# history['auc'] / history['val_auc'] lookups in the plotting cell.
ROC = tf.keras.metrics.AUC(name='auc')

model.summary()

In [None]:
# Binary cross-entropy loss, RMSprop optimizer; track accuracy and the AUC metric
model.compile(
    optimizer=RMSprop(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', ROC],
)

In [None]:
# train the model
# Model.fit accepts generators directly; fit_generator is deprecated (and absent from
# newer Keras releases). This also matches the model.predict(generator) call used for
# the test set.
historyModel = model.fit(
                        trainGenerator,
                        epochs = 10,
                        validation_data = validGenerator)

In [None]:
# List which metrics were recorded during training
history_dict = historyModel.history
recordedMetrics = history_dict.keys()
print(recordedMetrics)

In [None]:
#Plot metrics for model: one figure per metric, training curve first, then validation
# Each entry: (history key, title, y-axis label, legend entries, legend location)
metricPlots = [
    ('accuracy', 'Accuracy per Epoch', 'Accuracy', ['Train', 'Validate'], 'lower right'),
    ('loss', 'Loss per Epoch', 'Loss', ['Train', 'Validate'], 'upper right'),
    ('auc', 'AUC ROC per Epoch', 'ROC', ['train', 'validate'], 'lower right'),
]

for metricKey, plotTitle, yLabel, legendNames, legendLoc in metricPlots:
    plt.plot(historyModel.history[metricKey])
    plt.plot(historyModel.history['val_' + metricKey])
    plt.title(plotTitle)
    plt.ylabel(yLabel)
    plt.xlabel('Epoch')
    plt.legend(legendNames, loc=legendLoc)
    plt.show()

In [None]:
#Collect the file names found in the test folder into a one-column dataframe for prediction
testFileNames = os.listdir(testPath)
testDf = pd.DataFrame({'id': testFileNames})
testDf.head(n=5)

In [None]:
#Prepare test data for submission
# Same 1/255 rescaling as the training data
datagenTest = ImageDataGenerator(rescale=1./255.)

# No labels (y_col/class_mode None) and no shuffling, so predictions come back
# in the same order as the rows of testDf.
testFlowArgs = {
    'dataframe': testDf,
    'directory': testPath,
    'x_col': 'id',
    'y_col': None,
    'class_mode': None,
    'target_size': (96,96),
    'batch_size': 1,
    'shuffle': False,
}
testGenerator = datagenTest.flow_from_dataframe(**testFlowArgs)

In [None]:
# Run the trained model over every test image, showing a progress bar
predictions = model.predict(
    testGenerator,
    verbose=1,
)

In [None]:
#Prepare dataframe for submission
# Flatten the (n, 1) output of the single sigmoid unit to 1-D. Unlike
# np.transpose(predictions)[0], np.ravel still yields a 1-D array when this cell is
# executed a second time, so the cell can be re-run safely.
predictions = np.ravel(predictions)
submissionDf = pd.DataFrame()
# File name without its extension is the submission id
submissionDf['id'] = testDf['id'].apply(lambda x: x.split('.')[0])
# Threshold at 0.5: below -> 0, otherwise -> 1 (vectorized form of the same rule)
# NOTE(review): if the competition is scored on ROC AUC, submitting the raw
# probabilities instead of hard 0/1 labels may score better — confirm before changing.
submissionDf['label'] = np.where(predictions < 0.5, 0, 1)

In [None]:
# Preview the first five rows of the submission
submissionDf.head(n=5)

In [None]:
#Show how many test images were assigned to each predicted label
submissionDf.label.value_counts()

In [None]:
#Write the submission to submission.csv (without the index column) for upload to Kaggle
submissionDf.to_csv(path_or_buf='submission.csv', index=False)