# <center>Brain tumor classification using convolutional neural networks</center>


## Generic library imports

In [1]:
#Base imports
import sys
import os
import pathlib
import importlib # for reloading local class 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# The bundled seaborn styles were renamed to 'seaborn-v0_8*' in matplotlib 3.6 and the
# old 'seaborn' alias was removed in 3.8; prefer the new name, fall back on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline


## Check notebook environment 
- Runtime environment: Google Colab or local
- Set up paths for reading datasets and saving outputs (models, figures, etc.)
    - If in *Colab Environment* copy to the running machine from Google Drive
        - datafiles
        - `btc_helpers.py` python file for importing helper classes and functions
- Check availability of CPU/GPU/TPU and get the `strategy` to use for running models


In [2]:
# Output folder and file prefix
outputSubfolder = 'btc_cnn'  # easier to manage if you create a subfolder under output
saveFilePrefix = 'btc_cnn_'  # need the 2 underscores for subsequent parsing of attributes from filename

##----------Check colab environment---------##
# The environment is identified purely by whether `google.colab` can be imported.
# Only ImportError is treated as "not on Colab"; a failure while mounting the drive
# is a real error and is allowed to propagate instead of being reported as
# "Not in Google Colab environment" while colabEnv is left set to True.
colabEnv = False
try:
    from google.colab import drive
except ImportError:
    print('Not in Google Colab environment')
else:
    colabEnv = True
    drive.mount('/content/drive')
    print('In Google Colab environment. Mounted drive at /content/drive/MyDrive')
    
##--------------Setup paths----------------##
# Chooses dataPath (where datasets are read from) and outputPath (root for saved
# outputs) depending on whether the Colab check above set colabEnv.
if colabEnv:
    # Make a data directory under the current working directory
    pathlib.Path('DataSetBrainTumor').mkdir(parents=True,exist_ok=True)
    # Only copy the datasets already converted to HDF5 files from Google Drive to the host
    !cp /content/drive/MyDrive/ColabData/07-Capstone/DataSetBrainTumor/*.h5 DataSetBrainTumor/   
    # Copy the local `btc` python package next to the notebook so it can be imported
    !cp -R /content/drive/MyDrive/DataSciProjects/GL-IDSS-ADSB/07-Capstone/btc .
    
    # Set up paths for reading the dataset (local copy) and creating output (Google Drive)
    dataPath = "./DataSetBrainTumor"
    outputPath = '/content/drive/MyDrive/MIT-IDSS-Capstone/07-Capstone/notebooks/output'
else:
    # Set up paths for reading the dataset and creating output when running locally
    dataPath = '../DataSetBrainTumor' # dir or link to dir for running local
    outputPath = './output' # dir or link to dir (usually under the location of current notebook)

# Everything this notebook saves lives under <outputPath>/<outputSubfolder>
outputPath = os.path.join(outputPath, outputSubfolder)

# Dedicated subfolders for saved models, figures, and model-tuning artifacts
modelPath, figurePath, modelTunerPath = (
    os.path.join(outputPath, leaf) for leaf in ('models', 'figures', 'model-tuner')
)

# Make sure each output folder exists (missing parents are created, existing dirs are fine)
os.makedirs(modelPath, exist_ok=True)
os.makedirs(figurePath, exist_ok=True)
os.makedirs(modelTunerPath, exist_ok=True)
    
# Report each folder of interest: a heading line followed by its directory listing
# (data folder, python source folder, then the output folder).
for _heading, _folder in (
    (f'\nContents of data folder [{dataPath}]', dataPath),
    ('\nContents of python source folder [btc]', 'btc'),
    (f'\nOutput folders [{outputPath}]', outputPath),
):
    print(_heading)
    print(os.listdir(_folder))

# Show the prefix used for all saved files, then end with a blank line
print(f'\nPrefix for all saved files {saveFilePrefix}')
print()


Not in Google Colab environment

Contents of data folder [../DataSetBrainTumor]
['Training_256.h5', 'readme.txt~', 'Training', 'Testing', 'readme.txt', 'Testing_256.h5']

Contents of python source folder [btc]
['.DS_Store', 'test_btc_DataUtils.py', '__init__.py', '__pycache__', 'btc_helpers.py']

Output folders [./output/btc_cnn]
['models', 'model-tuner', 'figures']

Prefix for all saved files btc_cnn_



In [3]:
# Imported here rather than with the base imports: on Colab the `btc` package is only
# present in the working directory after the setup cell has copied it from Google Drive.
import btc.btc_helpers as btc

# Ask the helper module which compute device is available (CPU/GPU/TPU) and keep the
# returned `strategy` for running models.
# NOTE(review): the return type is not visible here — presumably a tf.distribute strategy; confirm.
strategy = btc.getCPUorGPUorTPUStrategy()


Running on CPU


In [5]:
# Image size requested from the data utility (matches the *_256.h5 files in the data folder)
imgSize = 256

# Data helper rooted at dataPath, told the names of the training and testing subfolders
dataUtil = btc.DataUtil(dataPath, 'Training', 'Testing')

# Fetch the train/test image arrays together with their accompanying dataframes
trainArr, testarr, trainDf, testDf = dataUtil.getTrainTestData(imgSize)

# Show how the tumor categories are distributed across the two splits
display(
    btc.getLabelDistributionDf(
        dict(train=trainDf['tumorCategory'], test=testDf['tumorCategory'])
    )
)

Updating cache with training and testing datasets
Caching train and test datasets
Reading HDF5 file ../DataSetBrainTumor/Training_256.h5
Reading HDF5 file ../DataSetBrainTumor/Testing_256.h5
Returning cached [Original] training and testing datasets


Unnamed: 0,train,trainFraction,test,testFraction
glioma,829.0,0.287747,100.0,0.248756
meningioma,830.0,0.288094,115.0,0.28607
no_tumor,395.0,0.137105,113.0,0.281095
pituitary,827.0,0.287053,74.0,0.18408
Total,2881.0,1.0,402.0,1.0


In [7]:
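# Run locally (with the two lines below uncommented) so that the 150-px HDF5 files are generated.
# On Colab this will take a long time unless the raw data folders are copied to the remote host.
# NOTE: the output shown under this cell comes from such a run; uncommenting rebinds
# trainArr/testarr/trainDf/testDf to the merged-and-resplit 150-px data.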

# trainArr,testarr,trainDf,testDf = dataUtil.getTrainTestData(150,mergeSplit='all')
# display(btc.getLabelDistributionDf({'train':trainDf['tumorCategory'],'test':testDf['tumorCategory']}))

Updating cache with training and testing datasets
Converting dataset to HDF5 files
wrote file /Users/subravcr/Google Drive/DataSciProjects/GL-IDSS-ADSB/DataSetBrainTumor/Training_150.h5
Reading HDF5 file ../DataSetBrainTumor/Training_150.h5
wrote file /Users/subravcr/Google Drive/DataSciProjects/GL-IDSS-ADSB/DataSetBrainTumor/Testing_150.h5
Reading HDF5 file ../DataSetBrainTumor/Testing_150.h5
Caching train and test datasets
Reading HDF5 file ../DataSetBrainTumor/Training_150.h5
Reading HDF5 file ../DataSetBrainTumor/Testing_150.h5
merging cached training and testing datasets

Splitting ratio for merged dataset is set to 0.20
Returning cached [Merged&Split] training and testing datasets


Unnamed: 0,train,trainFraction,test,testFraction
glioma,718.0,0.27342,211.0,0.321157
meningioma,763.0,0.290556,182.0,0.277017
no_tumor,414.0,0.157654,94.0,0.143075
pituitary,731.0,0.27837,170.0,0.258752
Total,2626.0,1.0,657.0,1.0
