## Image classification - ResNet50 - 25,000x - with MajVoting - overlap 9blocks

## Import libraries

In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# Select the GPU to use via CUDA_VISIBLE_DEVICES (setup for titan.sci.utah.edu)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"


In [None]:
#Import libraries

from fastai.imports import *
from fastai.conv_learner import *
from fastai.transforms import *

import numpy as np
import pandas as pd
import seaborn as sns

## I/O and hyper parameters

In [None]:
# Parameters and hyper-parameters

# Root directory handed to fastai. The leading '~' is expanded explicitly:
# Python path functions never expand it on their own, so a literal '~' would be
# interpreted as a directory named '~' relative to the working directory.
path = os.path.expanduser('~/Project_SEM/Project_TargetClass/scripts_notebooks/')
# Description of the full dataset, read from the current working directory
csv_all = os.path.join(os.getcwd(),'Dataset_ImageClassification_TargetClass_Overlap-9Blocks.csv')
# CSV written by this notebook after filtering/shuffling, then consumed by fastai
csv_analysis_fastai = os.path.join(os.getcwd(),'Dataset_ImageClassification_TargetClass_25000x_Overlap-9Blocks_fastai.csv')
# Network architecture
arch = resnet50
# Image size
rsz = 400
sz = 224
# Batch size
bs = 32
# Default learning rate (reassigned further down, after the learning-rate finder)
lr = 0.01

## Dataset overview

In [None]:
# Load the full dataset description into a dataframe;
# the AcquisitionDate column is parsed as dates (comma is the default separator)
df1 = pd.read_csv(csv_all, parse_dates=['AcquisitionDate'])

df1.shape

In [None]:
df1.head()

In [None]:
df1.groupby(['StartingMaterial']).size()

In [None]:
# Filter dataset: keep only the rows whose Magnification is listed below
Filter_List = ['25000x']

# Create new dataframe as an independent copy. Later cells add columns to df2;
# without copy() those writes target a filtered slice of df1 and pandas raises
# SettingWithCopyWarning.
df2 = df1[df1['Magnification'].isin(Filter_List)].copy()
#df2 = df1
print(df2.shape)

In [None]:
# Number of filtered images per starting material
print(df2['StartingMaterial'].value_counts())

In [None]:
# Retrieve individual ImageNb: 4th underscore-separated token from the end of Location.
# The column is added with assign(), which returns a new dataframe, rather than
# written in place: df2 comes from filtering df1, and assigning a column on such
# a slice raises SettingWithCopyWarning.
df2 = df2.assign(ImageNb=df2['Location'].apply(lambda loc: loc.split('_')[-4]).astype(str))
# Create individual acquisition identifier:
# Material_StartingMaterial_Magnification_ImageNb
acquisition_columns = ['Material','StartingMaterial','Magnification','ImageNb']
df2 = df2.assign(Acquisition=df2[acquisition_columns].apply('_'.join, axis=1))
df2.head()

In [None]:
# Classification label: Material and StartingMaterial joined by the word 'from'
df2['Label'] = df2[['Material','StartingMaterial']].apply('from'.join, axis=1)


In [None]:
# Write the CSV file used by fastai (deep learning part of the notebook).
# Columns kept: image path 'Location', dependent variable 'Label', and the
# 'Acquisition' identifier used later for grouping.
from sklearn.utils import shuffle

# Column selection and shuffling (fixed random_state) in a single step
df2 = shuffle(df2[['Location','Label','Acquisition']], random_state=1)

df2.to_csv(csv_analysis_fastai, na_rep='NA', index=False)

## Analysis

In [None]:
df = pd.read_csv(csv_analysis_fastai)
df.head()

In [None]:
# Bar graph of the number of images per label, saved to PNG and displayed
label_counts = df['Label'].value_counts().sort_index()
label_counts.plot(kind='bar', title='Label distribution')
fig1 = plt.gcf()
plt.tight_layout()
plt.xticks(rotation='vertical')
fig1.savefig('BarGraph_Distribution_Label-All-25000x.png')
plt.show()

## Define validation dataset

In [None]:
# Group-aware 5-fold split: the 'Acquisition' column is passed as the group key,
# so rows sharing an Acquisition value are kept in the same fold by GroupKFold.
from sklearn.model_selection import GroupKFold

groups = df['Acquisition']
group_kfold = GroupKFold(n_splits=5)
KFold_Iteration = 0
# Nothing is stored per fold. Once the loop has finished, train_index / val_index
# hold the indices of the LAST fold only; that leaked val_index is the
# validation split used by the cells below (df_val and get_data).
for train_index, val_index in group_kfold.split(df['Location'],df['Label'],groups):
    print("\n\nKFold_Iteration", KFold_Iteration)
    print("val_index",val_index)
    print("Length validation dataset: ", len(val_index))
    
    KFold_Iteration += 1
    

In [None]:
# Validation rows of the last GroupKFold split. Taken as an explicit copy because
# prediction columns are added to df_val later in the notebook; without copy()
# those assignments target a slice of df and raise SettingWithCopyWarning.
df_val = df.iloc[val_index,:].copy()
df_val.groupby(['Label']).size()

In [None]:
pd.value_counts(df_val['Label']).sort_index().plot(kind='bar', title = 'Label distribution - Validation dataset')

## Deep Learning analysis

In [None]:
# Data augmentation
transforms = [RandomRotate(5), RandomLighting(0.05, 0.05), RandomDihedral()]

In [None]:
def get_data(rsz):
    """Build the fastai ``ImageClassifierData`` object for this experiment.

    Args:
        rsz: Image size passed to ``tfms_from_model``. The argument is now
            actually used; the body previously read the notebook-level ``sz``
            regardless of what the caller passed in.

    Returns:
        The object returned by ``ImageClassifierData.from_csv``, built from
        ``csv_analysis_fastai`` with ``val_index`` as validation indices and
        the notebook-level ``arch``, ``transforms``, ``path`` and ``bs``.
    """
    tfms = tfms_from_model(arch, rsz, aug_tfms=transforms, crop_type=CropType.RANDOM, max_zoom=1.0)
    return ImageClassifierData.from_csv(path, 'data', csv_analysis_fastai, bs=bs, tfms=tfms,
                                        val_idxs=val_index, suffix='', test_name='',
                                        skip_header=True, num_workers=0)


### Check data augmentation

In [None]:
data = get_data(sz)

In [None]:
#data = data.resize(rsz, 'tmp')

In [None]:
# Validation dataset
list_val = iter(data.val_dl)


In [None]:
# Preview up to 9 images of the next validation batch, titled with their class
x,y=next(list_val)

# Denormalise the whole batch once instead of once per subplot
batch_images = data.val_ds.denorm(x)

fig,axes = plt.subplots(3,3, figsize=(12,12))
# zip() stops at the shortest input, so a batch with fewer than 9 images leaves
# the remaining axes empty instead of raising IndexError
for ax, ima, label in zip(axes.flat, batch_images, y):
    ax.set_title(data.classes[label])
    ax.imshow(ima)

In [None]:
#Training dataset
list_trn = iter(data.trn_dl)

In [None]:
# Preview up to 9 (augmented) images of the next training batch, titled with their class
x,y=next(list_trn)

# Denormalise the whole batch once instead of once per subplot
batch_images = data.trn_ds.denorm(x)

fig,axes = plt.subplots(3,3, figsize=(12,12))
# zip() stops at the shortest input, so a batch with fewer than 9 images leaves
# the remaining axes empty instead of raising IndexError
for ax, ima, label in zip(axes.flat, batch_images, y):
    ax.set_title(data.classes[label])
    ax.imshow(ima)

### Network

In [None]:
# Main commands to load data and model
learn = ConvLearner.pretrained(arch, data, precompute=False, pretrained=True, ps=[0.25,0.5])


In [None]:
learn

In [None]:
# Find automated learning rate
lrf = learn.lr_find(end_lr=10)

In [None]:
learn.sched.plot_lr()

In [None]:
learn.sched.plot(n_skip=2)

In [None]:
lr = 5e-3

In [None]:
learn.fit(lr,5)

In [None]:
learn.fit(lr,10, cycle_len=1)

In [None]:
learn.sched.plot_lr()

In [None]:
learn.sched.plot_loss()

In [None]:
learn.save('224_lastlayer_resnet50_Label_All-25000x_Overlap-9Blocks_MajVoting')

In [None]:
learn.load('224_lastlayer_resnet50_Label_All-25000x_Overlap-9Blocks_MajVoting')

In [None]:
learn.unfreeze()
lrs=np.array([lr/9,lr/3,lr])

In [None]:
lrf = learn.lr_find()

In [None]:
learn.sched.plot_lr()

In [None]:
learn.sched.plot()

In [None]:
lr=1e-4
lrs=np.array([lr/9,lr/3,lr])
learn.fit(lrs, 5, cycle_len=1, cycle_mult=2)

In [None]:
learn.sched.plot_lr()

In [None]:
learn.sched.plot_loss()

In [None]:
learn.save('224_all_resnet50_Label_All-25000x_Overlap-9Blocks_MajVoting')

In [None]:
learn.load('224_all_resnet50_Label_All-25000x_Overlap-9Blocks_MajVoting')

## Inference

In [None]:
# Inference on the validation data with test-time augmentation
log_preds,y = learn.TTA()
# Log-probabilities -> probabilities, then average along the first axis
probs = np.exp(log_preds).mean(axis=0)

accuracy_np(probs,y)

In [None]:
# Confusion matrix (per image)
from sklearn.metrics import confusion_matrix
from fastai.plots import *
y_pred = np.argmax(probs,1)
# Pin the label set to every class index so the matrix always has one row and
# one column per entry of data.classes, even when a class is absent from both
# y and y_pred (otherwise the axis names drift out of step with the rows).
class_ids = np.arange(len(data.classes))
cm = confusion_matrix(y,y_pred,labels=class_ids)
plot_confusion_matrix(cm, data.classes)
fig1 = plt.gcf()
plt.tight_layout()
fig1.savefig('ConfusionMatrix_All-25000x_Overlap-9Blocks.png')
plt.show()

In [None]:
probs

In [None]:
from sklearn.metrics import f1_score
# F1 score per class (average=None), then micro-, macro- and weighted-averaged
for averaging in (None, 'micro', 'macro', 'weighted'):
    print(f1_score(y, y_pred, average=averaging))

In [None]:
from sklearn.metrics import classification_report
# labels= fixes the reported rows to every class index, so target_names always
# lines up with data.classes even if a class never occurs in y / y_pred
print(classification_report(y, y_pred, labels=np.arange(len(data.classes)), target_names=data.classes))

### Analysis - majority voting

In [None]:
df_val.head()

In [None]:
# Attach the true and predicted class index of every validation image.
# This relies on y / y_pred being in the same row order as df_val.
# assign() returns a new dataframe, so no column is written into a slice of df
# (which raises SettingWithCopyWarning) and the cell can be re-run safely.
df_val = df_val.assign(TrueLabel=y.tolist(), PredLabel=y_pred.tolist())
df_val.head()

In [None]:
df_val.to_csv('./tmp_val.csv', index=False, na_rep = 'NA')

In [None]:
# Majority voting: one true label per acquisition = most frequent TrueLabel of
# its rows (when several values tie, the first one returned by mode() is kept)
df_TrueLabel_MajVoting = (
    df_val.groupby(['Acquisition'])['TrueLabel']
    .apply(lambda labels: labels.mode().iloc[0])
    .reset_index(name='TrueLabel_MajVoting')
)
df_TrueLabel_MajVoting.head(15)

In [None]:
df_val.groupby(['Acquisition'])['PredLabel'].apply(lambda x: x.mode())

In [None]:
# Majority voting: one predicted label per acquisition = most frequent PredLabel
# of its rows (when several values tie, the first one returned by mode() is kept)
df_PredLabel_MajVoting = (
    df_val.groupby(['Acquisition'])['PredLabel']
    .apply(lambda labels: labels.mode().iloc[0])
    .reset_index(name='PredLabel_MajVoting')
)
df_PredLabel_MajVoting.head(15)

In [None]:
TrueLabel_MajVoting = df_TrueLabel_MajVoting['TrueLabel_MajVoting'].values
print(TrueLabel_MajVoting)
print(TrueLabel_MajVoting.size)

In [None]:
PredLabel_MajVoting = df_PredLabel_MajVoting['PredLabel_MajVoting'].values
PredLabel_MajVoting

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(TrueLabel_MajVoting,PredLabel_MajVoting)

In [None]:
# After majority voting there is only one sample per acquisition, so a class can
# easily be missing; labels= keeps the report rows aligned with data.classes
print(classification_report(TrueLabel_MajVoting, PredLabel_MajVoting, labels=np.arange(len(data.classes)), target_names=data.classes))

In [None]:
# Confusion matrix after majority voting (one entry per acquisition).
# labels= guarantees one row/column per entry of data.classes even when a class
# is absent from both label arrays.
cm2 = confusion_matrix(TrueLabel_MajVoting,PredLabel_MajVoting,labels=np.arange(len(data.classes)))
plot_confusion_matrix(cm2, data.classes)
fig2 = plt.gcf()
plt.tight_layout()
fig2.savefig('ConfusionMatrix_All-25000x_Overlap-9Blocks_MajVoting.png')
plt.show()

### Exploratory analysis - variable size / shape information

In [None]:
data.classes

In [None]:
# Validation size: xx images
y.shape

In [None]:
# Predictions shape: 
# - dimension 1: 5 TTA images ( 1 main image + 4 augmented)
# - dimension 2: xx validation images
# - dimension 3: 5 classes
log_preds.shape

In [None]:
log_preds[0].shape

In [None]:
np.exp(log_preds[0])


In [None]:
# NOTE(review): log_preds[0] is a single slice along the first axis (the TTA
# axis according to the shape comments above), so this mean over axis 0 averages
# across validation images, not across TTA passes — confirm this is intended.
probs_0 = np.mean(np.exp(log_preds[0]),0)
probs_0

In [None]:
np.argmax(probs[0],0)

In [None]:
y[0]

### Quality Control

In [None]:
y_pred = np.argmax(probs,1)
y_pred

In [None]:
y

In [None]:
# Number of element for Quality control
Nb_elements = 4


In [None]:

def Retrieve_CorrectIndices(TrueLabel, PredictedLabel, ClassNb, n_elements=None):
    """Randomly pick positions of correctly predicted samples of one class.

    Args:
        TrueLabel: 1-D array-like of true class indices.
        PredictedLabel: 1-D array-like of predicted class indices (same length).
        ClassNb: Class index of interest.
        n_elements: Maximum number of positions to return. When omitted, the
            notebook-level ``Nb_elements`` is used.

    Returns:
        1-D numpy array with at most ``n_elements`` distinct positions where
        ``TrueLabel == PredictedLabel == ClassNb``, in random order. Fewer
        positions (possibly none) are returned when fewer samples match.
    """
    if n_elements is None:
        n_elements = Nb_elements

    true_arr = np.asarray(TrueLabel)
    pred_arr = np.asarray(PredictedLabel)
    # Correct prediction AND true label equal to the class of interest
    is_match = np.logical_and(np.equal(true_arr, pred_arr), np.equal(true_arr, ClassNb))
    candidates = np.where(is_match)[0]

    # Never request more samples than exist: np.random.choice(..., replace=False)
    # raises ValueError when asked for more items than the population holds.
    n_pick = min(n_elements, candidates.size)
    if n_pick <= 0:
        return candidates[:0]
    return np.random.choice(candidates, n_pick, replace=False)
    
    
    

In [None]:
Retrieve_CorrectIndices(y,y_pred,0)

In [None]:
def Retrieve_InCorrectIndices(TrueLabel, PredictedLabel, ClassNb, n_elements=None):
    """Randomly pick positions of misclassified samples of one class.

    Args:
        TrueLabel: 1-D array-like of true class indices.
        PredictedLabel: 1-D array-like of predicted class indices (same length).
        ClassNb: Class index of interest (matched against the TRUE label).
        n_elements: Maximum number of positions to return. When omitted, the
            notebook-level ``Nb_elements`` is used.

    Returns:
        1-D numpy array with at most ``n_elements`` distinct positions where
        ``TrueLabel == ClassNb`` and ``PredictedLabel != TrueLabel``, in random
        order. Fewer positions (possibly none) are returned when fewer samples
        match.
    """
    if n_elements is None:
        n_elements = Nb_elements

    true_arr = np.asarray(TrueLabel)
    pred_arr = np.asarray(PredictedLabel)
    # Wrong prediction AND true label equal to the class of interest
    is_match = np.logical_and(np.not_equal(true_arr, pred_arr), np.equal(true_arr, ClassNb))
    candidates = np.where(is_match)[0]

    # Never request more samples than exist: np.random.choice(..., replace=False)
    # raises ValueError when asked for more items than the population holds.
    n_pick = min(n_elements, candidates.size)
    if n_pick <= 0:
        return candidates[:0]
    return np.random.choice(candidates, n_pick, replace=False)

In [None]:
Retrieve_InCorrectIndices(y,y_pred,0)

In [None]:
# Inputs: idxs - positions within the validation set (as returned by the
#         Retrieve_*Indices helpers), title - text printed above the figure
def plot_Clem(idxs, title):
    """Display validation images side by side, each titled with its predicted class.

    Args:
        idxs: Sequence of positions into the validation arrays (``y`` /
            ``y_pred``). At most ``Nb_elements`` of them are shown.
        title: Text printed before the figure.

    Returns:
        None. Prints ``title`` and draws one matplotlib figure; nothing is
        drawn when ``idxs`` is empty.
    """
    print(title)
    n_images = min(len(idxs), Nb_elements)
    if n_images == 0:
        # Nothing to show (e.g. no sample of this class was misclassified)
        return
    # squeeze=False keeps `axes` a 2-D array even when a single image is shown
    fig,axes = plt.subplots(1,n_images, figsize=(20,20), squeeze=False)
    for ax, idx in zip(axes.flat, idxs):
        plot_title = data.classes[y_pred[idx]]
        # idx is a position in the validation arrays, so the file is looked up
        # positionally in df_val. Looking it up by label in the full dataframe
        # df returns rows unrelated to the prediction being inspected.
        ima=open_image(df_val['Location'].iloc[idx])
        ax.set_title(plot_title)
        ax.imshow(ima)

In [None]:
# Quality control: random sample of correctly classified images, class index 0
ClassNb = 0
title = 'Correctly classified - Class ' + data.classes[ClassNb]
plot_Clem(
    idxs=Retrieve_CorrectIndices(TrueLabel=y, PredictedLabel=y_pred, ClassNb=ClassNb),
    title=title,
)

In [None]:
# Quality control: random sample of misclassified images, class index 0
ClassNb = 0
title = 'Incorrectly classified - Class ' + data.classes[ClassNb]
plot_Clem(
    idxs=Retrieve_InCorrectIndices(TrueLabel=y, PredictedLabel=y_pred, ClassNb=ClassNb),
    title=title,
)

In [None]:
# Quality control: random sample of correctly classified images, class index 1
ClassNb = 1
title = 'Correctly classified - Class ' + data.classes[ClassNb]
plot_Clem(
    idxs=Retrieve_CorrectIndices(TrueLabel=y, PredictedLabel=y_pred, ClassNb=ClassNb),
    title=title,
)

In [None]:
# Quality control: random sample of misclassified images, class index 1
ClassNb = 1
title = 'Incorrectly classified - Class ' + data.classes[ClassNb]
plot_Clem(
    idxs=Retrieve_InCorrectIndices(TrueLabel=y, PredictedLabel=y_pred, ClassNb=ClassNb),
    title=title,
)

In [None]:
# Quality control: random sample of correctly classified images, class index 2
ClassNb = 2
title = 'Correctly classified - Class ' + data.classes[ClassNb]
plot_Clem(
    idxs=Retrieve_CorrectIndices(TrueLabel=y, PredictedLabel=y_pred, ClassNb=ClassNb),
    title=title,
)

In [None]:
# Quality control: random sample of misclassified images, class index 2
ClassNb = 2
title = 'Incorrectly classified - Class ' + data.classes[ClassNb]
plot_Clem(
    idxs=Retrieve_InCorrectIndices(TrueLabel=y, PredictedLabel=y_pred, ClassNb=ClassNb),
    title=title,
)

In [None]:
# Quality control: random sample of correctly classified images, class index 3
ClassNb = 3
title = 'Correctly classified - Class ' + data.classes[ClassNb]
plot_Clem(
    idxs=Retrieve_CorrectIndices(TrueLabel=y, PredictedLabel=y_pred, ClassNb=ClassNb),
    title=title,
)

In [None]:
# Quality control: random sample of correctly classified images, class index 4
ClassNb = 4
title = 'Correctly classified - Class ' + data.classes[ClassNb]
plot_Clem(
    idxs=Retrieve_CorrectIndices(TrueLabel=y, PredictedLabel=y_pred, ClassNb=ClassNb),
    title=title,
)