# Classification - ResNet50 - All with 5-fold analysis and majority voting - overlap 9 blocks

In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
# - autoreload (mode 2): imported modules are reloaded automatically when their source changes
# - matplotlib inline: figures are rendered in the cell output
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# setup CUDA_VISIBLE DEVICES for titan.sci.utah.edu
# Setting the variable to '0' exposes only GPU 0 to this process.
# NOTE(review): this cell sits before the fastai import cell, presumably so the
# variable is in place before any CUDA initialisation - keep that order.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'


In [None]:
# This file contains all the main external libs we'll use
from fastai.imports import *

#from fastai.transforms import *
from fastai.conv_learner import *
#from fastai.model import *
#from fastai.dataset import *
#from fastai.sgdr import *
#from fastai.plots import *

import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Confusion matrix
from sklearn.metrics import confusion_matrix
from fastai.plots import *

In [None]:
# Parameters and hyper-parameters

# Root folder of the analysis.
# expanduser() replaces the leading '~' with the real home directory. pandas expands
# '~' on its own, but os.path-based consumers (file-existence checks, directory
# creation) would otherwise treat '~' as a literal folder name.
PATH = os.path.expanduser('~/Project_SEM/Project_TargetClass/Analysis_All/scripts_Clem_notebooks_CV')
# Input: index of the full dataset
csv_all = os.path.join(PATH,'Dataset_TargetClass_Overlap-9Blocks.csv')
# Output: reduced table written by the dataset-creation cells and read back for training
csv_analysis_fastai = os.path.join(PATH,'Dataset_TargetClass_Overlap-9Blocks_Filtered_All_fastai.csv')
# Network architecture
arch = resnet50
# Image size
# NOTE(review): rsz is not referenced by any cell visible here - confirm it is still needed
rsz = 400
sz = 224
# Batch size
bs = 64
# Default learning rate (the training cell sets its own values before fitting)
lr = 0.01

## Dataset creation for specific analysis

In [None]:
# Load the full dataset index into a dataframe.
# 'AcquisitionDate' is parsed as a date column instead of being left as text.
df1 = pd.read_csv(
    csv_all,
    sep=',',
    parse_dates=['AcquisitionDate'],
)

# (rows, columns) of the raw table
df1.shape


In [None]:
# Preview the first rows of the raw table
df1.head()

In [None]:
# Number of rows per 'StartingMaterial' value in the raw table
df1['StartingMaterial'].value_counts()

In [None]:
# Filter dataset
# Filtering on 'FilterTag' is currently disabled, so every row of df1 is kept.
#Filter_List = ['Filter1','Filter2','Filter3']

# Create new dataframe
#df2 = df1[df1['FilterTag'].isin(Filter_List)]
# .copy() gives df2 its own data: columns added to df2 in the following cells
# no longer appear in df1 as a side effect of sharing one object.
df2 = df1.copy()
df2.shape

In [None]:
# Number of rows per 'StartingMaterial' value in the working table
df2['StartingMaterial'].value_counts()

In [None]:
# Image number: the fourth token from the end of the '_'-separated 'Location' string
df2['ImageNb'] = df2['Location'].map(lambda location: location.split('_')[-4])
df2 = df2.astype({"ImageNb": str})

# Acquisition identifier: Material_StartingMaterial_Magnification_ImageNb
# NOTE(review): '_'.join requires all four columns to hold strings - confirm 'Magnification' is read as text
df2['Acquisition'] = (
    df2[['Material','StartingMaterial','Magnification','ImageNb']]
    .apply(lambda row: '_'.join(row), axis=1)
)
df2.head()

In [None]:
# Classification label: '<Material>from<StartingMaterial>', built row by row
df2['Label'] = (
    df2[['Material','StartingMaterial']]
    .apply(lambda row: 'from'.join(row), axis=1)
)


In [None]:
# Write the table used for fastai training to CSV.
# Three columns are kept: the image 'Location', the dependent variable 'Label',
# and 'Acquisition' (used further down as cross-validation group and voting key).
from sklearn.utils import shuffle

# Select the columns and shuffle the rows with a fixed seed
df2 = shuffle(df2[['Location','Label','Acquisition']], random_state = 1)

# No index column in the file; missing values are written as 'NA'
df2.to_csv(csv_analysis_fastai, index=False, na_rep = 'NA')

## Analysis

In [None]:
# Read back the CSV written above; the same file path is later passed to
# ImageClassifierData.from_csv, so label_df rows match what the training cell loads.
label_df = pd.read_csv(csv_analysis_fastai)
label_df.head()

In [None]:
# Bar chart of the number of rows per label, labels in alphabetical order
label_df['Label'].value_counts().sort_index().plot(kind='bar', title = 'Label distribution - Full dataset')

In [None]:
# Draw the label-distribution bar graph and save it as a PNG in the working directory
label_df['Label'].value_counts().sort_index().plot(kind='bar', title = 'Label distribution')
plt.tight_layout()
plt.xticks(rotation='vertical')

# Grab the figure that was just drawn so it can be written to disk before being shown
fig1 = plt.gcf()
fig1.savefig('BarGraph_Distribution_Label-All.png')
plt.show()

In [None]:
def MajorityVoting(df_val, y, y_pred):
    """Reduce per-row labels to one label per acquisition by majority vote.

    Parameters
    ----------
    df_val : pandas.DataFrame
        Validation rows; must contain an 'Acquisition' column. It is NOT
        modified: the vote is computed on a private copy, so passing a slice
        of a larger frame neither mutates it nor triggers
        SettingWithCopyWarning.
    y : array-like
        True label of each row, in the same order as the rows of ``df_val``.
    y_pred : array-like
        Predicted label of each row, in the same order as the rows of ``df_val``.

    Returns
    -------
    (TrueLabel_MajVoting, PredLabel_MajVoting) : tuple of numpy.ndarray
        Most frequent true / predicted label of every acquisition. Both arrays
        follow the sorted order of the 'Acquisition' values, so position i of
        each array refers to the same acquisition. On a tie the smallest
        label wins (first entry of ``Series.mode()``).

    Raises
    ------
    ValueError
        If ``y`` or ``y_pred`` does not have one entry per row of ``df_val``.
    """
    # Work on a copy so the caller's frame keeps its original columns
    votes = df_val.copy()
    votes['TrueLabel'] = np.asarray(y).tolist()
    votes['PredLabel'] = np.asarray(y_pred).tolist()

    # Majority voting: most frequent label inside each acquisition
    by_acquisition = votes.groupby('Acquisition')
    TrueLabel_MajVoting = by_acquisition['TrueLabel'].apply(lambda labels: labels.mode()[0]).values
    PredLabel_MajVoting = by_acquisition['PredLabel'].apply(lambda labels: labels.mode()[0]).values

    return TrueLabel_MajVoting, PredLabel_MajVoting

In [None]:
# Data augmentation applied on top of the model's default transforms
# (handed to tfms_from_model as aug_tfms in the training cell)
transforms = [
    RandomRotate(5),
    RandomLighting(0.05, 0.05),
    RandomDihedral(),
]

In [None]:
# 5-fold cross-validation, grouped by acquisition.
# For every fold: build the fastai data object with that fold as validation set,
# train (head first, then the unfrozen model), run TTA on the validation set and
# record accuracy / confusion matrix / classification report - once per row and
# once after majority voting per acquisition.
lr = 5e-3

# Per-fold results at row level
TTA_Accuracies = []
CM_Array = []
Report_Array = []

# Per-fold results after majority voting per acquisition
TTA_Accuracies_MajVoting = []
CM_Array_MajVoting = []
Report_Array_MajVoting = []

KFold_Iteration = 0

# KFold by group: rows sharing an 'Acquisition' value are never split across
# training and validation (GroupKFold comes from the notebook's import cell)
groups = label_df['Acquisition']
group_kfold = GroupKFold(n_splits=5)

for train_index, val_index in group_kfold.split(label_df['Location'],label_df['Label'],groups):
    print("\n\nKFold_Iteration", KFold_Iteration)
    #print("\ntrain_index",train_index)
    #print("val_index",val_index)
    print("Length validation dataset: ", len(val_index))

    tfms = tfms_from_model(arch,sz,aug_tfms=transforms, crop_type=CropType.RANDOM, max_zoom=1.0)
    data = ImageClassifierData.from_csv(PATH,'data', csv_analysis_fastai, bs=bs, tfms=tfms, val_idxs=val_index, suffix='', test_name='', skip_header=True, num_workers=2)
    learn = ConvLearner.pretrained(arch, data, precompute=False, pretrained=True, ps=[0.25,0.5])

    # Integer id of every class known to the data object. Passing the full list to
    # the sklearn metrics keeps matrices and reports the same size in every fold,
    # even when a class is absent from a fold's validation rows or predictions.
    # Without it, the per-fold matrices could not be summed afterwards and
    # target_names would not line up with the labels actually present.
    class_ids = np.arange(len(data.classes))

    print("Optimizing Last layer only...")
    lr = 5e-3
    learn.fit(lr, 5)
    #learn.precompute=False
    learn.fit(lr, 10, cycle_len=1)
    print("\nOptimizing full model...")
    learn.unfreeze()
    lr = 5e-4
    # Three learning rates derived from lr (lr/9, lr/3, lr)
    # NOTE(review): presumably one per fastai layer group, smallest first - confirm
    lrs=np.array([lr/9,lr/3,lr])
    learn.fit(lrs, 5, cycle_len=1, cycle_mult=2)

    print("TTA inference...")
    log_preds,y = learn.TTA()
    # Average the exponentiated predictions over the first axis of log_preds
    probs = np.mean(np.exp(log_preds),0)
    Accuracy = accuracy_np(probs,y)
    print("Accuracy: ", Accuracy)
    TTA_Accuracies.append(Accuracy)

    print(" Confusion Matrix...")
    y_pred = np.argmax(probs,1)
    cm = confusion_matrix(y,y_pred, labels=class_ids)
    plot_confusion_matrix(cm, data.classes)
    CM_Array.append(cm)

    print(" Classification Report...")
    Report = classification_report(y, y_pred, labels=class_ids, target_names=data.classes,output_dict=True)
    print(Report)
    Report_Array.append(Report)

    # Majority voting
    # NOTE(review): assumes y / y_pred come back in the same row order as
    # label_df.iloc[val_index] - confirm against the fastai data loader.
    # .copy() hands MajorityVoting an independent frame rather than a slice of label_df.
    df_val = label_df.iloc[val_index,:].copy()
    TrueLabel_MajVoting, PredLabel_MajVoting = MajorityVoting(df_val, y, y_pred)
    Accuracy_MajVoting = accuracy_score(TrueLabel_MajVoting,PredLabel_MajVoting)
    print("Accuracy_MajVoting: ", Accuracy_MajVoting)
    print(" Confusion Matrix - MajVoting...")
    cm_MajVoting = confusion_matrix(TrueLabel_MajVoting,PredLabel_MajVoting, labels=class_ids)
    plot_confusion_matrix(cm_MajVoting, data.classes)
    print(" Classification Report - MajVoting...")
    Report_MajVoting = classification_report(TrueLabel_MajVoting, PredLabel_MajVoting, labels=class_ids, target_names=data.classes,output_dict=True)
    print(Report_MajVoting)

    TTA_Accuracies_MajVoting.append(Accuracy_MajVoting)
    CM_Array_MajVoting.append(cm_MajVoting)
    Report_Array_MajVoting.append(Report_MajVoting)

    KFold_Iteration += 1


In [None]:
# Row-level TTA accuracy: mean and standard deviation over the folds
print("\nResults overview")
TTA_Inference_avg, TTA_Inference_std = np.mean(TTA_Accuracies), np.std(TTA_Accuracies)
print("TTA_Inference_avg",TTA_Inference_avg)
print("TTA_Inference_std",TTA_Inference_std)


In [None]:
# Display the learner object; after the cross-validation loop this is the one built in the last fold
learn

In [None]:
# Total row-level confusion matrix: element-wise sum of the per-fold matrices.
# Summing along axis 0 works for any number of folds instead of exactly five.
CM_final = np.sum(CM_Array, axis=0)

In [None]:
# Draw the summed row-level confusion matrix and save it as a PNG in the working directory
plot_confusion_matrix(CM_final, data.classes)
plt.xticks(rotation='vertical')
plt.tight_layout()

# Grab the figure that was just drawn so it can be written to disk before being shown
fig1 = plt.gcf()
fig1.savefig('ConfusionMatrix-Total_All_CV_Overlap-9Blocks.png')
plt.show()

## Majority Voting

In [None]:
# Accuracy after majority voting: mean and standard deviation over the folds
print("\nResults overview - Majority Voting")
TTA_Inference_avg_MajVoting, TTA_Inference_std_MajVoting = (
    np.mean(TTA_Accuracies_MajVoting),
    np.std(TTA_Accuracies_MajVoting),
)
print("TTA_Inference_avg",TTA_Inference_avg_MajVoting)
print("TTA_Inference_std",TTA_Inference_std_MajVoting)

In [None]:
# Total majority-voting confusion matrix: element-wise sum of the per-fold matrices.
# Summing along axis 0 works for any number of folds instead of exactly five.
CM_final_MajVoting = np.sum(CM_Array_MajVoting, axis=0)

In [None]:
# Draw the summed majority-voting confusion matrix and save it as a PNG in the working directory
plot_confusion_matrix(CM_final_MajVoting, data.classes)
plt.xticks(rotation='vertical')
plt.tight_layout()

# Grab the figure that was just drawn so it can be written to disk before being shown
fig2 = plt.gcf()
fig2.savefig('ConfusionMatrix-Total_All_CV_Overlap-9Blocks_MajVoting.png')
plt.show()

In [None]:
# Show the majority-voting classification report (a dict) of the first fold
print(Report_Array_MajVoting[0])

In [None]:
# Weighted-average F1 score of the first fold, read from the report dict
print(Report_Array_MajVoting[0]['weighted avg']['f1-score'])

In [None]:
# Weighted-average F1 score of every fold after majority voting,
# followed by its mean and standard deviation over the folds
Report_Array_MajVoting_f1score = [
    fold_report['weighted avg']['f1-score']
    for fold_report in Report_Array_MajVoting
]
print(Report_Array_MajVoting_f1score)

MajVoting_f1score_avg, MajVoting_f1score_std = (
    np.mean(Report_Array_MajVoting_f1score),
    np.std(Report_Array_MajVoting_f1score),
)
print("MajVoting_f1score_avg: ",MajVoting_f1score_avg)
print("MajVoting_f1score_std: ",MajVoting_f1score_std)

In [None]:
# Weighted-average precision of every fold after majority voting,
# followed by its mean and standard deviation over the folds
Report_Array_MajVoting_precision = [
    fold_report['weighted avg']['precision']
    for fold_report in Report_Array_MajVoting
]
print(Report_Array_MajVoting_precision)

MajVoting_precision_avg, MajVoting_precision_std = (
    np.mean(Report_Array_MajVoting_precision),
    np.std(Report_Array_MajVoting_precision),
)
print("MajVoting_precision_avg: ",MajVoting_precision_avg)
print("MajVoting_precision_std: ",MajVoting_precision_std)

In [None]:
# Weighted-average recall of every fold after majority voting,
# followed by its mean and standard deviation over the folds
Report_Array_MajVoting_recall = [
    fold_report['weighted avg']['recall']
    for fold_report in Report_Array_MajVoting
]
print(Report_Array_MajVoting_recall)

MajVoting_recall_avg, MajVoting_recall_std = (
    np.mean(Report_Array_MajVoting_recall),
    np.std(Report_Array_MajVoting_recall),
)
print("MajVoting_recall_avg: ",MajVoting_recall_avg)
print("MajVoting_recall_std: ",MajVoting_recall_std)

In [None]:
# Print the majority-voting classification report of every fold, preceded by its fold number
for i, report in enumerate(Report_Array_MajVoting):
    print("\n Iteration: ",i)
    print(report)