# ResNet50 - multi-class classification - all data with K-fold cross-validation

In [None]:
# Notebook-wide IPython setup (keep at the top of every notebook):
#   - (re)load the autoreload extension in mode 2, so edited modules are
#     picked up automatically without restarting the kernel;
#   - render matplotlib figures inline in the cell output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# Machine-specific GPU selection (titan.sci.utah.edu): restrict this process
# to the GPU with index 1 via the CUDA_VISIBLE_DEVICES environment variable.
import os

os.environ.update(CUDA_VISIBLE_DEVICES="1")


In [None]:
# This file contains all the main external libs we'll use
from fastai.imports import *

#from fastai.transforms import *
from fastai.conv_learner import *
#from fastai.model import *
#from fastai.dataset import *
#from fastai.sgdr import *
#from fastai.plots import *

import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold

# Confusion matrix
from sklearn.metrics import confusion_matrix
from fastai.plots import *

# classification report
from sklearn.metrics import classification_report

In [None]:
# Parameters and hyper-parameters

# Project directory: holds the CSV files below and the 'data' image folder
# passed to ImageClassifierData.from_csv.
# expanduser() is required here: os.path.join() and the built-in open() do
# not expand '~', so the raw '~/...' string would be looked up as a literal
# directory named '~' relative to the current working directory.
PATH = os.path.expanduser('~/Project_SEM/Project_TargetClass//scripts_notebooks_All_CV')
# Full metadata table (input of the dataset-creation section)
csv_all = os.path.join(PATH,'Dataset_ImageClassification_TargetClass_Filtered.csv')
#csv_analysis = os.path.join(os.getcwd(),'Dataset_ImageClassification_TargetClass_Filtered_All.csv')
# Two-column CSV (Location, TargetClass) written by this notebook and read back by fastai
csv_analysis_fastai = os.path.join(PATH,'Dataset_ImageClassification_TargetClass_Filtered_All_fastai.csv')
# Backbone architecture used for every learner in this notebook
arch = resnet50
# Image sizes: rsz = resize target, sz = network input size
# NOTE(review): rsz is not referenced in the visible cells - confirm it is still needed
rsz, sz = 400, 224
# Mini-batch size and default learning rate
bs, lr = 16, 0.01

## Dataset creation for specific analysis

In [None]:
# Load the full metadata table into a dataframe;
# the 'AcquisitionDate' column is parsed as dates while reading.
df1 = pd.read_csv(
    csv_all,
    sep=',',
    parse_dates=['AcquisitionDate'],
)

# Cell output: (rows, columns) of the loaded table
df1.shape


In [None]:
# Number of rows per class in the full table.
# Uses the Series method: the top-level pd.value_counts() is deprecated in recent pandas.
df1['TargetClass'].value_counts()

In [None]:
# Filter dataset (optional - the filtering code below is currently disabled)
#Filter_List = ['Filter1','Filter2','Filter3']

# Create new dataframe
#df2 = df1[df1['TargetClass'].isin(Filter_List)]
#df2.shape
# No filter applied: df2 is bound to the same DataFrame object as df1 (an alias, not a copy)
df2 = df1

In [None]:
# Number of rows per class in the dataset selected for the analysis.
# Uses the Series method: the top-level pd.value_counts() is deprecated in recent pandas.
df2['TargetClass'].value_counts()

In [None]:
# Build the label file used by fastai (deep learning):
# only the image location and the dependent variable 'TargetClass' are kept,
# rows are shuffled with a fixed seed, then the table is written to CSV.
from sklearn.utils import shuffle

df2 = shuffle(
    df2.loc[:, ['Location', 'TargetClass']],
    random_state=1,
)

# Missing values are written as the string 'NA'; the index is not saved
df2.to_csv(csv_analysis_fastai, na_rep='NA', index=False)

In [None]:
# Bar chart of the class distribution, bars ordered by class label.
# Uses the Series method: the top-level pd.value_counts() is deprecated in recent pandas.
df2['TargetClass'].value_counts().sort_index().plot(kind='bar', title = 'TargetClass distribution - Full dataset')

## Learning rate assessment

In [None]:
# Number of samples = number of lines in the fastai CSV minus the header row.
# The file is opened in a `with` block so the handle is closed right after
# counting (the previous bare open() call never closed it).
with open(csv_analysis_fastai) as label_file:
    n = sum(1 for _ in label_file) - 1
# Return validation indexes using a 20% split (val_pct=0.2)
val_idxs = get_cv_idxs(n,val_pct=0.2)
print('n:',n)
print('Nb val_idxs',len(val_idxs))
print('val_idxs',val_idxs)

In [None]:
# Reload the label file written above; its index is what the K-fold split iterates over.
label_df = pd.read_csv(filepath_or_buffer=csv_analysis_fastai)
# Preview the first five rows
label_df.head(n=5)

In [None]:
# Augmentation pipeline passed as aug_tfms to tfms_from_model
transforms = [
    RandomRotate(5),
    RandomLighting(0.05, 0.05),
    RandomDihedral(),
]

In [None]:
# Transforms, data object and learner used only for the learning-rate search;
# validation rows are the hold-out indexes in val_idxs.
tfms = tfms_from_model(
    arch,
    sz,
    aug_tfms=transforms,
    crop_type=CropType.RANDOM,
    max_zoom=1.0,
)
data = ImageClassifierData.from_csv(
    PATH,
    'data',
    csv_analysis_fastai,
    bs=bs,
    tfms=tfms,
    val_idxs=val_idxs,
    suffix='',
    test_name='',
    skip_header=True,
    num_workers=0,
)
learn = ConvLearner.pretrained(
    arch,
    data,
    precompute=False,
    pretrained=True,
    ps=[0.25, 0.5],
)

In [None]:
# Run the learner's learning-rate finder; its results are inspected through
# learn.sched in the next cells.
# NOTE(review): the value stored in lrf is not used in the visible cells - confirm it is needed.
lrf=learn.lr_find()

In [None]:
# Plot the learning rates recorded by the scheduler during lr_find()
# (presumably learning rate vs. iteration - confirm against fastai docs).
learn.sched.plot_lr()

In [None]:
# Plot the scheduler's lr_find() recording
# (presumably loss vs. learning rate, used to pick `lr` for training - confirm).
learn.sched.plot()

In [None]:
# K-fold cross-validated training and evaluation.
# For every fold: build the data object with that fold's validation indexes,
# create a fresh pretrained learner, train it in two stages (before and after
# unfreeze), run TTA on the validation split and collect the accuracy,
# confusion matrix and classification report.
lr = 5e-3

# Per-fold results (re-initialised on every run of this cell)
TTA_Accuracies = []
CM_Array = []
Report_Array = []
KFold_Iteration = 0

# K-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state = 1)

for train_index, val_index in kf.split(label_df.index):
    print("\n\nKFold_Iteration", KFold_Iteration)
    #print("\ntrain_index",train_index)
    #print("val_index",val_index)
    print("Length validation dataset: ", len(val_index))

    # Only val_index is handed to fastai; the remaining rows form the training set
    tfms = tfms_from_model(arch,sz,aug_tfms=transforms, crop_type=CropType.RANDOM, max_zoom=1.0)
    data = ImageClassifierData.from_csv(PATH,'data', csv_analysis_fastai, bs=bs, tfms=tfms, val_idxs=val_index, suffix='', test_name='', skip_header=True, num_workers=0)
    learn = ConvLearner.pretrained(arch, data, precompute=False, pretrained=True, ps=[0.25,0.5])

    # Stage 1: train before unfreezing (5 epochs, then 10 cycles of length 1)
    print("Optimizing Last layer only...")
    lr = 5e-3
    learn.fit(lr, 5)
    #learn.precompute=False
    learn.fit(lr, 10, cycle_len=1)
    # Stage 2: unfreeze and train with three learning rates (lr/9, lr/3, lr)
    print("\nOptimizing full model...")
    learn.unfreeze()
    lr = 5e-4
    lrs=np.array([lr/9,lr/3,lr])
    learn.fit(lrs, 5, cycle_len=1, cycle_mult=2)

    # TTA returns log-predictions with a leading axis that is averaged out here
    print("TTA inference...")
    log_preds,y = learn.TTA()
    probs = np.mean(np.exp(log_preds),0)
    Accuracy = accuracy_np(probs,y)
    print("Accuracy: ", Accuracy)
    TTA_Accuracies.append(Accuracy)

    # Always evaluate against the full label set. Without `labels=`, a fold
    # whose validation split (and predictions) miss a class yields a smaller
    # confusion matrix - which cannot be summed with the other folds - and a
    # target_names length mismatch in classification_report.
    # Assumes y holds integer indexes into data.classes (as y_pred does).
    class_ids = np.arange(len(data.classes))

    print(" Confusion Matrix...")
    y_pred = np.argmax(probs,1)
    cm = confusion_matrix(y, y_pred, labels=class_ids)
    plot_confusion_matrix(cm, data.classes)
    CM_Array.append(cm)

    print(" Classification Report...")
    Report = classification_report(y, y_pred, labels=class_ids, target_names=data.classes, output_dict=True)
    print(Report)
    Report_Array.append(Report)

    KFold_Iteration += 1


In [None]:
# Summary over the folds: mean and standard deviation of the TTA accuracies
TTA_Inference_avg = np.average(TTA_Accuracies)
TTA_Inference_std = np.std(TTA_Accuracies)
print(
    "\nResults overview",
    "TTA_Inference_avg " + str(TTA_Inference_avg),
    "TTA_Inference_std " + str(TTA_Inference_std),
    sep="\n",
)


In [None]:
# Display the learner object; at this point it is the one created in the
# last K-fold iteration.
learn

In [None]:
# Element-wise sum of the per-fold confusion matrices.
# Summing over the whole list (instead of five hard-coded indexes) keeps this
# cell correct when the number of folds (n_splits) is changed.
CM_final = np.sum(CM_Array, axis=0)

In [None]:
# Draw the aggregated confusion matrix, tighten the layout, save it as PNG
# in the current working directory and display it.
plot_confusion_matrix(CM_final, data.classes)
plt.tight_layout()
fig1 = plt.gcf()
fig1.savefig('ConfusionMatrix-Total_TargetClass_All_CV.png')
plt.show()

In [None]:
# Classification report (dict form, output_dict=True) of the first fold
print(Report_Array[0])

In [None]:
# Classification reports of all folds (list of dicts, one per fold)
print(Report_Array)
