In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np
import tensorflow as tf
import keras
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.layers import GlobalMaxPooling2D, GlobalAveragePooling2D, Dense, Conv2D
from keras.models import Model
import os
import matplotlib.image as mpimg 
import matplotlib.pyplot as plt 
import pandas as pd
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
def show_batch(image_batch, label_batch, k=5, class_names=None):
    """Plot up to ``k * k`` images of a batch in a ``k`` x ``k`` grid, titled by class.

    Parameters
    ----------
    image_batch : indexable collection of images; each item is passed to
        ``plt.imshow``.
    label_batch : indexable collection of label rows aligned with
        ``image_batch``; the title is taken from the position of the largest
        entry of each row (the hot position for one-hot labels).
    k : grid side length.
    class_names : optional sequence indexed by class index. When omitted the
        notebook-level ``labels`` array is used, which is what the original
        implementation always did.
    """
    if class_names is None:
        class_names = labels  # notebook global built from the training CSV

    # Never index past the end of the batch when it holds fewer than k*k images.
    n_shown = min(k ** 2, len(image_batch))

    plt.figure(figsize=(12, 12))
    for n in range(n_shown):
        plt.subplot(k, k, n + 1)
        plt.imshow(image_batch[n])
        plt.title(class_names[np.argmax(label_batch[n])])
        plt.axis('off')
        
def categorical_to_index(y):
    """Turn a one-hot (or per-class score) matrix back into class indices.

    Parameters
    ----------
    y : array-like of shape (n_samples, n_classes).

    Returns
    -------
    numpy.ndarray of shape (n_samples,) holding, for every row, the column
    index of its largest entry. For one-hot rows this is the hot column.
    An empty input yields an empty integer array.
    """
    y = np.asarray(y)
    if y.size == 0:
        # argmax cannot reduce over an empty axis; there is nothing to decode.
        return np.empty(0, dtype=int)
    # Always exactly one index per row, unlike masking with `y == 1`, which
    # drops or duplicates rows that are not exactly one-hot.
    return np.argmax(y, axis=1)


def k_hold_data(image_batch, label_batch, cutoff=(0, 0.2, 0.4, 0.6, 0.8, 1)):
    """Split a batch into consecutive folds and build k-fold train/validation sets.

    Parameters
    ----------
    image_batch : array of samples, sliced along its first axis.
    label_batch : array of labels aligned with ``image_batch``.
    cutoff : increasing fractions in [0, 1]; consecutive pairs delimit one
        fold each, so ``len(cutoff) - 1`` folds are produced.

    Returns
    -------
    (Train_X, Train_Y, Val_X, Val_Y) : four lists with one entry per fold.
    Entry ``i`` of the validation lists is fold ``i``; entry ``i`` of the
    training lists is the concatenation of all other folds.
    """
    # Fold boundaries are derived from the data actually passed in and are
    # half-open ([lo, hi)), so no sample belongs to two folds.
    n_samples = len(image_batch)
    bounds = [int(fraction * n_samples) for fraction in cutoff]
    spans = list(zip(bounds[:-1], bounds[1:]))

    # Plain Python lists: folds may differ in length, so they must not be
    # packed into a single ndarray.
    image_folds = [image_batch[lo:hi] for lo, hi in spans]
    label_folds = [label_batch[lo:hi] for lo, hi in spans]

    Train_X = []; Train_Y = []; Val_X = []; Val_Y = []
    for i in range(len(spans)):
        Val_X.append(image_folds[i])
        Val_Y.append(label_folds[i])
        Train_X.append(np.concatenate(image_folds[:i] + image_folds[i + 1:]))
        Train_Y.append(np.concatenate(label_folds[:i] + label_folds[i + 1:]))
    return Train_X, Train_Y, Val_X, Val_Y

### Non-NN classifiers Performance 

In [None]:
input_path = "/kaggle/input/4771-sp20-covid"
train_dir = input_path+"/train/train"

# Read the labels. `labels` holds the sorted unique class names and `mapping`
# turns a class name into its integer index, which is then one-hot encoded.
train_y = pd.read_csv(input_path+"/train.csv")["label"]

labels = np.unique(train_y)
mapping = {label: index for index, label in enumerate(labels)}

train_y = train_y.apply(lambda x: mapping[x])
train_y = keras.utils.to_categorical(train_y)


# Read the images, resized to h x w, into one pre-allocated array.
filenames = os.listdir(train_dir)
h, w = 200, 200
train_x = np.full((len(filenames),h,w,3),np.nan)

for filename in filenames:
    file = os.path.join(train_dir, filename)
    # color_mode='rgb' already selects three channels; the deprecated
    # `grayscale` keyword is not needed (and is rejected by newer Keras).
    pic = keras.preprocessing.image.load_img(file, color_mode='rgb', target_size=(h,w))

    # os.listdir returns files in arbitrary order; the number after the last
    # "-" in the file name is used as the row index of the image.
    order = int(filename.split(".")[0].split("-")[-1])
    train_x[order] = np.array(pic)


# Generator that rescales pixel values by 1/255; a single batch holds every image.
image_generator = keras.preprocessing.image.ImageDataGenerator(rescale=1/255)
train_data_generator = image_generator.flow(train_x, train_y, batch_size=len(filenames))
# image_batch, label_batch = next(train_data_generator)
# image_batch.shape
# # data, h, w, channel 

In [None]:
# Pull the single full-size batch, flatten every image into one feature
# vector and decode the one-hot labels, then build the k-fold splits used by
# the classical (non-NN) classifiers below.
image_batch, label_batch = next(train_data_generator)
image_batch = np.array([np.ravel(image) for image in image_batch])
label_batch = categorical_to_index(label_batch)
Train_X, Train_Y, Val_X, Val_Y = k_hold_data(image_batch, label_batch)

In [None]:
# Gaussian Naive Bayes: fit on each fold's training split, score on its
# validation split, and report the mean accuracy over the folds.
from sklearn.naive_bayes import GaussianNB

print("Naive Bayes accuracy")
gnb = GaussianNB()
NB_accu = []
for train_x, train_y, val_x, val_y in zip(Train_X, Train_Y, Val_X, Val_Y):
    gnb.fit(train_x, train_y)
    val_y_pred = gnb.predict(val_x)
    # fraction of validation samples predicted correctly
    NB_accu.append(np.mean(val_y == val_y_pred))
print(np.mean(NB_accu))

In [None]:
# RBF-kernel SVM: fit on each fold's training split, score on its validation
# split, and report the mean accuracy over the folds.
from sklearn.svm import SVC

print("SVM accuracy")
clf = SVC(kernel='rbf')
SVM_accu = []
for train_x, train_y, val_x, val_y in zip(Train_X, Train_Y, Val_X, Val_Y):
    val_y_pred = clf.fit(train_x, train_y).predict(val_x)
    # fraction of validation samples predicted correctly
    SVM_accu.append(np.mean(val_y == val_y_pred))
print(np.mean(SVM_accu))

In [None]:
# k-nearest-neighbours: for every candidate neighbour count, average the
# validation accuracy over the folds; KNN_accu ends up aligned with `k`.
from sklearn.neighbors import KNeighborsClassifier

print("KNN accuracy")
k = [3, 5, 7, 10, 15, 25, 50, 75, 100, 125, 150, 175, 200, 250, 300, 350, 400]
KNN_accu = []
for n in k:
    accu = []
    for train_x, train_y, val_x, val_y in zip(Train_X, Train_Y, Val_X, Val_Y):
        neigh = KNeighborsClassifier(n_neighbors=n, algorithm='kd_tree')
        val_y_pred = neigh.fit(train_x, train_y).predict(val_x)
        accu.append(np.mean(val_y == val_y_pred))
    mean_accu = np.mean(accu)
    KNN_accu.append(mean_accu)
    print(mean_accu)

In [None]:
# Mean cross-validated accuracy as a function of the number of neighbours.
fig, ax = plt.subplots()
ax.plot(k, KNN_accu)
ax.set_title("KNN accuracy")
ax.set_xlabel("n_neigh")
ax.set_ylabel("accuracy")

### CNN Model Performance  

In [None]:
input_path = "/kaggle/input/4771-sp20-covid"
train_dir = input_path+"/train/train"

# read y 
with open(input_path+"/train.csv", "r") as f:
    train_y=pd.read_csv(f)["label"]

labels = np.unique(train_y)
# labels

mapping = {}
for i in range(len(labels)):
    mapping[labels[i]] = i
    
train_y = train_y.apply(lambda x: mapping[x])
train_y = keras.utils.to_categorical(train_y)
# train_y


# read x
filenames = os.listdir(train_dir)
h, w = 200, 200  
train_x = np.full((len(filenames),h,w,3),np.nan)
 
for i in range(len(filenames)):
    filename = filenames[i]
    file = os.path.join(train_dir, filename)
    pic = keras.preprocessing.image.load_img(file, grayscale=0, color_mode='rgb', target_size=(h,w))
#     plt.imshow(pic)
    
    # note the images are not ordered
    order = np.int(filename.split(".")[0].split("-")[-1])
    train_x[order] = np.array(pic)
    
    
# image generator 
permu = np.random.permutation(len(filenames))
cutoff = np.int(0.8*len(filenames))
image_generator = keras.preprocessing.image.ImageDataGenerator(1/255)   
train_data_generator = image_generator.flow(train_x[permu][:cutoff], train_y[permu][:cutoff], batch_size=len(filenames))

In [None]:
image_batch, label_batch = next(train_data_generator)
show_batch(image_batch, label_batch)

In [None]:
VGG19_MODEL = keras.applications.VGG19(include_top=False, weights='imagenet', classes=len(labels))
for l in VGG19_MODEL.layers:
    l.trainable = False   # use pretrained weights  


y = VGG19_MODEL.output

y = GlobalMaxPooling2D()(y)

y = Dense(256, activation='relu')(y)

y = Dense(128, activation='relu')(y)

y = Dense(56, activation='relu')(y)

y = Dense(len(labels), activation='softmax')(y)

model = Model(input=VGG19_MODEL.input, output=y)

model.compile(optimizer='RMSprop', loss='categorical_crossentropy',
#               metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
              metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
image_batch, label_batch = next(train_data_generator)
# print(image_batch.shape)
model.fit(image_batch, label_batch, epochs=200, batch_size=64, validation_split=0.1, shuffle=True)
# 200 

In [None]:
# error analysis 
val_x, val_y = train_x[permu][cutoff:], train_y[permu][cutoff:]
val_y = categorical_to_index(val_y)
val_y_pred = np.argmax(model.predict(val_x), axis=1)

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
method = None
precision = precision_score(val_y, val_y_pred, average=method)
recall = recall_score(val_y, val_y_pred, average=method)
f1 = f1_score(val_y, val_y_pred, average=method)

df = pd.DataFrame(index = ['bacterial', 'covid', 'normal', 'viral'])
df["precision"] = precision
df["recall"] = recall
df["f1 score"] = f1
df

In [None]:
# Confusion matrix of the held-out predictions, shown as a labelled table
# (scikit-learn puts true classes on rows and predicted classes on columns).
class_names = ['bacterial', 'covid', 'normal', 'viral']
c = confusion_matrix(val_y, val_y_pred)
df = pd.DataFrame(c, index=class_names, columns=class_names)
df

### If not preprocessing images:

In [None]:
# # if not rescaling the images 

# with open(input_path+"/train.csv", "r") as f:
#     train_y=pd.read_csv(f)["label"]

# labels = np.unique(train_y)

# mapping = {}
# for i in range(len(labels)):
#     mapping[labels[i]] = i
    
# train_y = train_y.apply(lambda x: mapping[x])
# train_y = keras.utils.to_categorical(train_y)



# filenames = os.listdir(train_dir)
# h, w = 200, 200  
# train_x = np.full((len(filenames),h,w,3),np.nan)
 
# for i in range(len(filenames)):
#     filename = filenames[i]
#     file = os.path.join(train_dir, filename)
#     pic = keras.preprocessing.image.load_img(file, grayscale=0, color_mode='rgb', target_size=(h,w))
#     order = np.int(filename.split(".")[0].split("-")[-1])
#     train_x[order] = np.array(pic)
    
    
    
# permu = np.random.permutation(len(filenames))
# cutoff = np.int(0.8*len(filenames))
# image_generator = keras.preprocessing.image.ImageDataGenerator()   # no rescaling 
# train_data_generator = image_generator.flow(train_x[permu][:cutoff], train_y[permu][:cutoff], batch_size=len(filenames))




# VGG19_MODEL = keras.applications.VGG19(include_top=False, weights='imagenet', classes=len(labels))
# for l in VGG19_MODEL.layers:
#     l.trainable = False   

# y = VGG19_MODEL.output
# y = GlobalMaxPooling2D()(y)
# y = Dense(256, activation='relu')(y)
# y = Dense(128, activation='relu')(y)
# y = Dense(56, activation='relu')(y)
# y = Dense(len(labels), activation='softmax')(y)
# model = Model(input=VGG19_MODEL.input, output=y)
# model.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['accuracy'])

# image_batch, label_batch = next(train_data_generator)
# model.fit(image_batch, label_batch, epochs=200, batch_size=64, validation_split=0.2, shuffle=True)

# val_x, val_y = train_x[cutoff:], train_y[cutoff:]
# val_y = categorical_to_index(val_y)
# val_y_pred = np.argmax(model.predict(val_x), axis=1)
# print('accuracy:', np.sum(val_y==val_y_pred)/len(val_y))

In [None]:
# # if not resizing the images 

# input_path = "/kaggle/input/4771-sp20-covid"
# train_dir = input_path+"/train/train"

# with open(input_path+"/train.csv", "r") as f:
#     train_y=pd.read_csv(f)["label"]

# labels = np.unique(train_y)

# mapping = {}
# for i in range(len(labels)):
#     mapping[labels[i]] = i
    
# train_y = train_y.apply(lambda x: mapping[x])
# train_y = keras.utils.to_categorical(train_y)



# filenames = os.listdir(train_dir)
# h, w = 600,600
# train_x = [[] for i in range(len(filenames))]
 
# for i in range(len(filenames)):
#     filename = filenames[i]
#     file = os.path.join(train_dir, filename)
#     pic = keras.preprocessing.image.load_img(file, grayscale=0, color_mode='rgb', target_size=(h,w))
#     order = np.int(filename.split(".")[0].split("-")[-1])
#     train_x[order] = np.array(pic)
# train_x = np.array(train_x)    
    

# permu = np.random.permutation(len(filenames))
# cutoff = np.int(0.8*len(filenames))
# image_generator = keras.preprocessing.image.ImageDataGenerator(1/255)   
# train_data_generator = image_generator.flow(train_x[permu][:cutoff], train_y[permu][:cutoff], batch_size=len(filenames))




# VGG19_MODEL = keras.applications.VGG19(include_top=False, weights='imagenet', classes=len(labels))
# for l in VGG19_MODEL.layers:
#     l.trainable = False   

# y = VGG19_MODEL.output
# y = GlobalMaxPooling2D()(y)
# y = Dense(256, activation='relu')(y)
# y = Dense(128, activation='relu')(y)
# y = Dense(56, activation='relu')(y)
# y = Dense(len(labels), activation='softmax')(y)
# model = Model(input=VGG19_MODEL.input, output=y)
# model.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['accuracy'])

# image_batch, label_batch = next(train_data_generator)
# model.fit(image_batch, label_batch, epochs=200, batch_size=64, validation_split=0.2, shuffle=True)

# val_x, val_y = train_x[cutoff:], train_y[cutoff:]
# val_y = categorical_to_index(val_y)
# val_y_pred = np.argmax(model.predict(val_x), axis=1)
# print('accuracy:', np.sum(val_y==val_y_pred)/len(val_y))

### Predict on Test Set 

In [None]:
# Load the test images with the same size (h, w) and the same generator that
# earlier cells defined for the training data.
test_dir = input_path+"/test/test"
filenames = os.listdir(test_dir)
test_x = np.full((len(filenames),h,w,3),np.nan)

for filename in filenames:
    file = os.path.join(test_dir, filename)
    # color_mode='rgb' already selects three channels; the deprecated
    # `grayscale` keyword is not needed (and is rejected by newer Keras).
    pic = keras.preprocessing.image.load_img(file, color_mode='rgb', target_size=(h,w))
    # the number after the last "-" in the file name is the row index
    order = int(filename.split(".")[0].split("-")[-1])
    test_x[order] = np.array(pic)
# shuffle=False keeps row i of the batch aligned with test image index i.
test_data_generator = image_generator.flow(test_x, y=None, batch_size=len(filenames), shuffle=False)
image_batch = next(test_data_generator)

In [None]:
# Class index with the highest predicted score for every test image.
pred = model.predict(image_batch).argmax(axis=1)

In [None]:
# Map predicted class indices back to class names and write the submission
# file with one row per test image (Id = row position in `pred`).
inverse_mapping = dict(enumerate(labels))

y = pd.DataFrame({
    'Id': np.arange(len(pred)),
    'label': [inverse_mapping[index] for index in pred],
})
y.to_csv("submission.csv", index=False)