In [None]:
import pandas as pd
import numpy as np
import cv2
from tqdm import tqdm,tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict



In [None]:
# initialize constants
random_state = 1   # seed passed to the random forest
image_size = 200   # images are stored as image_size x image_size x 3
k_fold = 4         # number of cross validation folds

# obtain training data labels
train_df = pd.read_csv('../input/aptos2019-blindness-detection/train.csv')
# cv2.imread yields 8-bit pixels, so a uint8 buffer holds them exactly while
# using 8x less memory than numpy's default float64 allocation
images = np.empty((train_df.shape[0], image_size, image_size, 3), dtype=np.uint8)
image_path = '../input/aptos-preprocessing/'

In [None]:
# add the training images to an array

for i, image_id in enumerate(tqdm_notebook(train_df['id_code'])):
    image_file = image_path + f'{image_id}.png'
    image = cv2.imread(image_file)
    # cv2.imread returns None instead of raising when a file is missing or
    # unreadable; fail here, naming the file, rather than later in training
    if image is None:
        raise FileNotFoundError(f'could not read image: {image_file}')
    images[i, :, :, :] = image

In [None]:
# obtain one-hot encoded labels for training images
Y = pd.get_dummies(train_df['diagnosis']).values
# each row must contain exactly one 1 (a missing diagnosis would give an
# all-zero row, for which no class index exists)
if not (Y.sum(axis=1) == 1).all():
    raise ValueError('every training row must have exactly one diagnosis label')
# convert the one-hot rows to class indices (the column position within Y);
# kept as floats so Y_flat has the same dtype as before
Y_flat = Y.argmax(axis=1).astype(float)

# flatten each training image into a single feature row so X is 2D
n_features = images.shape[1] * images.shape[2] * images.shape[3]
X = images.reshape(images.shape[0], n_features)
# x_train,x_test,y_train,y_test = train_test_split(X,Y_flat,test_size=0.2,random_state=random_state)

In [None]:
# Configure the random forest classifier; this only constructs the estimator,
# no fitting happens in this cell
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    class_weight="balanced",
    random_state=random_state,
)

In [None]:
# run k_fold-fold cross validation and collect the class probabilities
# predicted for each sample
y_pred_prob = cross_val_predict(
    estimator=model,
    X=X,
    y=Y_flat,
    cv=k_fold,
    method='predict_proba',
)


In [None]:
def prob_to_csv(prob, filename, labels=None):
    '''
    helper function that writes the class prediction probabilities, together
    with the true labels, to a csv file

    prob: 2D array-like of shape (n_samples, n_classes) with the predicted
        probability of every class for every sample
    filename: string, output path without extension ('.csv' is appended)
    labels: optional 1D array-like with one true label per row of prob; it is
        written to the 'test' column. Defaults to the notebook-level Y_flat.

    The csv contains the 'test' column followed by one 'class<k>' column per
    column of prob. Raises ValueError if prob is not 2D or if labels and prob
    have different lengths.
    '''
    if labels is None:
        # backward compatible default: labels defined at notebook level
        labels = Y_flat

    prob = np.asarray(prob)
    if prob.ndim != 2:
        raise ValueError('prob must be 2D with shape (n_samples, n_classes)')

    # np.asarray drops any pandas index so rows are matched by position
    pred_dict = {'test': np.asarray(labels)}
    # one column per class, however many classes prob contains
    for k in range(prob.shape[1]):
        pred_dict[f'class{k}'] = prob[:, k]

    df_pred = pd.DataFrame(pred_dict)
    df_pred.to_csv(f'{filename}.csv', index=False)

In [None]:
# save the cross-validated prediction probabilities as a csv
# (y_pred_prob is the array returned by cross_val_predict)
prob_to_csv(y_pred_prob, 'random_forest_prob')