In [3]:
#To import the required libraries
import numpy as np 
import pandas as pd 
import os
import gc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


#To import some features of the tensorflow framework
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.metrics import fbeta_score
from tqdm import tqdm
import cv2
from PIL import Image

#To import some features of Keras that I'll be using later on
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
from keras import backend as K

#for model training, I'll import the required libraries from scikitlearn
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import fbeta_score
import time


In [5]:
#next step is to load and read the dataset using pandas
forest_train =pd.read_csv('/kaggle/input/planets-dataset/planet/planet/train_classes.csv')
forest_test =pd.read_csv('../input/planets-dataset/planet/planet/sample_submission.csv')

In [6]:
gc.collect()

70

In [7]:
#let's see the categories or placeholders in  the dataset
flatten = lambda l: [item for sublist in l for item in sublist]
labels_map = list(set(flatten([l.split(' ') for l in forest_train['tags'].values])))
print(labels_map)

['slash_burn', 'water', 'agriculture', 'cultivation', 'cloudy', 'artisinal_mine', 'blow_down', 'bare_ground', 'primary', 'habitation', 'selective_logging', 'road', 'conventional_mine', 'partly_cloudy', 'haze', 'blooming', 'clear']


In [8]:
gc.collect()

20

In [9]:
#assigning numerical values to each label in the form pf dictionary
labels_map = {i:j for j, i in enumerate(labels_map)}
labels_map

{'slash_burn': 0,
 'water': 1,
 'agriculture': 2,
 'cultivation': 3,
 'cloudy': 4,
 'artisinal_mine': 5,
 'blow_down': 6,
 'bare_ground': 7,
 'primary': 8,
 'habitation': 9,
 'selective_logging': 10,
 'road': 11,
 'conventional_mine': 12,
 'partly_cloudy': 13,
 'haze': 14,
 'blooming': 15,
 'clear': 16}

In [10]:
gc.collect()

40

In [11]:
# The next step is encoding of the train and test sets respectively

#Like I always do, I'll begin with the train set

# But first, let's convert the images into pixels and resize them to save memory space

X_train, Y_train = [], []
for img, label in tqdm(forest_train.values, miniters = 1000):
    target = np.zeros(17)
    for tag in label.split(' '):
        target[labels_map[tag]]=1
    X_train.append(cv2.resize(cv2.imread('../input/planets-dataset/planet/planet/train-jpg/{}.jpg'.format(img)), (64,64)))
    Y_train.append(target)

100%|██████████| 40479/40479 [01:51<00:00, 362.45it/s]


In [13]:
gc.collect()

20

In [14]:
#The same procedure will be applied on the test set

X_test=[]
for img, label in tqdm(forest_test[:40669].values, miniters = 1000):
  X_test.append(cv2.resize(cv2.imread('../input/planets-dataset/planet/planet/test-jpg/{}.jpg'.format(img)), (64,64)))
for img, label in tqdm(forest_test[40669:].values, miniters = 1000):
  X_test.append(cv2.resize(cv2.imread('../input/planets-dataset/test-jpg-additional/test-jpg-additional/{}.jpg'.format(img)), (64,64)))

100%|██████████| 40669/40669 [02:13<00:00, 303.93it/s]
100%|██████████| 20522/20522 [01:06<00:00, 307.29it/s]


In [15]:
gc.collect()

40

In [16]:
#Change lists to numpy arrays and normalize

x_train = np.array(X_train, np.float16)/255
y_train = np.array(Y_train, np.uint8)

x_test = np.array(X_test, np.float16)/255

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, shuffle = True, random_state = 1)

print(x_train.shape, y_train.shape, x_val.shape, y_val.shape,x_test.shape)

(32383, 64, 64, 3) (32383, 17) (8096, 64, 64, 3) (8096, 17) (61191, 64, 64, 3)


In [17]:
gc.collect()

40

In [18]:
#Therefore, the CNN model here will have 5 layers as earlier stated by the K-folds

#Save the weights in the paths created thus;

kfold_weights_path = os.path.join('', 'weights_kfold_' + '.h5')


cnn_model = Sequential()

#first layer
cnn_model.add(BatchNormalization(input_shape=(64, 64,3)))
cnn_model.add(Conv2D(32, kernel_size=(3, 3),padding='same', activation='relu'))
cnn_model.add(Conv2D(32, (3, 3), activation='relu'))
cnn_model.add(MaxPooling2D(pool_size=(2, 2)))
cnn_model.add(Dropout(0.25))

#second layer
cnn_model.add(Conv2D(64, kernel_size=(3, 3),padding='same', activation='relu'))
cnn_model.add(Conv2D(64, (3, 3), activation='relu'))
cnn_model.add(MaxPooling2D(pool_size=(2, 2)))
cnn_model.add(Dropout(0.25))

#third layer
cnn_model.add(Conv2D(128, kernel_size=(3, 3),padding='same', activation='relu'))
cnn_model.add(Conv2D(128, (3, 3), activation='relu'))
cnn_model.add(MaxPooling2D(pool_size=(2, 2)))
cnn_model.add(Dropout(0.25))

#fourth layer
cnn_model.add(Conv2D(256, kernel_size=(3, 3),padding='same', activation='relu'))
cnn_model.add(Conv2D(256, (3, 3), activation='relu'))
cnn_model.add(MaxPooling2D(pool_size=(2, 2)))
cnn_model.add(Dropout(0.25))
 
#fifth layer    
cnn_model.add(Flatten())
cnn_model.add(Dense(512, activation='relu'))
cnn_model.add(BatchNormalization())
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(17, activation='sigmoid'))

#defining the metrics to use
def fbeta(y_true, y_pred, threshold_shift=0):
    beta = 2

    # just in case of hipster activation at the final layer
    y_pred = K.clip(y_pred, 0, 1)

        # shifting the prediction threshold from .5 if needed
    y_pred_bin = K.round(y_pred + threshold_shift)

    tp = K.sum(K.round(y_true * y_pred_bin)) + K.epsilon()
    fp = K.sum(K.round(K.clip(y_pred_bin - y_true, 0, 1)))
    fn = K.sum(K.round(K.clip(y_true - y_pred, 0, 1)))

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    beta_squared = beta ** 2
    return (beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall + K.epsilon())
    

#Adding some optimization methods to the model to prevent overfitting
epochs = 20
learn_rate = 0.0001
opt  = optimizers.Adam(lr=learn_rate)
cnn_model.compile(loss='binary_crossentropy',optimizer=opt,metrics=[fbeta])
callbacks = [EarlyStopping(monitor='val_loss', patience=2, verbose=0)]

#Each of the above models will be checked while the one with the best weighed will be saved
cnn_model.fit( x_train,  y_train, validation_data=(x_val, y_val),batch_size=128,verbose=2, epochs=epochs,callbacks=callbacks,shuffle=True)


Epoch 1/20
253/253 - 8s - loss: 0.6223 - fbeta: 0.6190 - val_loss: 0.5443 - val_fbeta: 0.6110
Epoch 2/20
253/253 - 7s - loss: 0.3857 - fbeta: 0.7434 - val_loss: 0.2978 - val_fbeta: 0.6424
Epoch 3/20
253/253 - 7s - loss: 0.2267 - fbeta: 0.7503 - val_loss: 0.1991 - val_fbeta: 0.7187
Epoch 4/20
253/253 - 7s - loss: 0.1789 - fbeta: 0.7575 - val_loss: 0.1712 - val_fbeta: 0.7528
Epoch 5/20
253/253 - 7s - loss: 0.1620 - fbeta: 0.7675 - val_loss: 0.1652 - val_fbeta: 0.7503
Epoch 6/20
253/253 - 7s - loss: 0.1522 - fbeta: 0.7784 - val_loss: 0.1593 - val_fbeta: 0.7428
Epoch 7/20
253/253 - 7s - loss: 0.1466 - fbeta: 0.7845 - val_loss: 0.1515 - val_fbeta: 0.7687
Epoch 8/20
253/253 - 7s - loss: 0.1417 - fbeta: 0.7925 - val_loss: 0.1429 - val_fbeta: 0.7859
Epoch 9/20
253/253 - 7s - loss: 0.1389 - fbeta: 0.7963 - val_loss: 0.1383 - val_fbeta: 0.7848
Epoch 10/20
253/253 - 7s - loss: 0.1356 - fbeta: 0.8006 - val_loss: 0.1337 - val_fbeta: 0.7925
Epoch 11/20
253/253 - 7s - loss: 0.1335 - fbeta: 0.8052 - v

<tensorflow.python.keras.callbacks.History at 0x7f68f4031890>

In [19]:
gc.collect()

1553

In [20]:
#to save the training predictions


yfull_test=[]

p_val = cnn_model.predict(x_val, batch_size = 32, verbose=2)

print(fbeta_score(y_val, np.array(p_val) > 0.2, beta=2, average='samples')) 

        

#to save the test predictions
p_test = cnn_model.predict(x_test, batch_size = 128, verbose=2) 
yfull_test.append(p_test)



253/253 - 1s
0.8805625770763035
479/479 - 4s


In [23]:
#to save the output in a dataframe

output = np.array(yfull_test[0])
for i in range (1,len(yfull_test)):
    output+=np.array(yfull_test[1])
    
output = pd.DataFrame(output,columns = labels_map)

In [24]:
output

Unnamed: 0,slash_burn,water,agriculture,cultivation,cloudy,artisinal_mine,blow_down,bare_ground,primary,habitation,selective_logging,road,conventional_mine,partly_cloudy,haze,blooming,clear
0,0.000734,0.014344,0.008801,0.005340,0.001286,0.000884,0.001113,0.001143,0.998597,0.001263,0.002269,0.005343,0.000724,0.001827,0.001892,0.009549,0.996408
1,0.001851,0.019204,0.013353,0.011787,0.001888,0.001467,0.003185,0.002483,0.998160,0.002391,0.007730,0.008895,0.001455,0.003681,0.001526,0.030904,0.996055
2,0.001449,0.118012,0.210334,0.048729,0.002603,0.000842,0.000801,0.002862,0.996257,0.010249,0.001028,0.095491,0.000993,0.998521,0.001218,0.000988,0.000790
3,0.002254,0.029624,0.041107,0.033083,0.002392,0.002033,0.003395,0.003671,0.998096,0.004154,0.006876,0.013663,0.001789,0.017188,0.002940,0.019493,0.982609
4,0.001468,0.108891,0.127817,0.034724,0.085685,0.001780,0.001406,0.002696,0.917777,0.011657,0.001778,0.045866,0.001427,0.953338,0.003121,0.001774,0.001714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61186,0.000665,0.037252,0.034227,0.007276,0.688053,0.001330,0.000646,0.001181,0.321020,0.003928,0.001017,0.019505,0.000861,0.289842,0.006665,0.001096,0.001308
61187,0.008250,0.511789,0.333226,0.146601,0.004170,0.005375,0.004964,0.023149,0.992693,0.044280,0.021470,0.167179,0.003534,0.006636,0.007162,0.011565,0.990224
61188,0.000964,0.014365,0.010237,0.006310,0.001573,0.001122,0.001673,0.001494,0.998349,0.001601,0.003299,0.006901,0.000955,0.001895,0.001709,0.015232,0.996914
61189,0.001009,0.094948,0.016867,0.003968,0.623400,0.002509,0.001470,0.002722,0.256300,0.003546,0.000761,0.016607,0.002088,0.004845,0.260396,0.001486,0.046965


In [27]:
#to create a list of test predictions
predt =[]
for i in tqdm(range(output.shape[0]), miniters = 1000):
    a = output.loc[[i]]
    a = a.apply(lambda  x:x > 0.2,  axis = 1)
    a = a.transpose()
    a = a.loc [a[i]  == True]
    ' '.join(list(a.index))
    predt.append(' '.join(list(a.index)))

100%|██████████| 61191/61191 [02:03<00:00, 494.10it/s]


In [31]:
#saving the output as a CSV file
forest_test['tags'] = predt
forest_test.to_csv('Output.csv', index= False)