In [11]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import gc 
import matplotlib.pyplot as plt
import cv2 

import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import fbeta_score
from tqdm import tqdm 
from matplotlib.image import imread
from sklearn.model_selection import train_test_split
from keras import optimizers
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, History, TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [7]:
train_label = pd.read_csv("train_v2.csv")
train_label.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [8]:
labels = set()
def splitting_tags(tags):
    [labels.add(tag) for tag in tags.split()]
    
train = train_label.copy()
train['tags'].apply(splitting_tags)
labels = list(labels)
print(labels)

['agriculture', 'conventional_mine', 'blow_down', 'habitation', 'bare_ground', 'clear', 'primary', 'cultivation', 'blooming', 'cloudy', 'selective_logging', 'road', 'artisinal_mine', 'haze', 'partly_cloudy', 'water', 'slash_burn']


In [9]:
for tag in labels:
    train[tag] = train['tags'].apply(lambda x: 1 if tag in x.split() else 0)
    
train['image_name'] = train['image_name'].apply(lambda x: '{}.jpg'.format(x))
train.head()

Unnamed: 0,image_name,tags,agriculture,conventional_mine,blow_down,habitation,bare_ground,clear,primary,cultivation,blooming,cloudy,selective_logging,road,artisinal_mine,haze,partly_cloudy,water,slash_burn
0,train_0.jpg,haze primary,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
1,train_1.jpg,agriculture clear primary water,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2,train_2.jpg,clear primary,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
3,train_3.jpg,clear primary,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
4,train_4.jpg,agriculture clear habitation primary road,1,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0


In [12]:
columns = list(train.columns[2:])

In [13]:
columns

['agriculture',
 'conventional_mine',
 'blow_down',
 'habitation',
 'bare_ground',
 'clear',
 'primary',
 'cultivation',
 'blooming',
 'cloudy',
 'selective_logging',
 'road',
 'artisinal_mine',
 'haze',
 'partly_cloudy',
 'water',
 'slash_burn']

In [14]:
def fbeta(y_true, y_pred, beta = 2, epsilon = 1e-4):
    beta_squared = beta**2
    
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.greater(tf.cast(y_pred, tf.float32), tf.constant(0.5)), tf.float32)
    
    tp = tf.reduce_sum(y_true * y_pred, axis = 1)
    fp = tf.reduce_sum(y_pred, axis = 1) - tp
    fn = tf.reduce_sum(y_true, axis = 1) - tp
    
    precision = tp/(tp+fp+epsilon)
    recall = tp/(tp+fn+epsilon)
    
    fb = (1+beta_squared)*precision*recall / (beta_squared*precision+recall+epsilon)
    return fb

In [15]:
def multi_label_acc(y_true, y_pred, epsilon = 1e-4):
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.greater(tf.cast(y_pred, tf.float32), tf.constant(0.5)), tf.float32)
    
    tp = tf.reduce_sum(y_true * y_pred, axis = 1)
    fp = tf.reduce_sum(y_pred, axis = 1) - tp
    fn = tf.reduce_sum(y_true, axis = 1) - tp
    
    y_true = tf.cast(y_true, tf.bool)
    y_pred = tf.cast(y_pred, tf.bool)
        
    tn = tf.reduce_sum(tf.cast(tf.logical_not(y_true), tf.float32)
                       * tf.cast(tf.logical_not(y_pred), tf.float32), axis = 1)
    
    return (tp+tn)/(tp+tn+fp+fn+epsilon)

In [17]:
def build_model():
    model = Sequential()
    model.add(BatchNormalization(input_shape=(128, 128, 3)))
    model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(256, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(256, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(17, activation='sigmoid'))

    opt = Adam(learning_rate=1e-4)
    
    model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=[multi_label_acc, fbeta])

    return model

In [19]:
save_best_check_point = ModelCheckpoint(filepath = 'best_model.weights.h5', 
                                        monitor = 'val_fbeta',
                                        mode = 'max',
                                        save_best_only = True,
                                        save_weights_only = True)

In [20]:
train_image_gen = ImageDataGenerator(rescale = 1/255, validation_split = 0.2)

train_generator = train_image_gen.flow_from_dataframe(dataframe=train,
                                                directory ="train-jpg",  
                                                x_col="image_name", y_col=columns, subset="training", 
                                                batch_size=16,seed=2021, shuffle=True, 
                                                class_mode="raw", target_size=(128,128))

val_generator = train_image_gen.flow_from_dataframe(dataframe=train,
                                                directory ="train-jpg",  
                                                x_col="image_name", y_col=columns, subset="validation", 
                                                batch_size=16,seed=2021, shuffle=True, 
                                                class_mode="raw", target_size=(128,128))

Found 32384 validated image filenames.
Found 8095 validated image filenames.


In [21]:
step_train_size = int(np.ceil(train_generator.samples / train_generator.batch_size))
step_val_size = int(np.ceil(val_generator.samples / val_generator.batch_size))

In [22]:
model1 = build_model()

  super().__init__(**kwargs)


In [23]:
model1.summary()

In [26]:
model1.fit(x = train_generator, 
           steps_per_epoch = step_train_size, 
           validation_data = val_generator, 
           validation_steps = step_val_size,epochs = 15, 
           callbacks=[save_best_check_point])

Epoch 1/15
[1m2024/2024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2084s[0m 1s/step - fbeta: 0.6888 - loss: 0.2226 - multi_label_acc: 0.9151 - val_fbeta: 0.7840 - val_loss: 0.1565 - val_multi_label_acc: 0.9395
Epoch 2/15


  self.gen.throw(typ, value, traceback)


AttributeError: 'NoneType' object has no attribute 'items'

In [27]:
model2 = build_model()

In [28]:
model2.load_weights('best_model.weights.h5')

  error_msgs[id(saveable)] = saveable, e


In [29]:
sample_submission = pd.read_csv('sample_submission_v2.csv')
sample_submission1 = sample_submission.copy()
sample_submission1['image_name'] = sample_submission1['image_name'].apply(lambda x: '{}.jpg'.format(x))
sample_submission1.head()

Unnamed: 0,image_name,tags
0,test_0.jpg,primary clear agriculture road water
1,test_1.jpg,primary clear agriculture road water
2,test_2.jpg,primary clear agriculture road water
3,test_3.jpg,primary clear agriculture road water
4,test_4.jpg,primary clear agriculture road water


In [30]:
test_df = sample_submission1.iloc[:40669]['image_name'].reset_index().drop('index', axis =1)
test_df.head()

Unnamed: 0,image_name
0,test_0.jpg
1,test_1.jpg
2,test_2.jpg
3,test_3.jpg
4,test_4.jpg


In [31]:
test_image_gen = ImageDataGenerator(rescale = 1/255)

test_generator = test_image_gen.flow_from_dataframe(dataframe=test_df, 
                                                directory="test-jpg", 
                                                x_col="image_name", 
                                                y_col=None, 
                                                batch_size=16, 
                                                shuffle=False, 
                                                class_mode=None, 
                                                target_size=(128,128))

step_test_size = int(np.ceil(test_generator.samples/test_generator.batch_size))

Found 40669 validated image filenames.


In [32]:
test_generator.reset()
pred = model2.predict(test_generator, steps = step_test_size, verbose = 1)

  def workers(self):


[1m2542/2542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m852s[0m 335ms/step


In [33]:
file_names = test_generator.filenames

pred_tags = pd.DataFrame(pred)
pred_tags = pred_tags.apply(lambda x: ' '.join(np.array(labels)[x>0.5]), axis = 1)

result1 = pd.DataFrame({'image_name': file_names, 'tags': pred_tags})
result1.head()

Unnamed: 0,image_name,tags
0,test_0.jpg,clear primary
1,test_1.jpg,clear primary
2,test_2.jpg,primary partly_cloudy
3,test_3.jpg,clear primary
4,test_4.jpg,primary partly_cloudy


In [34]:
add_test_df = sample_submission1.iloc[40669:]['image_name'].reset_index().drop('index', axis =1)
add_test_df.head()

Unnamed: 0,image_name
0,file_0.jpg
1,file_1.jpg
2,file_10.jpg
3,file_100.jpg
4,file_1000.jpg


In [35]:
add_test_generator = test_image_gen.flow_from_dataframe(dataframe = add_test_df, 
                                                    directory ="test-jpg-additional", 
                                                    x_col="image_name", 
                                                    y_col=None, 
                                                    batch_size=16, 
                                                    shuffle=False, 
                                                    class_mode=None, 
                                                    target_size=(128,128))

step_test_size2 = int(np.ceil(add_test_generator.samples/add_test_generator.batch_size))

Found 20522 validated image filenames.


In [36]:
add_test_generator.reset()
add_pred = model2.predict(add_test_generator, steps = step_test_size2, verbose = 1)

  def workers(self):


[1m1283/1283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m687s[0m 535ms/step


In [37]:
file_names = add_test_generator.filenames

add_pred_tags = pd.DataFrame(add_pred)
add_pred_tags = add_pred_tags.apply(lambda x: ''.join(np.array(labels)[x>0.5]), axis = 1)

result2 = pd.DataFrame({'image_name': file_names, 'tags': add_pred_tags})
result2.head()

Unnamed: 0,image_name,tags
0,file_0.jpg,clearprimary
1,file_1.jpg,agricultureclearprimaryroad
2,file_10.jpg,primary
3,file_100.jpg,clearprimary
4,file_1000.jpg,clearprimary


In [38]:
last_result = pd.concat([result1, result2])

last_result = last_result.reset_index().drop('index', axis =1)

print(last_result.shape)
last_result.head()

(61191, 2)


Unnamed: 0,image_name,tags
0,test_0.jpg,clear primary
1,test_1.jpg,clear primary
2,test_2.jpg,primary partly_cloudy
3,test_3.jpg,clear primary
4,test_4.jpg,primary partly_cloudy


In [39]:
last_result['image_name'] = last_result['image_name'].apply(lambda x: x[:-4])
last_result.head()

Unnamed: 0,image_name,tags
0,test_0,clear primary
1,test_1,clear primary
2,test_2,primary partly_cloudy
3,test_3,clear primary
4,test_4,primary partly_cloudy


In [40]:
last_result.to_csv('submission1.csv', index = False)