In [0]:
import pandas as pd
import numpy as np

# **Downloading data from Google Drive**

In [0]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import zipfile
from google.colab import drive

# 1. Authenticate and create the PyDrive client.
# Authenticate the current Colab user before any Drive call is made.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to the PyDrive auth object.
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, hiding the `google.colab.drive` module imported
# earlier in this cell; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

# Choose a local (Colab) directory to store the data. The path is relative, so
# it is created under the current working directory.
local_download_path = os.path.expanduser('content/data')
# exist_ok=True tolerates a directory left over from a previous run while still
# surfacing real failures (e.g. permission errors) that the former bare
# `except: pass` silently swallowed.
os.makedirs(local_download_path, exist_ok=True)

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
# List the files stored in the Google Drive folder.
file_list = drive.ListFile(
    {'q': "'1MsgfnmWPV-Nod0s1ZejYfsvbIwRMKZg_' in parents"}).GetList()

# Set once the probabilities CSV has been downloaded, so a missing file is
# reported here instead of as a NameError on `fname` in a later cell.
found_probabilities = False

# Download the two files this notebook needs and unpack the archive.
for f in file_list:
  if f['title'] == "defect_present_probabilities.csv":
    # CSV with predictions from the previous layer (probability, for each
    # image, that any defect is present on it).
    fname = os.path.join(local_download_path, f['title'])
    f_ = drive.CreateFile({'id': f['id']})
    f_.GetContentFile(fname)
    found_probabilities = True

  elif f['title'] == "severstal-steel-defect-detection.zip":
    zipname = os.path.join(local_download_path, f['title'])
    f_ = drive.CreateFile({'id': f['id']})
    f_.GetContentFile(zipname)

    # Extract the archive into "extracted/"; this directory is used for all
    # further data modelling. The context manager closes the archive even if
    # extraction fails part-way.
    with zipfile.ZipFile(zipname, 'r') as zip_ref:
      zip_ref.extractall(os.path.join(local_download_path, "extracted"))

if not found_probabilities:
  raise FileNotFoundError(
      "defect_present_probabilities.csv was not found in the Google Drive folder")


Read labels and define working folders

In [0]:
# Load the per-image defect-presence probabilities produced by the previous
# layer. The path is rebuilt explicitly instead of reusing `fname`, which only
# exists when the download loop happened to match the file.
probabilities = pd.read_csv(
    os.path.join(local_download_path, "defect_present_probabilities.csv"))

In [0]:
# Everything unpacked from the archive lives under "<download dir>/extracted".
working_dir = os.path.join(local_download_path, "extracted")

# Image folders for the train and test splits.
train_images_folder, test_images_folder = (
    os.path.join(working_dir, folder)
    for folder in ("train_images", "test_images")
)

# Label files that accompany each split.
train_labels_file, test_labels_file = (
    os.path.join(working_dir, label_csv)
    for label_csv in ("train.csv", "sample_submission.csv")
)

In [0]:
# Read the train and test label tables (in that order) from their CSV files.
train_labels, test_labels = (
    pd.read_csv(labels_path)
    for labels_path in (train_labels_file, test_labels_file)
)

# **Data preprocessing**

**Filtering images by probability of defect presence: a probability greater than 0.5 means the image is treated as containing a defect. All further predictions are made only on these defective images.**

In [0]:
# Images whose defect-presence probability exceeds 0.5 are treated as defective.
defective_indexes = probabilities.loc[probabilities.Probability > 0.5, 'ImageId'].to_list()

# Keep only the test rows that belong to those images. A single vectorised
# `isin` filter replaces the per-row DataFrame.append loop, which copied the
# whole frame on every iteration and scanned the id list for every row.
defective_test_labels = (
    test_labels[test_labels.ImageId.isin(defective_indexes)]
    .reset_index(drop=True)
)

In [57]:
# Inspect the raw training labels (ImageId, ClassId, EncodedPixels) before encoding.
train_labels

Unnamed: 0,ImageId,ClassId,EncodedPixels
0,0002cc93b.jpg,1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0007a71bf.jpg,3,18661 28 18863 82 19091 110 19347 110 19603 11...
2,000a4bcdd.jpg,1,37607 3 37858 8 38108 14 38359 20 38610 25 388...
3,000f6bf48.jpg,4,131973 1 132228 4 132483 6 132738 8 132993 11 ...
4,0014fce06.jpg,3,229501 11 229741 33 229981 55 230221 77 230468...
...,...,...,...
7090,ffcf72ecf.jpg,3,121911 34 122167 101 122422 169 122678 203 122...
7091,fff02e9c5.jpg,3,207523 3 207777 9 208030 15 208283 22 208537 2...
7092,fffe98443.jpg,3,105929 5 106177 14 106424 24 106672 33 106923 ...
7093,ffff4eaa8.jpg,3,16899 7 17155 20 17411 34 17667 47 17923 60 18...


Represent the ClassId column as 4 separate binary columns, one per defect class. This is done because the model's target must be 4 binary values, one for each class.

In [0]:
# One-hot encode ClassId into Class_1..Class_4 and replace the original column.
# The guard makes the cell safe to re-run: once ClassId has been replaced, a
# second execution is a no-op instead of failing on the missing column.
# dtype is pinned to uint8 so the targets stay 0/1 integers regardless of the
# pandas version's default.
# NOTE(review): if train.csv holds several rows for the same ImageId (one per
# defect class), each row still gets a single-class target here - confirm
# whether labels should be aggregated per image.
if 'ClassId' in train_labels.columns:
    train_classes = pd.get_dummies(train_labels.ClassId, prefix='Class', dtype='uint8')
    train_labels = pd.concat(
        [train_labels.drop(columns=['ClassId']), train_classes], axis=1)

In [59]:
# Inspect the training labels after ClassId was replaced by the Class_1..Class_4 columns.
train_labels

Unnamed: 0,ImageId,EncodedPixels,Class_1,Class_2,Class_3,Class_4
0,0002cc93b.jpg,29102 12 29346 24 29602 24 29858 24 30114 24 3...,1,0,0,0
1,0007a71bf.jpg,18661 28 18863 82 19091 110 19347 110 19603 11...,0,0,1,0
2,000a4bcdd.jpg,37607 3 37858 8 38108 14 38359 20 38610 25 388...,1,0,0,0
3,000f6bf48.jpg,131973 1 132228 4 132483 6 132738 8 132993 11 ...,0,0,0,1
4,0014fce06.jpg,229501 11 229741 33 229981 55 230221 77 230468...,0,0,1,0
...,...,...,...,...,...,...
7090,ffcf72ecf.jpg,121911 34 122167 101 122422 169 122678 203 122...,0,0,1,0
7091,fff02e9c5.jpg,207523 3 207777 9 208030 15 208283 22 208537 2...,0,0,1,0
7092,fffe98443.jpg,105929 5 106177 14 106424 24 106672 33 106923 ...,0,0,1,0
7093,ffff4eaa8.jpg,16899 7 17155 20 17411 34 17667 47 17923 60 18...,0,0,1,0


Create data flow using ImageDataGenerator, see example here: https://medium.com/@vijayabhaskar96/tutorial-on-keras-flow-from-dataframe-1fd4493d237c

In [0]:
from keras_preprocessing.image import ImageDataGenerator

def create_datagen():
    """Build the augmenting ``ImageDataGenerator`` used for the training data.

    Pixel values are rescaled by 1/255 and 15% of the samples are reserved as
    the validation split. Augmentation consists of small rotations, shifts and
    zooms plus horizontal/vertical flips; pixels exposed by a transform are
    filled with the constant value 0.

    Returns:
        The configured ``ImageDataGenerator`` instance.
    """
    augmentation = dict(
        rotation_range=10,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='constant',
        cval=0.,
    )
    return ImageDataGenerator(
        rescale=1./255,
        validation_split=0.15,
        **augmentation
    )

def create_test_gen():
    """Create the flow that feeds the defective test images to the model.

    Images listed in ``defective_test_labels`` are read from
    ``test_images_folder``, rescaled by 1/255 and resized to 256x512. No
    labels are produced (``class_mode=None``); images arrive one per batch and
    unshuffled.

    Returns:
        The iterator returned by ``flow_from_dataframe``.
    """
    rescaling_only = ImageDataGenerator(rescale=1/255.)
    flow_options = {
        'dataframe': defective_test_labels,
        'directory': test_images_folder,
        'x_col': 'ImageId',
        'class_mode': None,
        'target_size': (256, 512),
        'batch_size': 1,
        'shuffle': False,
    }
    return rescaling_only.flow_from_dataframe(**flow_options)
# Names of the four binary target columns, one per defect class.
target_columns = ['Class_%d' % class_id for class_id in range(1, 5)]


def create_flow(datagen, subset_name):
    """Create a labelled image flow over ``train_labels`` for one subset.

    Args:
        datagen: generator object providing ``flow_from_dataframe``.
        subset_name: value forwarded as ``subset`` (the notebook passes
            'training' or 'validation').

    Returns:
        The iterator returned by ``datagen.flow_from_dataframe``: images from
        ``train_images_folder`` resized to 256x512, in batches of 32, with the
        ``target_columns`` values as targets.
    """
    flow_options = {
        'dataframe': train_labels,
        'directory': train_images_folder,
        'x_col': 'ImageId',
        'y_col': target_columns,
        'class_mode': 'other',
        'target_size': (256, 512),
        'batch_size': 32,
        'subset': subset_name,
    }
    return datagen.flow_from_dataframe(**flow_options)

In [61]:
# One generator configuration backs both the training and validation flows;
# the flows are created in that order, followed by the test flow.
data_generator = create_datagen()
train_gen, val_gen = (
    create_flow(data_generator, subset)
    for subset in ('training', 'validation')
)
test_gen = create_test_gen()

Found 6031 validated image filenames.
Found 1064 validated image filenames.
Found 4185 validated image filenames.


# **Building and fitting model**

In [45]:
from keras.applications import InceptionResNetV2
from keras.models import Sequential
from keras.layers.convolutional import Conv2D
from keras.layers.core import Dense, Dropout, Activation, Flatten 
from keras.layers.pooling import  MaxPooling2D, GlobalAveragePooling2D
from keras.layers import BatchNormalization
from keras import optimizers
from keras.models import Model

Using TensorFlow backend.


In [47]:
# Backbone: InceptionResNetV2 with ImageNet weights and without its top
# classifier, sized for 256x512 RGB inputs.
model = InceptionResNetV2(
    include_top=False,
    weights='imagenet',
    input_shape=(256, 512, 3),
)
# Alternative kept for reference: load the weights from a local file instead.
#model.load_weights('/kaggle/input/inceptionresnetv2/inception_resent_v2_weights_tf_dim_ordering_tf_kernels_notop.h5')
# Freeze the backbone so only the new head below is trainable.
model.trainable = False

# Classification head: global average pooling followed by two ReLU layers.
x = GlobalAveragePooling2D()(model.output)
for hidden_units in (128, 64):
    x = Dense(hidden_units, activation='relu')(x)
# Four independent sigmoid outputs, one probability per defect class.
out = Dense(4, activation='sigmoid')(x)

model_binary = Model(inputs=model.input, outputs=out)

























Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.7/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5


In [48]:
# Configure training: Adam optimiser, binary cross-entropy over the four
# sigmoid outputs, and accuracy as the reported metric.
model_binary.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)













Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [62]:
def _whole_batches(flow):
    """Return how many complete batches fit into the flow's ``n`` samples."""
    return flow.n // flow.batch_size


STEP_SIZE_TRAIN = _whole_batches(train_gen)
STEP_SIZE_VALID = _whole_batches(val_gen)
STEP_SIZE_TEST = _whole_batches(test_gen)

# Train for 15 epochs, validating on the validation flow after each epoch.
model_binary.fit_generator(
    generator=train_gen,
    epochs=15,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=val_gen,
    validation_steps=STEP_SIZE_VALID,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fc5bb8ee7b8>

In [66]:
# Reset the test flow before predicting - presumably so that predictions start
# from its first sample and line up with the unshuffled test rows.
test_gen.reset()
pred = model_binary.predict_generator(
    test_gen, steps=STEP_SIZE_TEST, verbose=1)



In [67]:
# Inspect the predictions: one row per test sample, one column per model output.
pred

array([[9.4000697e-03, 5.9549510e-03, 9.7300577e-01, 3.5858154e-04],
       [1.0531250e-01, 2.8689814e-01, 5.6403512e-01, 1.6555309e-02],
       [2.7801394e-02, 3.4737587e-04, 9.8257363e-01, 1.5181303e-04],
       ...,
       [6.2743455e-02, 3.5491586e-02, 9.2200935e-01, 3.5764277e-03],
       [1.1819035e-02, 4.5695305e-03, 3.5197604e-01, 5.3684664e-01],
       [2.1233857e-03, 5.0365925e-06, 9.9894536e-01, 5.3584576e-05]],
      dtype=float32)

# **Saving the results**

Save probabilities to dataframe

In [92]:
# Image ids in the same order in which the (unshuffled) test flow was built.
ids = np.array(defective_test_labels.ImageId)
classes_prediction = np.asarray(pred)

# Every id needs exactly one prediction row; fail loudly instead of silently
# pairing ids with the wrong predictions.
if len(ids) != len(classes_prediction):
    raise ValueError(
        "Got %d predictions for %d image ids" % (len(classes_prediction), len(ids)))

# Build the frame in one step instead of appending row by row, which copied
# the whole frame on every iteration. Values are widened to float64;
# NOTE(review): this is meant to match the dtype the former append loop
# produced - confirm against a previously exported CSV.
probabilities_df = pd.DataFrame(
    classes_prediction.astype(np.float64),
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])
probabilities_df.insert(0, 'ImageId', ids)

probabilities_df




Unnamed: 0,ImageId,Class_1,Class_2,Class_3,Class_4
0,0000f269f.jpg,0.009400,5.954951e-03,0.973006,0.000359
1,000ccc2ac.jpg,0.105312,2.868981e-01,0.564035,0.016555
2,002451917.jpg,0.027801,3.473759e-04,0.982574,0.000152
3,003c5da97.jpg,0.007475,5.698264e-03,0.975482,0.002478
4,0042e163f.jpg,0.101232,4.244673e-02,0.640853,0.281787
...,...,...,...,...,...
4180,ffbf79783.jpg,0.017240,5.671635e-02,0.907734,0.004130
4181,ffdb60677.jpg,0.000093,6.854534e-07,0.755652,0.175661
4182,ffe6e2da6.jpg,0.062743,3.549159e-02,0.922009,0.003576
4183,fff4fd9bb.jpg,0.011819,4.569530e-03,0.351976,0.536847


Save dataframe to file and download it.

In [98]:
from google.colab import files
from google.colab import drive
# Write the class probabilities to a CSV file in the current working directory.
output_csv = 'classes_probabilities.csv'
probabilities_df.to_csv(output_csv)
# Alternative kept for reference: download the file through the browser.
#files.download('classes_probabilities.csv')
# Mount Google Drive so the file can be copied there in the next cell.
drive.mount('/content/gdrive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
!cp /content/classes_probabilities.csv gdrive/My\ Drive/Datasets