In [0]:
import pandas as pd
import numpy as np

## **Downloading data from Google Drive**

In [0]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import zipfile
from google.colab import drive

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('content/data')
try:
  os.makedirs(local_download_path)
except: pass

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
# list of files in Google Drive folder
file_list = drive.ListFile(
    {'q': "'1MsgfnmWPV-Nod0s1ZejYfsvbIwRMKZg_' in parents"}).GetList()

# find data in .zip format and save it
for f in file_list:
  if f['title'] == "severstal-steel-defect-detection.zip":
    fname = os.path.join(local_download_path, f['title'])
    f_ = drive.CreateFile({'id': f['id']})
    f_.GetContentFile(fname)

# extract files from zip to "extracted/" directory, this directory will be 
# used for further data modelling
zip_ref = zipfile.ZipFile(fname, 'r')
zip_ref.extractall(os.path.join(local_download_path, "extracted"))
zip_ref.close()


Define working directories

In [0]:
working_dir = os.path.join(local_download_path, "extracted")

# defining working folders and labels
train_images_folder = os.path.join(working_dir, "train_images")
train_labels_file = os.path.join(working_dir, "train.csv")

test_images_folder = os.path.join(working_dir, "test_images")
test_labels_file = os.path.join(working_dir, "sample_submission.csv")

In [0]:
train_labels = pd.read_csv(train_labels_file)
test_labels = pd.read_csv(test_labels_file)

# **Data preprocessing**

Drop duplicates

In [0]:
train_labels.drop_duplicates("ImageId", keep="last", inplace=True)

Add to the train dataframe all non-defective images, setting None as value of EncodedPixels column

In [0]:
images = os.listdir(train_images_folder)
present_rows = train_labels.ImageId.tolist()
for img in images:
    if img not in present_rows:
        train_labels = train_labels.append({"ImageId" : img, "ClassId" : 1, "EncodedPixels" : None}, 
                                            ignore_index=True)


Change EncodedPixels column, by setting 1 if images is defected and 0 otherwise

In [0]:
for index, row in train_labels.iterrows():
  train_labels.at[index, "EncodedPixels"] = int(train_labels.at[index, "EncodedPixels"] is not None)

In total we got 12,568 training samples

In [0]:
train_labels

Unnamed: 0,ImageId,ClassId,EncodedPixels
0,0002cc93b.jpg,1,1
1,0007a71bf.jpg,3,1
2,000a4bcdd.jpg,1,1
3,000f6bf48.jpg,4,1
4,0014fce06.jpg,3,1
...,...,...,...
12563,e6f273c0f.jpg,1,0
12564,2eb20e316.jpg,1,0
12565,8bcb8ebc4.jpg,1,0
12566,87b5d669b.jpg,1,0


Create data flow using ImageDataGenerator, see example here: https://medium.com/@vijayabhaskar96/tutorial-on-keras-flow-from-dataframe-1fd4493d237c

In [0]:
from keras_preprocessing.image import ImageDataGenerator

def create_datagen():
    return ImageDataGenerator(
        fill_mode='constant',
        cval=0.,
        rotation_range=10,
        height_shift_range=0.1,
        width_shift_range=0.1,
        vertical_flip=True,
        rescale=1./255,
        zoom_range=0.1,
        horizontal_flip=True,
        validation_split=0.15
    )

def create_test_gen():
    return ImageDataGenerator(rescale=1/255.).flow_from_dataframe(
        dataframe=test_labels,
        directory=test_images_folder,
        x_col='ImageId',
        class_mode=None,
        target_size=(256, 512),
        batch_size=1,
        shuffle=False
    )

def create_flow(datagen, subset_name):
    return datagen.flow_from_dataframe(
        dataframe=train_labels,
        directory=train_images_folder,
        x_col='ImageId',
        y_col='EncodedPixels',
        class_mode='other',
        target_size=(256, 512),
        batch_size=32,
        subset=subset_name
    )

In [0]:
data_generator = create_datagen()
train_gen = create_flow(data_generator, 'training')
val_gen = create_flow(data_generator, 'validation')
test_gen = create_test_gen()

Found 10683 validated image filenames.
Found 1885 validated image filenames.
Found 5506 validated image filenames.


# **Building and fiting model**

In [0]:
from keras.applications import InceptionResNetV2
from keras.models import Model
from keras.layers.core import Dense 
from keras.layers.pooling import GlobalAveragePooling2D
from keras import optimizers

In [0]:
model = InceptionResNetV2(weights='imagenet', input_shape=(256,512,3), include_top=False)
#model.load_weights('/kaggle/input/inceptionresnetv2/inception_resent_v2_weights_tf_dim_ordering_tf_kernels_notop.h5')
model.trainable=False

x=model.output
x=GlobalAveragePooling2D()(x)
x=Dense(128,activation='relu')(x)
x=Dense(64,activation='relu')(x) 
out=Dense(1,activation='sigmoid')(x) #final layer binary classifier

model_binary=Model(inputs=model.input,outputs=out) 














Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.7/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5


In [0]:
model_binary.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Fittting the data

In [0]:
STEP_SIZE_TRAIN=train_gen.n//train_gen.batch_size
STEP_SIZE_VALID=val_gen.n//val_gen.batch_size
STEP_SIZE_TEST=test_gen.n//test_gen.batch_size

model_binary.fit_generator(generator=train_gen,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=val_gen,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=15
                    )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fc63935a3c8>

Predicting test labels

In [0]:
test_gen.reset()
pred=model_binary.predict_generator(test_gen,
steps=STEP_SIZE_TEST,
verbose=1)



# **Saving results**

Create dataframe with probalities of having defects for each image

In [0]:
ids = np.array(test_labels.ImageId)
pred = np.array([p[0] for p in pred])
probabilities_df = pd.DataFrame({'ImageId': ids, 'Probability': pred}, columns=['ImageId', 'Probability'])


IndexError: ignored

In [0]:
probabilities_df

Unnamed: 0,ImageId,Probability
0,0000f269f.jpg,0.889025
1,000ccc2ac.jpg,0.823085
2,002451917.jpg,0.944195
3,003c5da97.jpg,0.999999
4,0042e163f.jpg,0.999996
...,...,...
5501,ffc9a6187.jpg,0.273317
5502,ffdb60677.jpg,0.755169
5503,ffe6e2da6.jpg,0.560049
5504,fff4fd9bb.jpg,0.992803


In [0]:
from google.colab import files
df.to_csv('filename.csv') 
files.download('filename.csv')
drive.mount('/content/gdrive') 


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
!cp /content/defect_present_probabilities.csv gdrive/My\ Drive