In [4]:
import pandas as pd
import numpy as np
import cv2
import os
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Root directory of the dataset; train.csv, sample_submission.csv and the
# train_images/ and test_images/ folders are read relative to it.
base_path = '/home/isdgenomics/users/dipamcha/kaggle/severstal-steel-defect-detection/data/'

train_df = pd.read_csv(base_path + 'train.csv')
# The key column has the form "<ImageId>_<ClassId>"; split it once and reuse.
key_parts = train_df['ImageId_ClassId'].str.split('_')
train_df['ImageId'] = key_parts.str[0]
train_df['ClassId'] = key_parts.str[1]
# A row carries a mask exactly when EncodedPixels is present.
train_df['hasMask'] = train_df['EncodedPixels'].notna()

crop_size = [256, 320]
nbr = [20, 30]
# With the values above this evaluates to max(20, 30 - 320) + 1 == 21.
croplimit = max(nbr[0], nbr[1]-crop_size[1]) + 1

# Number of mask rows per image, most-masked images first.
# Only 'hasMask' is aggregated: summing the whole frame relied on pandas
# silently dropping the string columns, which newer pandas versions no longer
# do (they concatenate the strings or raise instead).
masks_df = (
    train_df.groupby('ImageId')['hasMask']
    .sum()
    .reset_index()
    .sort_values('hasMask', ascending=False)
)
masks_df.head()

Unnamed: 0,ImageId,hasMask
10803,db4867ee8.jpg,3.0
11776,ef24da2ba.jpg,3.0
6284,7f30b9c64.jpg,2.0
9421,bf0c81db6.jpg,2.0
9615,c314f43f3.jpg,2.0


In [5]:
def findNonBlackRegion(img, threshold=20):
    """Locate the column span of a 2-D image that is not a black border.

    A column counts as black when every pixel in it is strictly below
    ``threshold``. Black runs touching the left and/or the right edge are
    trimmed; black columns in the interior of the image are ignored.

    Args:
        img: 2-D array indexed as (row, column).
        threshold: Pixels strictly below this value are treated as black.

    Returns:
        A tuple ``([start, stop], has_black)``. ``img[:, start:stop]`` is the
        trimmed region (``stop`` is exclusive) and ``has_black`` is True when
        a black run touches at least one edge. An image without any
        non-black column yields ``[0, 0]``.
    """
    assert len(img.shape) == 2
    width = img.shape[-1]
    # A column is black only when *all* of its pixels are below the threshold.
    black_cols = np.all(img < threshold, axis=0)
    lit_cols = np.flatnonzero(~black_cols)
    if lit_cols.size == 0:
        # Nothing to keep: report an empty span instead of failing on the
        # min/max of an empty index set.
        return [0, 0], width > 0
    # First and last non-black column; stop is exclusive so that both edges
    # follow the same slicing convention (the start index must not include
    # the last black column of a left border).
    start = int(lit_cols[0])
    stop = int(lit_cols[-1]) + 1
    return [start, stop], (start > 0 or stop < width)
    
def nonBlackRegion(imgname):
    """Return the non-black column span of one training image.

    Args:
        imgname: File name of an image inside ``base_path + 'train_images/'``.

    Returns:
        The ``[start, stop]`` list produced by ``findNonBlackRegion``.

    Raises:
        FileNotFoundError: If OpenCV could not read the image.
    """
    path = base_path + 'train_images/' + imgname
    img = cv2.imread(path, 0)  # flag 0: load as a single-channel image
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError("could not read image: " + path)
    region, _ = findNonBlackRegion(img)
    return region

In [6]:
# Compute the non-black column span of every training image and time the pass.
started_at = time.time()
masks_df['nonBlackRegion'] = masks_df['ImageId'].apply(nonBlackRegion)
elapsed = time.time() - started_at
print("load time %fs" % elapsed)

load time 42.636014s


In [None]:
# Count the training images whose non-black span is narrower than 320 columns.
# nonBlackRegion() stores each span as a [start, stop] list, so the bounds are
# indexed directly; calling .split(' ') on a list raises AttributeError.
narrow_count = masks_df['nonBlackRegion'].apply(lambda x: abs(x[0] - x[1]) < 320).sum()
masks_df.to_pickle('nonBlackRegion.pkl')
# Kept as the last expression so the notebook actually displays the count.
narrow_count

In [None]:
# Build a one-column frame of the unique test image names, taken from the
# "<ImageId>_<ClassId>" keys of the sample submission.
sub_df = pd.read_csv(base_path+'sample_submission.csv')
sub_df['ImageId'] = [key.split('_')[0] for key in sub_df['ImageId_ClassId']]
unique_names = sub_df['ImageId'].unique()
test_imgs = pd.DataFrame({'ImageId': unique_names})

In [None]:
def nonBlackRegionTest(imgname):
    """Return the non-black column span of one test image as a string.

    Args:
        imgname: File name of an image inside ``base_path + 'test_images/'``.

    Returns:
        The span formatted as ``"<start> <stop>"`` (space separated).

    Raises:
        FileNotFoundError: If OpenCV could not read the image.
    """
    path = base_path + 'test_images/' + imgname
    img = cv2.imread(path, 0)  # flag 0: load as a single-channel image
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError("could not read image: " + path)
    region, _ = findNonBlackRegion(img)
    return str(region[0]) + ' ' + str(region[1])

# Compute the "start stop" span string of every test image and time the pass.
started_at = time.time()
test_imgs['nonBlackRegion'] = test_imgs['ImageId'].apply(nonBlackRegionTest)
elapsed = time.time() - started_at
print("load time %fs" % elapsed)

In [None]:
# Width, in columns, of the non-black span of every test / training image.
# Test spans are "start stop" strings (see nonBlackRegionTest) and are parsed.
test_nbr = list(test_imgs['nonBlackRegion'].apply(lambda x: abs(int(x.split(' ')[0]) - int(x.split(' ')[1]))))
# Training spans are [start, stop] lists (see nonBlackRegion); they have no
# .split method, so the bounds are indexed directly.
train_nbr = list(masks_df['nonBlackRegion'].apply(lambda x: abs(x[0] - x[1])))

In [None]:
import seaborn as sns
# Histogram (KDE disabled) of the test-image span widths below 1600 columns.
# NOTE(review): distplot is marked deprecated in recent seaborn releases — confirm
# the installed version before re-running.
sns.distplot([x for x in test_nbr if x < 1600], kde=False)

In [None]:
# Histogram (KDE disabled) of the training-image span widths below 1600 columns.
sns.distplot([x for x in train_nbr if x < 1600], kde=False)

In [None]:
# Gather the pixels of every training image into one flat uint8 array.
# The directory listing is built here so the cell does not depend on names
# left over from an earlier kernel session.
train_images_path = base_path + 'train_images/'
train_images = [img for img in os.listdir(train_images_path) if '.jpg' in img]

tic = time.time()
idx = 0
# Pre-allocated for 256 x 1600 pixels per image; trimmed to the filled length below.
pixels = np.empty(len(train_images)*256*1600, np.uint8)
for i, imgname in enumerate(train_images):
    img = cv2.imread(train_images_path + imgname, 0)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError("could not read image: " + train_images_path + imgname)
    # Columns 0..1599 are used as-is; substitute the span returned by
    # findNonBlackRegion(img) here to leave black borders out of the statistics.
    arr = img[:, 0:1600].ravel()
    arrlen = arr.size
    pixels[idx:idx+arrlen] = arr
    idx = idx + arrlen
    if i %1000 == 0:
        print(i)
pixels = pixels[:idx]
print("load time %fs"%(time.time()-tic))

In [None]:
# Gather the pixels of every test image into one flat uint8 array.
# The directory listing is built here because this cell runs before the later
# cells that assign test_images / test_images_path.
test_images_path = base_path + 'test_images/'
test_images = [img for img in os.listdir(test_images_path) if '.jpg' in img]

tic = time.time()
idx = 0
# Pre-allocated for 256 x 1600 pixels per image; trimmed to the filled length below.
pixels_test = np.empty(len(test_images)*256*1600, np.uint8)
for i, imgname in enumerate(test_images):
    img = cv2.imread(test_images_path + imgname, 0)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError("could not read image: " + test_images_path + imgname)
    # Columns 0..1599 are used as-is; substitute the span returned by
    # findNonBlackRegion(img) here to leave black borders out of the statistics.
    arr = img[:, 0:1600].ravel()
    arrlen = arr.size
    pixels_test[idx:idx+arrlen] = arr
    idx = idx + arrlen
    if i %1000 == 0:
        print(i)
pixels_test = pixels_test[:idx]
print("load time %fs"%(time.time()-tic))

In [None]:
# Mean of the gathered training pixels, with wall-clock timing.
started_at = time.time()
mean_arr = pixels.mean()
mean_elapsed = time.time() - started_at
print("mean time %fs" % mean_elapsed)
print('mean ', mean_arr)

# Standard deviation of the same pixels, timed separately.
started_at = time.time()
std_arr = pixels.std()
std_elapsed = time.time() - started_at
print("std time %fs" % std_elapsed)
print("std ", std_arr)

In [None]:
# Mean of the gathered test pixels, with wall-clock timing.
# mean_arr / std_arr are reassigned here, replacing any earlier values.
started_at = time.time()
mean_arr = pixels_test.mean()
mean_elapsed = time.time() - started_at
print("mean time %fs" % mean_elapsed)
print('mean ', mean_arr)

# Standard deviation of the same pixels, timed separately.
started_at = time.time()
std_arr = pixels_test.std()
std_elapsed = time.time() - started_at
print("std time %fs" % std_elapsed)
print("std ", std_arr)

In [None]:
import seaborn as sns
# Distribution of the first 100,000,000 entries of `pixels`.
sns.distplot(pixels[:100000000])

In [None]:
import seaborn as sns
# Distribution of entries 200,000,000 up to (not including) 300,000,000 of `pixels`.
sns.distplot(pixels[200000000:300000000])

In [None]:
import seaborn as sns
# Distribution of the first 100,000,000 entries of `pixels_test`.
sns.distplot(pixels_test[:100000000])

In [None]:
# Remove the name `pixels` from the namespace; cells that read `pixels` cannot
# be re-run after this without rebuilding it.
# NOTE(review): presumably done to release the large pixel buffer — confirm.
del pixels

In [None]:
# Load every test image into one float64 array, each image divided by 255.
images_path = '/home/isdgenomics/users/dipamcha/kaggle/severstal-steel-defect-detection/data/test_images/'
test_images = [img for img in os.listdir(images_path) if '.jpg' in img]

tic = time.time()
# The buffer is sized from test_images, the list the loop below fills it from;
# sizing it from an unrelated list either overflows or leaves
# uninitialised rows that would corrupt the statistics.
test_imgs_np = np.empty((len(test_images), 256, 1600), np.float64)
for i, imgname in enumerate(test_images):
    test_imgs_np[i] = cv2.imread(images_path + imgname, 0)/255.
print("load time %fs"%(time.time()-tic))

In [None]:
# Stack the two image arrays along the first axis.
# NOTE(review): imgs_np is only assigned in a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run — confirm the intended order.
joined = np.vstack([imgs_np, test_imgs_np])

In [None]:
# Mean over every value of the stacked array.
np.mean(joined)

In [None]:
# Standard deviation over every value of the stacked array.
np.std(joined)

In [None]:
# Load every training image into one float64 array (each image divided by
# 255) and report the mean / standard deviation of the result.
# The file names in `images` are training images, so they are listed from and
# read out of train_images/; reading them from the test directory makes
# cv2.imread return None for every name.
train_images_path = base_path + 'train_images/'
images = [img for img in os.listdir(train_images_path) if '.jpg' in img]

tic = time.time()
imgs_np = np.empty((len(images), 256, 1600), np.float64)
for i in range(len(images)):
    imgs_np[i] = cv2.imread(train_images_path + images[i], 0)/255.
print("load time %fs"%(time.time()-tic))

tic = time.time()
mean_arr = np.mean(imgs_np)
print("mean time %fs"%(time.time()-tic))
print('mean ', mean_arr)

tic = time.time()
std_arr = np.std(imgs_np)
print("std time %fs"%(time.time()-tic))
print("std ", std_arr)

In [None]:
# Split the test images into those with a fully dark 30-column strip on the
# left or right edge (names go to test_blacklist) and the rest (pixel data,
# divided by 255, goes to test_nonblackimgs).
test_images_path = '/home/isdgenomics/users/dipamcha/kaggle/severstal-steel-defect-detection/data/test_images/'
test_images = [img for img in os.listdir(test_images_path) if '.jpg' in img]
test_blacklist = []
test_nonblackimgs = []
for imgname in test_images:
    img = cv2.imread(test_images_path + imgname,0)
    thd = img < 20  # True where a pixel is darker than 20
    left_dark = thd[:, :30].all()
    right_dark = thd[:, -30:].all()
    if not (left_dark or right_dark):
        test_nonblackimgs.append(img/255.)
        continue
    test_blacklist.append(imgname)
# One name is recorded per dark-edged image, so the list length is the count.
count = len(test_blacklist)
print(count)

In [None]:
# Split the training images into those with a fully dark 30-column strip on
# the left or right edge (names go to blacklist) and the rest (unscaled pixel
# data goes to nonblackimgs).
# The directory listing is built here so the cell does not depend on names
# left over from an earlier kernel session.
train_images_path = base_path + 'train_images/'
images = [img for img in os.listdir(train_images_path) if '.jpg' in img]
blacklist = []
nonblackimgs = []
for imgname in images:
    img = cv2.imread(train_images_path + imgname,0)
    thd = img < 20  # True where a pixel is darker than 20
    if np.all(thd[:, :30]) or np.all(thd[:, -30:]):
        blacklist.append(imgname)
    else:
        nonblackimgs.append(img)
# One name is recorded per dark-edged image, so the list length is the count.
count = len(blacklist)
print(count)

In [None]:
# Mean over the training images without a dark edge; these were appended
# unscaled, so the result is on the raw pixel scale.
np.mean(np.array(nonblackimgs))

In [None]:
# Mean over the test images without a dark edge; these were divided by 255
# when collected, so the result is on that rescaled range.
np.mean(np.array(test_nonblackimgs))

In [None]:
# Divides a hard-coded value by 255.
# NOTE(review): presumably the raw-scale training mean copied from an earlier
# output, rescaled to compare with the /255 test mean — confirm the source.
101.30590487340685/255