<a href="https://colab.research.google.com/github/feknall/machinelearning-playground/blob/main/Gras_Winter2022_Segmentation_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade matplotlib

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import matplotlib.patches as patches
import re
import random 
import pickle
import cv2
import seaborn as sns
from PIL import Image
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split


In [None]:
# Root folder of the dataset; the notebook reads train.csv, train_images/ and
# test_images/ from it and writes data.pkl into it.
# Defaults to the original Google Drive location. Set the GRAS_DATASET_PATH
# environment variable to run against a copy stored somewhere else.
dataset_path = os.environ.get('GRAS_DATASET_PATH', '/content/drive/MyDrive/ai-gras-2/dataset')

In [None]:
# Read train.csv from the dataset root (it is merged further down on its
# ImageId / ClassId columns and supplies the EncodedPixels column), then
# preview the first rows.
train_csv = pd.read_csv(f'{dataset_path}/train.csv')
train_csv.head()

In [None]:
# Summary statistics for the columns of train_csv.
train_csv.describe()

In [None]:
# (number of rows, number of columns) of train_csv.
train_csv.shape

In [None]:
# Build the full grid of (image file, class id) pairs: every entry found in the
# train_images folder is paired with each of the class ids 1, 2, 3 and 4.
train_folder_path = f'{dataset_path}/train_images'
train_img = pd.DataFrame(
    [
        (file_name, class_id)
        for file_name in os.listdir(train_folder_path)
        for class_id in range(1, 5)
    ],
    columns=['ImageId', 'ClassId'],
)
train_img.head(10)

In [None]:
# Collect the distinct array shapes among the first `limit` entries of the
# training image folder. The listing is sorted so the same files are inspected
# on every run (os.listdir returns entries in arbitrary order).
limit = 10
image_size = set()
train_folder_path = f'{dataset_path}/train_images'
for file_name in sorted(os.listdir(train_folder_path))[:limit]:
    image = cv2.imread(os.path.join(train_folder_path, file_name))
    if image is None:
        # cv2.imread returns None instead of raising when it cannot decode an
        # entry (a sub-directory, a non-image file, ...); skip it rather than
        # fail on `.shape`.
        print(f'Skipping unreadable entry: {file_name}')
        continue
    image_size.add(image.shape)
unique_image = list(image_size)
for shape in unique_image:
    print(shape)

In [None]:
# Outer-join the (image, class) grid with the rows of train.csv on
# ImageId / ClassId; pairs without a match get an empty string instead of NaN.
df = pd.merge(train_img, train_csv, how='outer', on=['ImageId', 'ClassId']).fillna('')
df.head()

In [None]:
# Reshape to one row per image with one RLE column per class id.
# - aggfunc='sum' concatenates the EncodedPixels strings of a group; the string
#   name is used instead of the np.sum callable, which pandas merely maps to
#   the same built-in aggregation.
# - fill_value='' covers (image, class) combinations that have no row in `df`
#   (e.g. an ImageId that only comes from train.csv). Without it the cell would
#   be NaN and .astype(str) would turn it into the literal text 'nan', which is
#   not the empty-string marker the rest of the notebook tests for.
train = pd.pivot_table(
    df,
    values='EncodedPixels',
    index='ImageId',
    columns='ClassId',
    aggfunc='sum',
    fill_value='',
).astype(str)
train = train.reset_index()
# Assumes exactly the four class ids 1-4 are present as pivoted columns.
train.columns = ['image_id', 'rle_1', 'rle_2', 'rle_3', 'rle_4']
train.head()

In [None]:
# Summary statistics for the pivoted per-image table.
train.describe()

In [None]:
# Derive two per-image labels from the four RLE columns:
#   defect   -> 1 when at least one class has a non-empty RLE string, else 0
#   stratify -> lowest class id (1-4) with a non-empty RLE string, 0 if none
defect, stratify = [], []
for rle_by_class in zip(train['rle_1'], train['rle_2'], train['rle_3'], train['rle_4']):
    first_class = next(
        (class_id for class_id, rle in enumerate(rle_by_class, start=1) if rle != ''),
        0,
    )
    defect.append(1 if first_class else 0)
    stratify.append(first_class)
train['defect'] = defect
train['stratify'] = stratify

In [None]:
# Preview the table with the new `defect` and `stratify` columns.
train.head()

In [None]:
# One 0/1 indicator list per class: 1 where that class's RLE string is non-empty.
defect_1, defect_2, defect_3, defect_4 = (
    [1 if rle != '' else 0 for rle in train[rle_column]]
    for rle_column in ('rle_1', 'rle_2', 'rle_3', 'rle_4')
)
train['defect_1'] = defect_1
train['defect_2'] = defect_2
train['defect_3'] = defect_3
train['defect_4'] = defect_4
# Number of classes that are present in each image.
train['total_defects'] = train[['defect_1', 'defect_2', 'defect_3', 'defect_4']].sum(axis=1)
train.head()

In [None]:
# Persist the per-image table as data.pkl inside the dataset folder.
with open(f'{dataset_path}/data.pkl', mode='wb') as pickle_file:
    pickle.dump(train, pickle_file)

In [None]:
# Names of all entries in the test_images folder (os.listdir already returns a list).
test_image = os.listdir(f'{dataset_path}/test_images')

In [None]:
# Reset the per-category image counters before counting.
defect_1 = defect_2 = defect_3 = defect_4 = no_defect = 0


def func(v, p):
    """Format the text shown on one pie wedge.

    Parameters
    ----------
    v : iterable of numbers
        The values the pie is drawn from; only their sum is used.
    p : number
        The wedge's share of the total, in percent.

    Returns
    -------
    str
        Two lines: the percentage with two decimals followed by '%', then the
        absolute amount (p percent of sum(v)) rounded to a whole number, in
        parentheses.
    """
    total = sum(v)
    absolute = p * total / 100
    return f"{p:.2f}%\n({absolute:.0f})"


# Add to each counter the number of images in `train` that fall in its category:
# a class counts when its RLE string is non-empty, `no_defect` when defect == 0.
defect_1 += int((train['rle_1'] != '').sum())
defect_2 += int((train['rle_2'] != '').sum())
defect_3 += int((train['rle_3'] != '').sum())
defect_4 += int((train['rle_4'] != '').sum())
no_defect += int((train['defect'] == 0).sum())

labels = ['defect_1', 'defect_2', 'defect_3', 'defect_4', 'no_defect']
sizes = [defect_1, defect_2, defect_3, defect_4, no_defect]
explode = (0.2, 0.3, 0.1, 0.1, 0.1)

# Pie chart of the counts; each wedge is annotated by func() with its
# percentage and the absolute count derived from it.
fig, ax = plt.subplots(figsize=(32, 16))
ax.pie(
    sizes,
    explode=explode,
    labels=labels,
    textprops={'fontsize': 28},
    autopct=lambda pct: func(sizes, pct),
    shadow=True,
)
plt.show()


In [None]:
def patch1(bar, ax):
    """Write each bar's height as a text label just above the bar.

    Parameters
    ----------
    bar : object with a `patches` sequence
        Every patch must provide get_width(), get_height() and get_xy().
    ax : object with an `annotate` method
        Receives one annotate() call per patch.

    The label is centred horizontally on the bar and placed at
    y + 1.02 * height, i.e. 2% of the bar height above its top.
    """
    for rect in bar.patches:
        left, bottom = rect.get_xy()
        bar_width = rect.get_width()
        bar_height = rect.get_height()
        label_position = (left + bar_width / 2, bottom + bar_height * 1.02)
        ax.annotate(f'{bar_height}', label_position, ha='center', fontsize=14)

In [None]:
# How many images have 0, 1, 2, ... defect classes, ordered by that number.
counts_sorted = train['total_defects'].value_counts().sort_index()
# Print the counts and, on the following line, the type of the result.
print(counts_sorted, type(counts_sorted), sep='\n')

In [None]:
# Bar chart: number of images for each number of defect classes per image.
fig, ax = plt.subplots(figsize=(12, 8))
# Draw onto the axes created above and label every bar with its height.
bar_axes = counts_sorted.plot.bar(ax=ax)
patch1(bar_axes, ax)
ax.set_xlabel("Number of Defects", fontsize=18, labelpad=15)
ax.set_ylabel("Number of Images", fontsize=18, labelpad=15)
plt.xticks(rotation='horizontal', fontsize=14)
plt.yticks(fontsize=14)
plt.show()


In [None]:
def rle_to_mask(rle, height=256, width=1600):
    """Decode a run-length-encoded string into a binary mask.

    Parameters
    ----------
    rle : str or missing value
        Whitespace-separated integers forming "start length" pairs. Starts are
        1-based positions in the flattened mask, which is laid out column by
        column (column-major order). A missing value, '' or '-1' means
        "no mask".
    height, width : int, optional
        Shape of the returned mask. The defaults (256, 1600) are the values
        that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray
        uint8 array of shape (height, width); 1 inside the encoded runs and 0
        elsewhere.

    Raises
    ------
    ValueError
        If `rle` holds an odd number of integers, i.e. a start without a length.
    """
    # Short-circuit `or`: the string comparisons are skipped for missing values.
    if pd.isnull(rle) or rle == '' or rle == '-1':
        return np.zeros((height, width), dtype=np.uint8)

    values = np.asarray([int(token) for token in rle.split()])
    if values.size % 2:
        raise ValueError('RLE string must contain "start length" pairs, got an odd number of values')

    starts = values[0::2] - 1  # convert 1-based starts to 0-based offsets
    lengths = values[1::2]

    mask = np.zeros(width * height, dtype=np.uint8)
    for start, length in zip(starts, lengths):
        mask[int(start):int(start + length)] = 1

    # order='F' turns the flat column-major run positions back into (row, column).
    return mask.reshape((height, width), order='F')


In [None]:
def plot_mask(rle_defect, k):
    """Show four training images next to their decoded defect masks.

    Parameters
    ----------
    rle_defect : pandas.DataFrame
        Needs an 'image_id' column and an RLE column. Only the first four rows
        are drawn, so the frame must contain at least four.
    k : int
        Defect class. The RLE column is looked up as f'rle_{k}'; when no such
        column exists the third column is used, which is where the RLE lands
        for frames built as reset_index() of ['image_id', 'rle_<k>'].

    Draws a 4x2 grid (left: image, right: mask) and calls plt.show().
    """
    # Prefer the column name over its position so the result does not depend on
    # whether the caller kept the old index as an extra first column.
    named_column = f'rle_{k}'
    rle_column = named_column if named_column in rle_defect.columns else rle_defect.columns[2]
    train_folder_path = f'{dataset_path}/train_images/'

    fig, ax = plt.subplots(4, 2, figsize=(28, 18))
    for i in range(4):
        # Positional access: works whatever index labels the caller's frame has.
        image_id = rle_defect['image_id'].iloc[i]
        rle = rle_defect[rle_column].iloc[i]
        # Close the file once it is drawn; imshow is given the image while the
        # file is still open.
        with Image.open(train_folder_path + str(image_id)) as im:
            ax[i, 0].imshow(im)
        ax[i, 0].set_title(image_id)
        mask = rle_to_mask(rle)
        ax[i, 1].imshow(mask)
        ax[i, 1].set_title("Mask for " + str(image_id))
    plt.subplots_adjust(wspace=0.1, hspace=0.0)
    plt.show()


In [None]:
# Four randomly sampled images that contain defect class 1, with their masks.
# reset_index() keeps the former row labels as a column, so the frame has the
# columns index, image_id, rle_1.
rle_defect = (
    train.loc[train['defect_1'] == 1, ['image_id', 'rle_1']]
    .sample(n=4)
    .reset_index()
)
plot_mask(rle_defect, 1)

In [None]:
# Four randomly sampled images that contain defect class 2, with their masks.
# reset_index() keeps the former row labels as a column, so the frame has the
# columns index, image_id, rle_2.
rle_defect = (
    train.loc[train['defect_2'] == 1, ['image_id', 'rle_2']]
    .sample(n=4)
    .reset_index()
)
plot_mask(rle_defect, 2)


In [None]:
# Four randomly sampled images that contain defect class 3, with their masks.
# reset_index() keeps the former row labels as a column, so the frame has the
# columns index, image_id, rle_3.
rle_defect = (
    train.loc[train['defect_3'] == 1, ['image_id', 'rle_3']]
    .sample(n=4)
    .reset_index()
)
plot_mask(rle_defect, 3)

In [None]:
# Four randomly sampled images that contain defect class 4, with their masks.
# reset_index() keeps the former row labels as a column, so the frame has the
# columns index, image_id, rle_4.
rle_defect = (
    train.loc[train['defect_4'] == 1, ['image_id', 'rle_4']]
    .sample(n=4)
    .reset_index()
)
plot_mask(rle_defect, 4)

In [None]:
def mask_areas(rle_defect):
    """Return the mask area for every RLE value in `rle_defect`.

    Parameters
    ----------
    rle_defect : iterable
        RLE values; each one is decoded with rle_to_mask().

    Returns
    -------
    list
        One entry per input value, in input order: np.sum() of the decoded
        mask, i.e. the number of set pixels for a 0/1 mask.
    """
    # Each mask is decoded and summed exactly once.
    return [np.sum(rle_to_mask(rle)) for rle in rle_defect]

In [None]:
# Mask areas per defect class, computed only over the images that contain that
# class (defect_<k> == 1), from the matching rle_<k> column.
rle_1_area, rle_2_area, rle_3_area, rle_4_area = (
    mask_areas(train.loc[train[f'defect_{class_id}'] == 1, f'rle_{class_id}'])
    for class_id in range(1, 5)
)


In [None]:
# 2x2 grid of histograms: distribution of mask area for each defect class.
fig, ax = plt.subplots(2, 2, figsize=(15, 7))

areas_by_class = (rle_1_area, rle_2_area, rle_3_area, rle_4_area)
# ax.flat walks the grid row by row: class 1, 2 on top and 3, 4 below.
for class_id, (panel, areas) in enumerate(zip(ax.flat, areas_by_class), start=1):
    panel.hist(x=areas)
    panel.set_xlabel(f"Mask Area : Defect_{class_id}", fontsize=13)
    panel.set_ylabel("No. of Images", fontsize=13)
    panel.set_facecolor("tan")

fig.set_facecolor("tan")
plt.show()

In [None]:
# Kernel density estimate of the mask area, one curve per defect class.
fig, ax = plt.subplots(figsize=(10, 7))

curves = (
    ('Defect 1', rle_1_area),
    ('Defect 2', rle_2_area),
    ('Defect 3', rle_3_area),
    ('Defect 4', rle_4_area),
)
for legend_label, areas in curves:
    sns.kdeplot(areas, label=legend_label)

plt.legend()
plt.xlabel('Mask Area', fontsize=13)
plt.ylabel('Density', fontsize=13)
plt.show()


In [None]:
# Show four randomly sampled class-3 images whose mask area exceeds the threshold.
# The filter is a boolean array built from the areas, so no column is written
# onto a filtered slice of `train` (the previous version assigned
# 'rle_3_area' to such a slice, a chained-assignment hazard).
large_area_threshold = 200000
class_3_rows = train[train['defect_3'] == 1]
rle_3_area = mask_areas(class_3_rows['rle_3'])
is_large = np.asarray(rle_3_area) > large_area_threshold
# Note: sample(n=4) raises ValueError when fewer than four images qualify.
rle_defect = (
    class_3_rows.loc[is_large, ['image_id', 'rle_3']]
    .sample(n=4)
    .reset_index()
)
plot_mask(rle_defect, 3)