# Import libraries and data


In [None]:
!pip install ipyplot

In [None]:
# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from tensorflow.keras.applications.resnet50 import ResNet50
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle
from PIL import Image, ImageFilter
import csv
import ipyplot
import glob
import cv2
from tqdm import tqdm

In [None]:
# Read the training labels and add, for every row, a `path` column built from
# the category name and the file name.
# NOTE(review): `path` points under /content/, while the other cells read the
# images from ../input/sapienza-training-camp-2022/ — confirm which is intended.
train_csv = pd.read_csv('../input/sapienza-training-camp-2022/train.csv').assign(
    path=lambda frame: "/content/train/train/" + frame["category_name"] + "/" + frame["file_name"]
)

# Alphabetically sorted list of the distinct category names.
class_name = sorted(train_csv['category_name'].unique())
print(class_name)

In [None]:
# Preview the first rows of the label table, including the derived `path` column.
train_csv.head()

# Delete noisy data

In order to remove the noisy data we apply the following strategy:


*   extract features with the VGG16 model
*   reduce the number of features through PCA
*   use KMeans to separate the images into two clusters (ship / noise)



In [None]:
def dataset_filename(dataset_class):
    """Return the names of all entries in the training folder of one class.

    Parameters
    ----------
    dataset_class : str
        Name of the class sub-folder inside the training image directory.

    Returns
    -------
    list of str
        Entry names in the order reported by ``os.listdir`` (not sorted).
    """
    class_dir = '../input/sapienza-training-camp-2022/train/train/' + dataset_class
    return [entry for entry in os.listdir(class_dir)]

def extract_features(file, model):
    """Compute the feature vector of a single image with a Keras model.

    Parameters
    ----------
    file : str
        Path of the image to load.
    model : keras Model
        Model whose ``predict`` output is used as the feature vector.

    Returns
    -------
    numpy.ndarray
        Output of ``model.predict`` for a batch holding this single image.
    """
    # Load the image resized to 224x224.
    img = load_img(file, target_size=(224, 224))
    # PIL image -> array with a leading batch axis:
    # reshape(num_of_samples, dim 1, dim 2, channels).
    batch = np.array(img).reshape(1, 224, 224, 3)
    # Apply the VGG16 input preprocessing.
    batch = preprocess_input(batch)
    # `use_multiprocessing` is dropped: it only applies to generator/Sequence
    # input and is not accepted by newer Keras versions of `predict`.
    # `verbose=0` avoids printing one progress bar per image.
    return model.predict(batch, verbose=0)

def cleaning_dataset(df, dataset_class, model=None):
    """Split the images of one class into two KMeans clusters.

    Each image is described by the output of the second-to-last VGG16 layer,
    the descriptors are reduced with PCA (at most 100 components) and a
    two-cluster KMeans is fitted on the reduced descriptors.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``file_name`` column listing the images to process.
    dataset_class : str
        Sub-folder of the training directory that contains those images.
    model : keras Model, optional
        Feature extractor handed to ``extract_features``. When omitted, a
        VGG16 truncated at its second-to-last layer is built on every call;
        pass a pre-built model to avoid reloading it for each class.

    Returns
    -------
    dict
        ``{0: [file names], 1: [file names]}``; both keys are always present,
        even if KMeans assigns every image to the same cluster.

    Raises
    ------
    ValueError
        If fewer than two distinct file names are given, since two clusters
        cannot be formed.
    """
    # Drop repeated names while keeping the original order.
    images_filename = list(dict.fromkeys(df.file_name))
    if len(images_filename) < 2:
        raise ValueError(
            "cleaning_dataset needs at least two images to build two clusters, "
            "got %d" % len(images_filename)
        )

    if model is None:
        # Use the second-to-last VGG16 layer as the feature vector.
        base_model = VGG16()
        model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)

    # Loop through each image of the class and store its features by name.
    input_folder = '../input/sapienza-training-camp-2022/train/train/' + dataset_class
    data = {}
    for image in tqdm(images_filename):
        data[image] = extract_features(os.path.join(input_folder, image), model)

    filenames = list(data.keys())

    # Stack the per-image features into a (n_images, n_features) matrix.
    feat = np.array(list(data.values()))
    feat = feat.reshape(-1, feat.shape[-1])

    # Reduce the feature dimension; PCA cannot use more components than samples.
    n_components = min(100, len(feat))
    print("n_componets: ", n_components)
    pca = PCA(n_components=n_components, random_state=123)
    pca.fit(feat)
    x = pca.transform(feat)

    # Apply KMeans to separate the images into two clusters.
    kmeans = KMeans(n_clusters=2, random_state=123)
    kmeans.fit(x)

    # Cluster id -> file names. Both ids are pre-seeded so that callers can
    # always index the result with 0 and 1.
    groups = {0: [], 1: []}
    for file, cluster in zip(filenames, kmeans.labels_):
        groups[int(cluster)].append(file)

    return groups

def _load_resized_images(input_folder, filenames, size):
    """Load each named image from `input_folder` as an array resized to `size`."""
    images = []
    for filename in filenames:
        # The context manager closes the file once the pixels are copied.
        with Image.open(os.path.join(input_folder, filename)) as pil_image:
            pixels = np.array(pil_image)
        images.append(cv2.resize(pixels, size))
    return images


def cluster_asarray(groups, dataset_class, size=(244, 244)):
    """Load the images of both clusters of one class as resized arrays.

    Parameters
    ----------
    groups : dict
        Mapping from cluster id (0 and 1) to a list of image file names.
        A missing cluster id is treated as an empty cluster.
    dataset_class : str
        Sub-folder of the training directory that contains the images.
    size : tuple of int, optional
        Target size passed to ``cv2.resize``.
        NOTE(review): the default 244x244 differs from the 224x224 used for
        feature extraction; it is kept so existing output is unchanged.

    Returns
    -------
    tuple of list
        ``(images_0, images_1)``: the arrays of cluster 0 and of cluster 1.
    """
    input_folder = '../input/sapienza-training-camp-2022/train/train/' + dataset_class

    images_0 = _load_resized_images(input_folder, groups.get(0, []), size)
    images_1 = _load_resized_images(input_folder, groups.get(1, []), size)

    return images_0, images_1

def generation_groups(df, dataset_class):
    """Cluster the images of one class and load both clusters as arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Table passed on to ``cleaning_dataset``.
    dataset_class : str
        Class sub-folder the images are read from.

    Returns
    -------
    tuple
        ``(images of cluster 0, images of cluster 1, clusters)`` where
        ``clusters`` is the value returned by ``cleaning_dataset``.
    """
    clusters = cleaning_dataset(df, dataset_class)
    images_0, images_1 = cluster_asarray(clusters, dataset_class)
    # Report the size of each cluster.
    for label, images in (("class 0: ", images_0), ("class 1: ", images_1)):
        print(label, len(images))
    return images_0, images_1, clusters

Apply the cleaning process

In [None]:
# First pass: cluster the images of every class into two groups.
# dataset0 / dataset1 collect the image arrays of cluster 0 / cluster 1 and
# groups collects the cluster-id -> file-name mapping, one entry per class.
dataset0 = []
dataset1 = []
groups = []
for class_ in class_name:
    print("we are in class: ", class_)
    # generation_groups needs the label rows of the class as well as its name;
    # calling it with the name alone raises a TypeError.
    class_df = train_csv[train_csv["category_name"] == class_]
    cluster0, cluster1, ds_class = generation_groups(class_df, class_)
    dataset0.append(cluster0)
    dataset1.append(cluster1)
    groups.append(ds_class)

Assign each cluster to the right group (clean or noisy): KMeans does not always give the same cluster id to the clean images, so the assignment is chosen separately for each class.

In [None]:
# Cluster id (0 or 1) treated as the clean one for each class, in the order of
# `groups`; the other cluster id of the class is treated as the one to remove.
clean_cluster_ids = [0, 1, 1, 0, 0, 1, 0]

groups_clean = [groups[idx][cid] for idx, cid in enumerate(clean_cluster_ids)]
groups_remove = [groups[idx][1 - cid] for idx, cid in enumerate(clean_cluster_ids)]

In [None]:
# Cluster id (0 or 1) treated as the clean one for each class; dataset0 holds
# the image arrays of cluster 0 and dataset1 those of cluster 1.
clean_cluster_ids = [0, 1, 1, 0, 0, 1, 0]
datasets_by_cluster = (dataset0, dataset1)

dataset_clean = [datasets_by_cluster[cid][idx] for idx, cid in enumerate(clean_cluster_ids)]

dataset_remove = [datasets_by_cluster[1 - cid][idx] for idx, cid in enumerate(clean_cluster_ids)]

Save as csv for later usage

In [None]:
def csv_class(groups_clean, groups_remove, category, category_id):
    """Build the clean and the dirty label table of one class.

    Parameters
    ----------
    groups_clean : list
        File names kept as clean.
    groups_remove : list
        File names marked as dirty.
    category : str
        Category name written in every row.
    category_id : int
        Category id written in every row.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_clean, df_dirty)``, each with the columns ``file_name``,
        ``category_name`` and ``category_id``.
    """
    def labelled_frame(file_names):
        # The scalar category values are broadcast to every file name.
        return pd.DataFrame({
            'file_name': file_names,
            'category_name': category,
            'category_id': category_id,
        })

    return labelled_frame(groups_clean), labelled_frame(groups_remove)

In [None]:
# Build the (clean, dirty) label tables of the seven classes; the category id
# of a class is its position in `class_name`.
class_frames = [
    csv_class(groups_clean[category_id], groups_remove[category_id], class_name[category_id], category_id)
    for category_id in range(7)
]
(
    (battleship_clean, battleship_dirty),
    (coast_clean, coast_dirty),
    (container_clean, container_dirty),
    (cruise_clean, cruise_dirty),
    (drilling_clean, drilling_dirty),
    (motor_clean, motor_dirty),
    (submarines_clean, submarines_dirty),
) = class_frames

## Final cleaning

Because KMeans is not a perfect classifier, some classes can still contain noisy images, so we repeat the process on the smaller and more easily separable subsets.

In [None]:
# "Dirty" label tables of the first pass, listed in the same order as
# class_name so that they can be zipped with the class names below.
x = [battleship_dirty, coast_dirty, container_dirty, cruise_dirty, drilling_dirty,
    motor_dirty, submarines_dirty]

# Second pass: re-cluster only the images rejected by the first pass.
# The accumulators of the first pass are reset and refilled here, so from this
# point dataset0 / dataset1 / groups refer to the second-pass clusters.
dataset0 = []
dataset1 = []
groups = []
for df, dataset_class in zip(x, class_name):
    print("we are in class: ", dataset_class)
    # The loop variables stay defined after the loop and hold the last class.
    cluster0, cluster1, ds_class = generation_groups(df, dataset_class)
    dataset0.append(cluster0)
    dataset1.append(cluster1)
    groups.append(ds_class)

In [None]:
# Hard-coded selection from the second pass: only classes 1, 4, 5 and 6 (by
# position in class_name) contribute, each with one chosen cluster id.
dataset_clean = [dataset0[1], dataset0[4], dataset0[5], dataset1[6]]
groups_clean = [groups[1][0], groups[4][0], groups[5][0], groups[6][1]]
# File names of cluster 1 of class 4, kept aside under the "dirty" name.
groups_dirty = [groups[4][1]]

In [None]:
def csv_class(groups, category, category_id):
    """Build the label table of one list of file names.

    Once this cell runs, this definition replaces the earlier four-argument
    ``csv_class`` of the notebook.

    Parameters
    ----------
    groups : list
        File names, one per row.
    category : str
        Category name written in every row.
    category_id : int
        Category id written in every row.

    Returns
    -------
    pandas.DataFrame
        Table with the columns ``file_name``, ``category_name`` and
        ``category_id``.
    """
    columns = {
        'file_name': groups,
        'category_name': category,
        'category_id': category_id,
    }
    return pd.DataFrame(columns)

In [None]:
# Third pass: wrap the file names kept in groups_dirty in a label table and
# cluster them once more.
new_drilling_dirty = csv_class(groups_dirty[0], class_name[4], 4)
# NOTE(review): the folder name is written literally here; presumably it
# equals class_name[4] — confirm.
cluster0, cluster1, ds_class = generation_groups(new_drilling_dirty, 'drilling-rigs')

In [None]:
# Label table of the file names assigned to cluster 0 by the latest clustering.
new_drilling_clean = csv_class(ds_class[0], class_name[4], 4)

In [None]:
# Build the label tables of the images recovered by the extra cleaning passes.
# csv_class expects file names, so the drilling table is built from
# ds_class[0] (the file names of cluster 0) and not from cluster0, which holds
# the image arrays returned by generation_groups.
new_drilling_clean = csv_class(ds_class[0], class_name[4], 4)
new_coast_clean = csv_class(groups_clean[0], class_name[1], 1)
new_drilling2_clean = csv_class(groups_clean[1], class_name[4], 4)
new_motor_clean = csv_class(groups_clean[2], class_name[5], 5)
new_submarines_clean = csv_class(groups_clean[3], class_name[6], 6)

In [None]:
# Merge the first-pass clean tables with the tables recovered afterwards.
# ignore_index=True renumbers the rows so the merged tables do not carry
# repeated index labels; drilling3 is built once (it was assigned twice with
# the same expression).
coast_clean2 = pd.concat([coast_clean, new_coast_clean], ignore_index=True)
drilling3 = pd.concat([drilling_clean, new_drilling_clean, new_drilling2_clean], ignore_index=True)
motor = pd.concat([motor_clean, new_motor_clean], ignore_index=True)
submarines = pd.concat([submarines_clean, new_submarines_clean], ignore_index=True)

Final csv

In [None]:
# Write one CSV file per class into the working directory.
final_tables = {
    './battleship.csv': battleship_clean,
    './coast_guard.csv': coast_clean2,
    './container.csv': container_clean,
    './cruise.csv': cruise_clean,
    './drilling.csv': drilling3,
    './motor.csv': motor,
    './submarines.csv': submarines,
}
for csv_path, class_table in final_tables.items():
    class_table.to_csv(csv_path)

In [None]:
# Visual check of the images currently held in cluster1 (max_images caps the
# number displayed at 155).
ipyplot.plot_images(cluster1, max_images=155)

In [None]:
def _save_numbered_copies(image_names, input_folder, target_folder, prefix):
    """Re-save each named image into `target_folder` as '<prefix><6-digit index>.jpg'."""
    for count, image in enumerate(image_names):
        # The context manager closes the source file after saving.
        with Image.open(os.path.join(input_folder, image)) as im:
            im.save(target_folder + '/' + prefix + str(count).zfill(6) + '.jpg')


def write_image(dataset_clean,dataset_remove, groups_clean, groups_remove, dataset_class):
    """Copy the clean and noisy images of one class into './<class>/clean' and './<class>/noisy'.

    Parameters
    ----------
    dataset_clean : iterable
        Names of the clean images; each is joined to the class input folder,
        so the items must be file names.
    dataset_remove : iterable
        Names of the noisy images, handled like ``dataset_clean``.
    groups_clean, groups_remove
        Accepted for interface compatibility but not used.
        NOTE(review): elsewhere in the notebook the ``groups_*`` variables
        hold file names — confirm which arguments callers should pass.
    dataset_class : str
        Class sub-folder to read from; also the output folder name and the
        prefix of the written file names.
    """
    input_folder = '../input/sapienza-training-camp-2022/train/train/'+dataset_class
    output_folder = './'+dataset_class

    clean_folder = output_folder+'/clean'
    noisy_folder = output_folder+'/noisy'
    # makedirs(exist_ok=True) lets the cell be re-run without FileExistsError.
    os.makedirs(clean_folder, exist_ok=True)
    os.makedirs(noisy_folder, exist_ok=True)

    _save_numbered_copies(dataset_clean, input_folder, clean_folder, dataset_class)
    _save_numbered_copies(dataset_remove, input_folder, noisy_folder, dataset_class)