In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the dataset archives stored there can be read
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import numpy as np
import pandas as pd
import os
import time
from shutil import copyfile

In [None]:
# List the dataset folder on the mounted Drive to confirm which archives are available
# NOTE(review): the path is relative, so this assumes the working directory is /content (where Drive is mounted) — confirm
!ls "gdrive/MyDrive/Machine Vision and Image Processing/Project/Dataset"

test.zip  train.zip


In [None]:
# Extract the training archive into the current working directory; -q suppresses the per-file listing
!unzip -q "gdrive/MyDrive/Machine Vision and Image Processing/Project/Dataset/train.zip"

## Preparing Dataset

<u>Currently, the labels are as follows:</u>

|   Pollen type   | Label |
|:---------------:|:-----:|
|Normal Pollen    |    1  |
|Anomalous Pollen |    2  |
|Alnus Pollen     |    3  |
|Debris           |    4  |




### However, for network training we will convert the labels to the range 0 to 3

<u>Hence, after processing, the labels will be as follows:</u>

|   Pollen type   | Label |
|:---------------:|:-----:|
|Normal Pollen    |    0  |
|Anomalous Pollen |    1  |
|Alnus Pollen     |    2  |
|Debris           |    3  |


### First, we create a CSV file containing the file names and their labels.

The dataset also contains segmentation masks and segmented images.

However, for this work we will only use the normal images.

Also, the test images provided by the competition organizers do not contain any labels, so we cannot use them for evaluation. Hence, we will use the provided training images for training, validation, and testing of the model.

#### In the part below, we first create a CSV file containing the name and label of each image. In addition, we copy the images from the separate per-label folders into a single folder to simplify the data loader later.

In [None]:
# Name of the sub-folder inside each label folder from which the images are read
# NOTE(review): presumably this holds the plain object images rather than the masks / segmented images — confirm against the dataset layout
data_folder_name = "train_OBJ"
# Root of the extracted training images; each entry directly under it is treated as a label folder
DATA_PATH = "train/images"

In [None]:
# Walk every label folder under DATA_PATH and record each PNG together with its label.
# The folder names are the original 1-based labels, so int(label)-1 gives the
# zero-based label used for network training.
# df_dict holds the image names and labels column-wise so it can be converted
# to a DataFrame / CSV file easily afterwards.
df_dict = {
    "image_id": [],
    "label" : []
          }
# Set of names already recorded: constant-time duplicate lookup instead of
# re-scanning the ever-growing df_dict["image_id"] list for every image
seen_images = set()
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the resulting CSV reproducible between runs
for label in sorted(os.listdir(DATA_PATH)):
  image_folder = os.path.join(DATA_PATH, label, data_folder_name)
  if not os.path.isdir(image_folder):
    # Stray entries (e.g. hidden files) that do not contain the image sub-folder are not label folders
    continue
  for image in sorted(os.listdir(image_folder)):
    if image.endswith(".png"): # Select only the image files that end with .png
      if image in seen_images: # Report any image name that is repeated in the dataset
        print(image)
      seen_images.add(image)
      df_dict["image_id"].append(image)
      df_dict["label"].append(int(label)-1)

# Both columns must have the same length
print(len(df_dict["image_id"]))
print(len(df_dict["label"]))

11279
11279


In [None]:
# Build the table of image names and labels; each key of df_dict becomes one column
df = pd.DataFrame(data=df_dict)

In [None]:
# Summarise the columns, their dtypes and non-null counts to confirm the frame was built as expected
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11279 entries, 0 to 11278
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   image_id  11279 non-null  object
 1   label     11279 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 176.4+ KB


In [None]:
# Copying the images from each label directory into a single folder, so that a
# data loader can later read every image from one place

final_directory = "Pollen_data/images"
if os.path.isdir(final_directory):
  print("Folder already exists!")
# exist_ok=True removes the check-then-create gap and makes re-running this cell a no-op
os.makedirs(final_directory, exist_ok=True)

# Sorted traversal keeps the copy order deterministic (os.listdir order is arbitrary),
# which fixes which file wins should two label folders contain the same file name
for label in sorted(os.listdir(DATA_PATH)):
  image_folder = os.path.join(DATA_PATH, label, data_folder_name)
  if not os.path.isdir(image_folder):
    # Skip stray entries that do not contain the image sub-folder instead of crashing in os.listdir
    continue
  for image in sorted(os.listdir(image_folder)):
    if image.endswith(".png"): # Select only the image files that end with .png
      copyfile(os.path.join(image_folder, image), os.path.join(final_directory, image))

Folder already exists!


In [None]:
# Sanity check: the number of labelled entries should equal the number of files in the target folder

csv_count = len(df_dict["image_id"])
print("Total images in csv file : {}".format(csv_count))

copied_files = os.listdir(final_directory)
print("Total images in the directory : {}".format(len(copied_files)))

Total images in csv file : 11279
Total images in the directory : 11279


In [None]:
# Saving the csv file in the same Pollen_data directory
# index=False leaves the DataFrame's row index out of the file, so it holds only the DataFrame's columns
# NOTE(review): requires the Pollen_data directory to exist already — it is created when "Pollen_data/images" is made in the copy step
df.to_csv("Pollen_data/data.csv", index=False)

In [None]:
# Saving this processed data as zipped format for faster transfer and backup
!zip -r Pollen_data.zip Pollen_data

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: Pollen_data/images/20190404113140_OBJ_37_271_100.png (deflated 3%)
  adding: Pollen_data/images/20190404112024_OBJ_11_351_482.png (deflated 2%)
  adding: Pollen_data/images/20190404110633_OBJ_25_416_373.png (deflated 2%)
  adding: Pollen_data/images/20190404112119_OBJ_28_336_363.png (deflated 1%)
  adding: Pollen_data/images/20190404110941_OBJ_2_67_589.png (deflated 2%)
  adding: Pollen_data/images/20190404114139_OBJ_21_475_92.png (deflated 1%)
  adding: Pollen_data/images/20190404110532_OBJ_14_183_126.png (deflated 2%)
  adding: Pollen_data/images/20190404110338_OBJ_43_698_147.png (deflated 1%)
  adding: Pollen_data/images/20190404114003_OBJ_12_1152_486.png (deflated 2%)
  adding: Pollen_data/images/20190404113801_OBJ_8_918_607.png (deflated 2%)
  adding: Pollen_data/images/20190404110452_OBJ_8_1137_649.png (deflated 2%)
  adding: Pollen_data/images/20190404112750_OBJ_5_625_569.png (deflated 2%)
  adding: Polle

# Saving to Drive for backup and for use in another notebook (model training, etc.)

In [None]:
# Copy the archive into the dataset folder on the mounted Drive
# NOTE(review): the destination is a relative path, so this assumes the working directory is /content (where Drive is mounted) — confirm
!cp Pollen_data.zip "gdrive/MyDrive/Machine Vision and Image Processing/Project/Dataset"