# Create Dataset for models

This notebook is used to create dataset for all the models such as CNN and CNN-SVM models. 
 - For CNN-SVM models .csv files are created for train and test set to extract features separately using extract_features_from_images.py file
 - For CNN models, images are copied from PKLot Segmented folder into separate dataset folder with below folder structure
     - cnn_dataset
         - train
             - Empty
             - Occupied
         - valid
             - Empty
             - Occupied

In [1]:
from shutil import copy2
from glob import glob
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

np.random.seed(1381)

%matplotlib inline

In [2]:
# Set the folders and its path to create train and validation split
# Model is trained on images from one parking lot and tested on another one
root_dir = "/Volumes/sddrv/datasets/PKLot/PKLotSegmented/"
classes = ["Empty", "Occupied"]

datasets = {
    "train": ["UFPR04/Sunny/", "UFPR04/Rainy/", "UFPR04/Cloudy/", "UFPR05/Sunny/", "UFPR05/Rainy/", "UFPR05/Cloudy/"],
    "valid": ["PUC/Sunny/","PUC/Rainy/", "PUC/Cloudy/"]
}

In [3]:
# Split the images into train and validation dataset and write the list as csv file
train_dataset = []
valid_dataset = []
                  
for c in classes:
    for data_type in datasets:
        for folder in datasets[data_type]:
            dir_content = [d for d in os.listdir(os.path.join(root_dir,folder)) if os.path.isdir(os.path.join(root_dir,folder,d))]
            #print(dir_content)
            for d in dir_content:
                folder_path = os.path.join(root_dir,folder,d, c)
                images = glob(os.path.join(folder_path, "*.jpg"))
                if len(images)>0:
                    if data_type == 'train':
                        sample_size = 0.3
                    else:
                        sample_size = 0.1
                    random_sample = np.random.choice(images, replace=False,size=int(len(images)*sample_size))
                    #print(len(random_sample))
                    for img in random_sample:
                        image_name = img.split("/")[-1]
                        temp = {}
                        temp['image_name'] = image_name
                        temp["label"] = c
                        temp['folder_path'] = os.path.join(folder,d)
                        temp['data_type'] = data_type
                        if data_type == 'train':
                            train_dataset.append(temp)
                        else:
                            valid_dataset.append(temp)

In [4]:
# convert the list into dataframe to store them as csv file
df_train = pd.DataFrame(train_dataset)
df_valid = pd.DataFrame(valid_dataset)

# shuffle the dataset
df_train = df_train.sample(frac=1, random_state=1431)
df_valid = df_valid.sample(frac=1, random_state=1431)

In [5]:
print("Train dataset size: ", df_train.shape)
print("Valid dataset size: ", df_valid.shape)

Train dataset size:  (81406, 4)
Valid dataset size:  (42384, 4)


In [8]:
df_train

Unnamed: 0,image_name,label,folder_path,data_type
75820,2013-03-05_08_20_02#021.jpg,Occupied,UFPR05/Cloudy/2013-03-05,train
29303,2013-02-23_11_25_05#026.jpg,Empty,UFPR05/Sunny/2013-02-23,train
21360,2013-04-14_15_35_10#024.jpg,Empty,UFPR05/Sunny/2013-04-14,train
67242,2013-03-06_11_25_06#011.jpg,Occupied,UFPR05/Sunny/2013-03-06,train
61377,2013-04-15_15_45_10#022.jpg,Occupied,UFPR05/Sunny/2013-04-15,train
...,...,...,...,...
13348,2013-01-19_12_50_08#002.jpg,Empty,UFPR04/Cloudy/2013-01-19,train
36492,2013-03-13_07_05_01#013.jpg,Empty,UFPR05/Cloudy/2013-03-13,train
9848,2013-01-15_14_41_51#014.jpg,Empty,UFPR04/Cloudy/2013-01-15,train
30160,2013-03-06_06_35_00#031.jpg,Empty,UFPR05/Sunny/2013-03-06,train


In [9]:
df_valid

Unnamed: 0,image_name,label,folder_path,data_type
39424,2012-10-31_13_53_19#012.jpg,Occupied,PUC/Cloudy/2012-10-31,valid
26136,2012-09-18_16_05_14#082.jpg,Occupied,PUC/Sunny/2012-09-18,valid
7062,2012-09-12_18_07_29#035.jpg,Empty,PUC/Sunny/2012-09-12,valid
13690,2012-10-11_08_41_33#096.jpg,Empty,PUC/Rainy/2012-10-11,valid
4863,2012-10-15_10_35_52#037.jpg,Empty,PUC/Sunny/2012-10-15,valid
...,...,...,...,...
13348,2012-10-11_15_31_51#011.jpg,Empty,PUC/Rainy/2012-10-11,valid
36492,2012-10-26_09_14_33#031.jpg,Occupied,PUC/Rainy/2012-10-26,valid
9848,2012-10-29_06_27_52#016.jpg,Empty,PUC/Sunny/2012-10-29,valid
21279,2012-10-28_12_52_04#044.jpg,Empty,PUC/Cloudy/2012-10-28,valid


In [52]:
def show_label_distribution(df):
    df_class = pd.DataFrame(df['label'].value_counts())
    df_class = df_class.reset_index()
    #df_class = df_class.rename(columns={'index': 'label', 'label': 'count'})
    df_class['percentage'] = round((df_class['count'] / sum(df_class['count']))*100, 2)
    return df_class

In [53]:
train_df = show_label_distribution(df_train)
print("Train dataset ", train_df)
valid_df = show_label_distribution(df_valid)
print("Valid dataset ", valid_df)

Train dataset        label  count  percentage
0  Occupied  43027       52.85
1     Empty  38379       47.15
Valid dataset        label  count  percentage
0     Empty  22980       54.22
1  Occupied  19404       45.78


### Create index file for CNN-SVM models

In [56]:
df_train.to_csv('pklot_dataset/dataset_train.csv', index=False)
df_valid.to_csv('pklot_dataset/dataset_valid.csv', index=False)

In [57]:
df_train.head(3)

Unnamed: 0,image_name,label,folder_path,data_type
75820,2013-03-05_08_20_02#021.jpg,Occupied,UFPR05/Cloudy/2013-03-05,train
29303,2013-02-23_11_25_05#026.jpg,Empty,UFPR05/Sunny/2013-02-23,train
21360,2013-04-14_15_35_10#024.jpg,Empty,UFPR05/Sunny/2013-04-14,train


In [58]:
df_valid.head(3)

Unnamed: 0,image_name,label,folder_path,data_type
39424,2012-10-31_13_53_19#012.jpg,Occupied,PUC/Cloudy/2012-10-31,valid
26136,2012-09-18_16_05_14#082.jpg,Occupied,PUC/Sunny/2012-09-18,valid
7062,2012-09-12_18_07_29#035.jpg,Empty,PUC/Sunny/2012-09-12,valid


## Create train and valid dataset for CNN models

In [59]:
import pandas as pd
import os
from shutil import copy2

In [60]:
#root_dir = "../../datasets/PKLot/PKLotSegmented/"
df_train = pd.read_csv("pklot_dataset/dataset_train.csv")
df_valid = pd.read_csv("pklot_dataset/dataset_valid.csv")

In [61]:
cnn_dataset = "/Volumes/sddrv/datasets/PKLot/cnn_dataset/"


if not os.path.isdir(cnn_dataset):
    os.makedirs(cnn_dataset)
    print('CNN dataset folder created at ', cnn_dataset)

Create sample datataset. To create full train and valid dataset set the sample size to 100000 (size greater than number of images in df_train and df_valid to include all the images)

In [62]:
# set sample size to create sample dataset
train_sample_size = 5000
valid_sample_size = 500

### Copy train dataset

In [63]:
if not os.path.isdir(os.path.join(cnn_dataset, 'train', 'Occupied')):
    os.makedirs(os.path.join(cnn_dataset, 'train', 'Occupied'))
    print('train/Occupied folder created at ', cnn_dataset)

if not os.path.isdir(os.path.join(cnn_dataset, 'train', 'Empty')):
    os.makedirs(os.path.join(cnn_dataset, 'train', 'Empty'))
    print('train/Empty folder created at ', cnn_dataset)

count = 1
for idx in df_train.index.values[:train_sample_size]:
    src = os.path.join(root_dir, df_train.loc[idx, 'folder_path'], df_train.loc[idx, 'label'], df_train.loc[idx, 'image_name'])
    if df_train.loc[idx, 'label'] == 'Occupied':
        dst = os.path.join(cnn_dataset, 'train', 'Occupied')
    else:
        dst = os.path.join(cnn_dataset, 'train', 'Empty')
        
    copy2(src, dst)
    count+=1

print("Total train images:", count)

train/Occupied folder created at  /Volumes/sddrv/datasets/PKLot/cnn_dataset/
train/Empty folder created at  /Volumes/sddrv/datasets/PKLot/cnn_dataset/
Total train images: 5001


### Copy valid dataset

In [65]:
if not os.path.isdir(os.path.join(cnn_dataset, 'valid', 'Occupied')):
    os.makedirs(os.path.join(cnn_dataset, 'valid', 'Occupied'))
    print('valid/Occupied folder created at ', cnn_dataset)

if not os.path.isdir(os.path.join(cnn_dataset, 'valid', 'Empty')):
    os.makedirs(os.path.join(cnn_dataset, 'valid', 'Empty'))
    print('valid/Empty folder created at ', cnn_dataset)

count = 1
for idx in df_valid.index.values[:valid_sample_size]:
    src = os.path.join(root_dir, df_valid.loc[idx, 'folder_path'], df_valid.loc[idx, 'label'], df_valid.loc[idx, 'image_name'])
    if df_valid.loc[idx, 'label'] == 'Occupied':
        dst = os.path.join(cnn_dataset, 'valid', 'Occupied')
    else:
        dst = os.path.join(cnn_dataset, 'valid', 'Empty')
        
    copy2(src, dst)
    count+=1
    
print("Total valid images:", count)

Total valid images: 501
