In [1]:
# required for jupyter notebook
%matplotlib inline 

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import os
import math

### Upon analysis we found that folder 74 contained image files with '_75' at the end of their file names. The code below fixes such file names.

In [2]:
images_dir = os.path.join('..', 'dataset', 'BanglaLekha-Isolated', 'Images')

for i in range(1, 84+1):
    
    curr_dir = os.path.join(images_dir, str(i))
    
    for file_name in os.listdir(curr_dir): 
        if os.path.isfile(os.path.join(curr_dir, file_name)) == False:
            continue
        
        file_name_parts = file_name.split('_')
        
        file_folder = file_name_parts[6]
        
        if file_folder!=str(i)+'.png':
            new_file_name=''
            for j in range(6):
                new_file_name+=file_name_parts[j]+'_'
            new_file_name+=str(i)+'.png'
            
            #print(file_name, ' -> ', new_file_name)
            os.rename(os.path.join(curr_dir, file_name), os.path.join(curr_dir, new_file_name))

### Mean aesthetic score = 2.455

In [3]:
MEAN_AES_SCORE = 2.455

In [4]:
raw_aestheticScores_df = pd.read_csv(os.path.join('..', 'dataset', 'BanglaLekha-Isolated', 'aesthetic-scores.csv'))

In [5]:
raw_aestheticScores_df.head()

Unnamed: 0,Form No,Out Of 5 (1st Person),Out of 5 (2nd Person),Out of 5 (3rd Person),Average
0,1,2,4,5,3.666667
1,2,1,3,4,2.666667
2,3,3,5,4,4.0
3,4,2,3,4,3.0
4,5,1,3,3,2.333333


In [6]:
'''
returns a map of key->value : form_id->aesthetic_score
'''
def get_form_aesScores(aestheticScores_df):
    
    form_aesScores = {} 
    
    form_ids = []
    aes_scores = []
    for form_id in aestheticScores_df['Form No']:
        form_ids.append(form_id)
    for aes_score in aestheticScores_df['Average']:
        aes_scores.append(aes_score)
        
    for i in range(len(form_ids)):
        form_aesScores[form_ids[i]] = aes_scores[i]
        
    return form_aesScores

In [7]:
'''
create 4 PARALLEL lists- image_file_name[], aesthetic_score[], aesthetic_quality[], prob_good[]
image_file_name[] = name of the sample image file
aesthetic_score[] = aesthetic score assigned in decimal point number
aesthetic_quality[] = label- 'good' or 'bad'
prob_good[] = 1 if good, 0 if bad 

returns a dataframe with each list as a column
'''
def prepare_hwAesthetics_df(mp_formWise_aesScores, begin=1, end=84, mean_aesScore = 2.455):

    image_file_name, aesthetic_score, aesthetic_quality, prob_good = [], [], [], []
    
    images_dir = os.path.join('..', 'dataset', 'BanglaLekha-Isolated', 'Images')
    
    for i in range(begin, end+1):
        
        curr_dir = os.path.join(images_dir, str(i))
        
        for file_name in os.listdir(curr_dir): 
            if os.path.isfile(os.path.join(curr_dir, file_name)) == False:
                continue
                
            form_id = int(file_name.split('_')[5])
            
            if form_id not in mp_formWise_aesScores:
                continue
            
            aes_score = mp_formWise_aesScores[form_id]
            
            image_file_name.append(file_name)
            aesthetic_score.append(aes_score)
            if aes_score >= mean_aesScore:
                aesthetic_quality.append('good')
                prob_good.append(1)
            else:
                aesthetic_quality.append('bad')
                prob_good.append(0)
                
    return pd.DataFrame({'image_file_name': image_file_name,
                       'aesthetic_score': aesthetic_score,
                       'aesthetic_quality': aesthetic_quality,
                       'probability_good': prob_good})

In [8]:
mp_formWise_aesScores = get_form_aesScores(raw_aestheticScores_df)
print(len(mp_formWise_aesScores))

hw_aes_df = prepare_hwAesthetics_df(mp_formWise_aesScores)
hw_aes_df.head()

2000


Unnamed: 0,image_file_name,aesthetic_score,aesthetic_quality,probability_good
0,02_0002_0_23_1016_1067_1.png,1.333333,bad,0
1,02_0002_0_23_1016_1730_1.png,1.333333,bad,0
2,01_0001_0_15_0916_0422_1.png,3.666667,good,1
3,02_0002_0_24_1016_1542_1.png,2.0,bad,0
4,01_0001_1_16_0916_0320_1.png,4.0,good,1


In [9]:
hw_aes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166105 entries, 0 to 166104
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   image_file_name    166105 non-null  object 
 1   aesthetic_score    166105 non-null  float64
 2   aesthetic_quality  166105 non-null  object 
 3   probability_good   166105 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 5.1+ MB


In [10]:
hw_aes_df['image_file_name'].value_counts()

02_0002_0_21_1016_0809_56.png    1
02_0002_0_24_1016_1486_78.png    1
01_0001_1_16_0916_0119_58.png    1
01_0001_0_17_0916_0155_79.png    1
02_0002_0_23_1016_1485_79.png    1
                                ..
02_0002_0_24_1016_1255_80.png    1
01_0001_0_16_0916_0234_31.png    1
02_0002_0_21_1016_0782_19.png    1
02_0002_0_22_1016_1030_12.png    1
02_0002_1_22_1016_1479_76.png    1
Name: image_file_name, Length: 166105, dtype: int64

In [11]:
# save prepared dataframe as csv file
hw_aes_df.to_csv(os.path.join('..', 'dataset', 'prepared-datasets', 'aesthetics_all.csv'), index=False)

  
  ## Forming the Train, Validation & Test sets  
  

In [12]:
raw_aestheticScores_df.head()

Unnamed: 0,Form No,Out Of 5 (1st Person),Out of 5 (2nd Person),Out of 5 (3rd Person),Average
0,1,2,4,5,3.666667
1,2,1,3,4,2.666667
2,3,3,5,4,4.0
3,4,2,3,4,3.0
4,5,1,3,3,2.333333


In [13]:
# drop unnecessary columns
raw_aestheticScores_df = raw_aestheticScores_df[['Form No', 'Average']]
raw_aestheticScores_df.head()

Unnamed: 0,Form No,Average
0,1,3.666667
1,2,2.666667
2,3,4.0
3,4,3.0
4,5,2.333333


In [14]:
# create column for labels- 'good', 'bad'
aesthetics_labels = []
for aes_score in raw_aestheticScores_df['Average']:
    if aes_score >= MEAN_AES_SCORE:
        aesthetics_labels.append('good')
    else:
        aesthetics_labels.append('bad')

raw_aestheticScores_df['aesthetic_quality'] = aesthetics_labels

raw_aestheticScores_df.head()

Unnamed: 0,Form No,Average,aesthetic_quality
0,1,3.666667,good
1,2,2.666667,good
2,3,4.0,good
3,4,3.0,good
4,5,2.333333,bad


In [15]:
# split train:test - 80:20 keeping 'good', 'bad' label ratio stratified
train_raw_df, test_raw_df = train_test_split(raw_aestheticScores_df, test_size=0.2, random_state=42, shuffle=True, 
                                             stratify=raw_aestheticScores_df['aesthetic_quality'])

# split train:validation - 80:20 keeping 'good', 'bad' label ratio stratified
train_raw_df, val_raw_df = train_test_split(train_raw_df, test_size=0.2, random_state=42, shuffle=True, 
                                             stratify=train_raw_df['aesthetic_quality'])

In [16]:
train_raw_df['aesthetic_quality'].value_counts()

bad     670
good    610
Name: aesthetic_quality, dtype: int64

In [17]:
val_raw_df['aesthetic_quality'].value_counts()

bad     168
good    152
Name: aesthetic_quality, dtype: int64

In [18]:
test_raw_df['aesthetic_quality'].value_counts()

bad     209
good    191
Name: aesthetic_quality, dtype: int64

In [19]:
#train_raw_df.head()
#val_raw_df.head()
#test_raw_df.head()

In [20]:
train_hwAes_df = prepare_hwAesthetics_df(get_form_aesScores(train_raw_df))
# train_hwAes_df.head()
train_hwAes_df['aesthetic_quality'].value_counts()

bad     55525
good    50703
Name: aesthetic_quality, dtype: int64

In [21]:
val_hwAes_df = prepare_hwAesthetics_df(get_form_aesScores(val_raw_df))
#val_hwAes_df.head()
val_hwAes_df['aesthetic_quality'].value_counts()

bad     13988
good    12487
Name: aesthetic_quality, dtype: int64

In [22]:
test_hwAes_df = prepare_hwAesthetics_df(get_form_aesScores(test_raw_df))
#test_hwAes_df.head()
test_hwAes_df['aesthetic_quality'].value_counts()

bad     17459
good    15943
Name: aesthetic_quality, dtype: int64

In [23]:
# save prepared dataframes as csv file
train_hwAes_df.to_csv(os.path.join('..', 'dataset', 'prepared-datasets', 'aesthetics_train.csv'), index=False)
val_hwAes_df.to_csv(os.path.join('..', 'dataset', 'prepared-datasets', 'aesthetics_val.csv'), index=False)
test_hwAes_df.to_csv(os.path.join('..', 'dataset', 'prepared-datasets', 'aesthetics_test.csv'), index=False)

  
  ## Split the sample images dataset to separate train, validation, test folders  
  

In [24]:
'''
params- dataframe with image file names
returns set of image file names
'''
def get_img_file_names(hwAes_df):
    img_file_names = set()
    for img in hwAes_df['image_file_name']:
        img_file_names.add(img)
    
    return img_file_names

# create set of image file names
train_img_files = get_img_file_names(train_hwAes_df)
val_img_files = get_img_file_names(val_hwAes_df)
test_img_files = get_img_file_names(test_hwAes_df)

In [31]:
print(len(train_img_files)+len(val_img_files)+len(test_img_files))
print(train_img_files&val_img_files, train_img_files&test_img_files, val_img_files&test_img_files)

166105
set() set() set()


In [35]:
# copy (and replace) files to corresponding destination folders
import shutil

images_dir = os.path.join('..', 'dataset', 'BanglaLekha-Isolated', 'Images')
    
for i in range(1, 84+1):
    
    curr_dir = os.path.join(images_dir, str(i))

    for file_name in os.listdir(curr_dir): 
        if os.path.isfile(os.path.join(curr_dir, file_name)) == False:
            continue 
        
        curr_path = os.path.join(curr_dir, file_name)
        dest_path = os.path.join('..', 'dataset', 'prepared-datasets')
        
        dest_folder='who_are_you'        
        if file_name in train_img_files:
            dest_folder = 'train_images'
        elif file_name in val_img_files:
            dest_folder = 'validation_images'
        elif file_name in test_img_files:
            dest_folder = 'test_images'
        
        dest_path = os.path.join(dest_path, dest_folder)
        
        shutil.copy(curr_path, dest_path)