In [1]:
# required for jupyter notebook
%matplotlib inline 

import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split

import os
import shutil

### Upon analysis we found that folder 74 contained image files with '_75' at the end of their file names. The code below fixes such file names.

In [2]:
# Every image in class folder N (1..84) is expected to be named
# '<f0>_<f1>_<f2>_<f3>_<f4>_<f5>_N.png'. Files whose last field names a
# different folder are renamed so that the last field matches the folder.
images_dir = os.path.join('..', 'dataset', 'BanglaLekha-Isolated', 'Images')

for i in range(1, 84+1):

    curr_dir = os.path.join(images_dir, str(i))
    expected_suffix = str(i) + '.png'

    for file_name in os.listdir(curr_dir):
        if not os.path.isfile(os.path.join(curr_dir, file_name)):
            continue

        file_name_parts = file_name.split('_')

        # Names without exactly 7 '_'-separated fields (e.g. '.DS_Store',
        # 'Thumbs.db') are not dataset images: leave them untouched instead
        # of failing with IndexError half-way through the renaming.
        if len(file_name_parts) != 7:
            continue

        if file_name_parts[6] != expected_suffix:
            # keep the first six fields, replace only the folder field
            new_file_name = '_'.join(file_name_parts[:6] + [expected_suffix])

            #print(file_name, ' -> ', new_file_name)
            os.rename(os.path.join(curr_dir, file_name), os.path.join(curr_dir, new_file_name))

### Mean aesthetic score = 2.455

In [3]:
MEAN_AES_SCORE = 2.455

In [4]:
raw_aestheticScores_df = pd.read_csv(os.path.join('..', 'dataset', 'BanglaLekha-Isolated', 'aesthetic-scores.csv'))

In [5]:
raw_aestheticScores_df.head()

Unnamed: 0,Form No,Out Of 5 (1st Person),Out of 5 (2nd Person),Out of 5 (3rd Person),Average
0,1,2,4,5,3.666667
1,2,1,3,4,2.666667
2,3,3,5,4,4.0
3,4,2,3,4,3.0
4,5,1,3,3,2.333333


In [6]:
def get_form_aesScores(aestheticScores_df):
    '''
    returns a map of key->value : form_id->aesthetic_score

    params- aestheticScores_df: dataframe with a 'Form No' column (form ids)
            and an 'Average' column (aesthetic scores)
    returns- dict mapping each 'Form No' value to the 'Average' value of the
             same row; if a form id occurs more than once the last row wins
    '''
    # zip pairs the two columns row by row; dict() keeps the last pair per key
    return dict(zip(aestheticScores_df['Form No'], aestheticScores_df['Average']))

In [7]:
def _score_to_class_probs(aes_score, num_classes=5):
    '''
    Spread a score over the integer classes just below and just above it.

    params- aes_score: score between 1 and num_classes (inclusive)
            num_classes: number of integer classes (1..num_classes)
    returns- list of num_classes probabilities rounded to 2 decimals,
             e.g. 3.25 -> [0.0, 0.0, 0.75, 0.25, 0.0] and 2.0 -> [0.0, 1.0, 0.0, 0.0, 0.0]
    raises- ValueError if aes_score is outside 1..num_classes
    '''
    # Without this check a score below 1 would index position -1 and silently
    # put probability mass on the highest class.
    if not 1 <= aes_score <= num_classes:
        raise ValueError('aesthetic score %r is outside the range 1 to %d' % (aes_score, num_classes))

    lower = math.floor(aes_score)
    upper = math.ceil(aes_score)
    fraction = aes_score - float(lower)

    probs = [0.0] * num_classes
    probs[lower - 1] = 1.0 - fraction
    if lower != upper:
        probs[upper - 1] = fraction

    return [round(p, 2) for p in probs]


def prepare_hwAesthetics_df(mp_formWise_aesScores, begin=1, end=84, mean_aesScore = 2.455, images_dir=None):
    '''
    Build a dataframe with one row per sample image found in class folders begin..end.

    params- mp_formWise_aesScores: map of form_id -> aesthetic_score; an image whose
                form id (6th '_'-separated field of its file name) is not a key is skipped
            begin, end: first and last class folder number to scan (both inclusive)
            mean_aesScore: scores >= this value are labelled 'good', the rest 'bad'
            images_dir: folder that holds the numbered class folders; defaults to
                ../dataset/BanglaLekha-Isolated/Images

    columns of the returned dataframe-
    image_file_name = name of the sample image file
    aesthetic_score = aesthetic score assigned in decimal point number
    aesthetic_quality = label- 'good' or 'bad'
    probability_good = 1 if good, 0 if bad
    prob_1 ... prob_5 = probability of the score being 1 to 5, meaning,
        if a sample's ground truth score is 3.33
        then prob_1=0, prob_2=0, prob_3=0.67, prob_4=0.33, prob_5=0
    score_label = aesthetic score rounded to an integer with round()

    raises- ValueError if a score used for an image lies outside 1 to 5
    '''
    if images_dir is None:
        images_dir = os.path.join('..', 'dataset', 'BanglaLekha-Isolated', 'Images')

    # parallel lists, one per output column (insertion order = column order)
    columns = {name: [] for name in ('image_file_name', 'aesthetic_score',
                                     'aesthetic_quality', 'probability_good',
                                     'prob_1', 'prob_2', 'prob_3', 'prob_4', 'prob_5',
                                     'score_label')}

    for i in range(begin, end+1):

        curr_dir = os.path.join(images_dir, str(i))

        for file_name in os.listdir(curr_dir):
            if not os.path.isfile(os.path.join(curr_dir, file_name)):
                continue

            form_id = int(file_name.split('_')[5])

            if form_id not in mp_formWise_aesScores:
                continue

            aes_score = mp_formWise_aesScores[form_id]
            is_good = aes_score >= mean_aesScore

            columns['image_file_name'].append(file_name)
            columns['aesthetic_score'].append(aes_score)
            columns['aesthetic_quality'].append('good' if is_good else 'bad')
            columns['probability_good'].append(1 if is_good else 0)

            # if score=3.25, score_probs= [0.0, 0.0, 0.75, 0.25, 0.0]
            score_probs = _score_to_class_probs(aes_score)
            for class_no, prob in enumerate(score_probs, start=1):
                columns['prob_' + str(class_no)].append(prob)

            columns['score_label'].append(round(aes_score))

    return pd.DataFrame(columns)

In [8]:
mp_formWise_aesScores = get_form_aesScores(raw_aestheticScores_df)
print(len(mp_formWise_aesScores))

hw_aes_df = prepare_hwAesthetics_df(mp_formWise_aesScores)
hw_aes_df.head()

2000


Unnamed: 0,image_file_name,aesthetic_score,aesthetic_quality,probability_good,prob_1,prob_2,prob_3,prob_4,prob_5,score_label
0,02_0002_0_23_1016_1067_1.png,1.333333,bad,0,0.67,0.33,0.0,0.0,0.0,1
1,02_0002_0_23_1016_1730_1.png,1.333333,bad,0,0.67,0.33,0.0,0.0,0.0,1
2,01_0001_0_15_0916_0422_1.png,3.666667,good,1,0.0,0.0,0.33,0.67,0.0,4
3,02_0002_0_24_1016_1542_1.png,2.0,bad,0,0.0,1.0,0.0,0.0,0.0,2
4,01_0001_1_16_0916_0320_1.png,4.0,good,1,0.0,0.0,0.0,1.0,0.0,4


In [9]:
hw_aes_df.tail()

Unnamed: 0,image_file_name,aesthetic_score,aesthetic_quality,probability_good,prob_1,prob_2,prob_3,prob_4,prob_5,score_label
166100,02_0002_0_20_1016_0800_84.png,1.0,bad,0,1.0,0.0,0.0,0.0,0.0,1
166101,01_0001_1_18_0916_1880_84.png,4.0,good,1,0.0,0.0,0.0,1.0,0.0,4
166102,02_0002_0_20_1016_0877_84.png,1.0,bad,0,1.0,0.0,0.0,0.0,0.0,1
166103,02_0002_0_21_1016_1395_84.png,1.666667,bad,0,0.33,0.67,0.0,0.0,0.0,2
166104,02_0002_0_19_1016_0812_84.png,2.666667,good,1,0.0,0.33,0.67,0.0,0.0,3


In [10]:
hw_aes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166105 entries, 0 to 166104
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   image_file_name    166105 non-null  object 
 1   aesthetic_score    166105 non-null  float64
 2   aesthetic_quality  166105 non-null  object 
 3   probability_good   166105 non-null  int64  
 4   prob_1             166105 non-null  float64
 5   prob_2             166105 non-null  float64
 6   prob_3             166105 non-null  float64
 7   prob_4             166105 non-null  float64
 8   prob_5             166105 non-null  float64
 9   score_label        166105 non-null  int64  
dtypes: float64(6), int64(2), object(2)
memory usage: 12.7+ MB


In [11]:
hw_aes_df['score_label'].value_counts()

3    49931
2    49851
1    37121
4    26347
5     2855
Name: score_label, dtype: int64

In [12]:
# save prepared dataframe as csv file
# to_csv does not create missing folders, so make sure the target folder exists first
prepared_dir = os.path.join('..', 'dataset', 'prepared-datasets')
os.makedirs(prepared_dir, exist_ok=True)
hw_aes_df.to_csv(os.path.join(prepared_dir, 'aesthetics_all.csv'), index=False)

  
  ## Forming the Train, Validation & Test sets for 'Good', 'Bad' 
  

In [13]:
raw_aestheticScores_df.head()

Unnamed: 0,Form No,Out Of 5 (1st Person),Out of 5 (2nd Person),Out of 5 (3rd Person),Average
0,1,2,4,5,3.666667
1,2,1,3,4,2.666667
2,3,3,5,4,4.0
3,4,2,3,4,3.0
4,5,1,3,3,2.333333


In [14]:
# drop unnecessary columns
raw_aestheticScores_df = raw_aestheticScores_df[['Form No', 'Average']]
raw_aestheticScores_df.head()

Unnamed: 0,Form No,Average
0,1,3.666667
1,2,2.666667
2,3,4.0
3,4,3.0
4,5,2.333333


In [15]:
# create column for labels- 'good', 'bad'
# a form is 'good' when its average score reaches MEAN_AES_SCORE, otherwise 'bad'
aesthetics_labels = ['good' if aes_score >= MEAN_AES_SCORE else 'bad'
                     for aes_score in raw_aestheticScores_df['Average']]

# raw_aestheticScores_df is a column selection of the frame read from the csv;
# assign() returns a new frame instead of writing into that selection, which
# avoids pandas' SettingWithCopyWarning
raw_aestheticScores_df = raw_aestheticScores_df.assign(aesthetic_quality=aesthetics_labels)

raw_aestheticScores_df.head()

Unnamed: 0,Form No,Average,aesthetic_quality
0,1,3.666667,good
1,2,2.666667,good
2,3,4.0,good
3,4,3.0,good
4,5,2.333333,bad


In [16]:
def prep_train_val_test_dfs(raw_df_with_label, label_name, test_size=0.2, val_size=0.2, random_state=42):
    '''
    Split the forms into train / validation / test and build the per-image
    dataframe of each part with prepare_hwAesthetics_df.

    The split is done on the rows of raw_df_with_label (one row per form), so
    all images of one form end up in the same part.

    params- raw_df_with_label: dataframe created using original dataset's aesthetic scores
                with a class label column; must also hold the 'Form No' and 'Average'
                columns read by get_form_aesScores
            label_name: name of the class label, such as- 'aesthetic_quality', 'score_label'
            test_size: share of all forms held out for test
            val_size: share of the remaining (non-test) forms held out for validation
            random_state: seed used for both splits
    returns- (train, validation, test) dataframes, in that order; with the defaults the
             forms are split 64:16:20, stratified on label_name
    '''
    # split (train+validation):test keeping label ratio stratified
    train_val_raw_df, test_raw_df = train_test_split(
        raw_df_with_label, test_size=test_size, random_state=random_state, shuffle=True,
        stratify=raw_df_with_label[label_name])

    # split train:validation keeping label ratio stratified
    train_raw_df, val_raw_df = train_test_split(
        train_val_raw_df, test_size=val_size, random_state=random_state, shuffle=True,
        stratify=train_val_raw_df[label_name])

    return tuple(prepare_hwAesthetics_df(get_form_aesScores(part_df))
                 for part_df in (train_raw_df, val_raw_df, test_raw_df))

In [17]:
train_hwAes_df, val_hwAes_df, test_hwAes_df = prep_train_val_test_dfs(raw_aestheticScores_df, 'aesthetic_quality')

In [18]:
train_hwAes_df['aesthetic_quality'].value_counts()

bad     55525
good    50703
Name: aesthetic_quality, dtype: int64

In [19]:
val_hwAes_df['aesthetic_quality'].value_counts()

bad     13988
good    12487
Name: aesthetic_quality, dtype: int64

In [20]:
test_hwAes_df['aesthetic_quality'].value_counts()

bad     17459
good    15943
Name: aesthetic_quality, dtype: int64

In [21]:
# drop columns not related to aesthetic quality
# errors='ignore' keeps this cell re-runnable: on a second run the columns are already gone
score_only_columns = ['prob_1', 'prob_2', 'prob_3', 'prob_4', 'prob_5', 'score_label']
train_hwAes_df = train_hwAes_df.drop(columns=score_only_columns, errors='ignore')
val_hwAes_df = val_hwAes_df.drop(columns=score_only_columns, errors='ignore')
test_hwAes_df = test_hwAes_df.drop(columns=score_only_columns, errors='ignore')

# save prepared dataframes as csv file
# to_csv does not create missing folders, so make sure the target folder exists first
two_class_dir = os.path.join('..', 'dataset', 'prepared-datasets', '2_class')
os.makedirs(two_class_dir, exist_ok=True)
train_hwAes_df.to_csv(os.path.join(two_class_dir, 'aesthetics_train.csv'), index=False)
val_hwAes_df.to_csv(os.path.join(two_class_dir, 'aesthetics_val.csv'), index=False)
test_hwAes_df.to_csv(os.path.join(two_class_dir, 'aesthetics_test.csv'), index=False)

  
  ## Split the sample images dataset to separate train, validation, test folders for 2 classes
  

In [22]:
def get_img_file_names(hwAes_df):
    '''
    params- hwAes_df: dataframe with an 'image_file_name' column
    returns- set of the distinct values of that column
    '''
    return set(hwAes_df['image_file_name'])

def prep_train_val_test_images(train_df, val_df, test_df, dest_folder_name):
    '''
    Copy every image of class folders 1..84 into the folder of the part it belongs to.

    params- train_df, val_df, test_df: dataframes with an 'image_file_name' column
            dest_folder_name: sub-folder of ../dataset/prepared-datasets to copy into
    Images are copied to <dest>/train_images, <dest>/validation_images or
    <dest>/test_images; an image listed in none of the dataframes goes to <dest>/unknown.
    Prints the total number of distinct file names and the pairwise overlaps
    between the three parts (expected to be empty sets).
    '''
    # create set of image file names
    train_img_files = get_img_file_names(train_df)
    val_img_files = get_img_file_names(val_df)
    test_img_files = get_img_file_names(test_df)

    print(len(train_img_files)+len(val_img_files)+len(test_img_files))
    print(train_img_files&val_img_files, train_img_files&test_img_files, val_img_files&test_img_files)

    # copy (and replace) files to corresponding destination folders

    images_dir = os.path.join('..', 'dataset', 'BanglaLekha-Isolated', 'Images')
    dest_root = os.path.join('..', 'dataset', 'prepared-datasets', dest_folder_name)
    ready_dirs = set()  # destination folders already known to exist

    for i in range(1, 84+1):
        curr_dir = os.path.join(images_dir, str(i))

        for file_name in os.listdir(curr_dir):
            curr_path = os.path.join(curr_dir, file_name)
            if not os.path.isfile(curr_path):
                continue

            if file_name in train_img_files:
                dest_folder = 'train_images'
            elif file_name in val_img_files:
                dest_folder = 'validation_images'
            elif file_name in test_img_files:
                dest_folder = 'test_images'
            else:
                dest_folder = 'unknown'

            dest_path = os.path.join(dest_root, dest_folder)

            # shutil.copy treats a missing destination as a file name, so without
            # the folder every image would overwrite one single file
            if dest_path not in ready_dirs:
                os.makedirs(dest_path, exist_ok=True)
                ready_dirs.add(dest_path)

            shutil.copy(curr_path, dest_path)

    print('images test-validation-train partition created')

In [23]:
prep_train_val_test_images(train_hwAes_df, val_hwAes_df, test_hwAes_df, '2_class')

166105
set() set() set()
images test-validation-train partition created


  
  ## Forming the Train, Validation & Test sets for score_label (integer classes 1 to 5)
  

In [24]:
# drop unnecessary columns
raw_scoreLabels_df = raw_aestheticScores_df[['Form No', 'Average']]
raw_scoreLabels_df.head()

Unnamed: 0,Form No,Average
0,1,3.666667
1,2,2.666667
2,3,4.0
3,4,3.0
4,5,2.333333


In [25]:
# create column for score_labels- 1, 2, 3, 4, 5
# each label is the form's average score rounded with round()
score_labels = [round(aes_score) for aes_score in raw_scoreLabels_df['Average']]

# raw_scoreLabels_df is a column selection of another frame; assign() returns a
# new frame instead of writing into that selection, which avoids pandas'
# SettingWithCopyWarning
raw_scoreLabels_df = raw_scoreLabels_df.assign(score_label=score_labels)

raw_scoreLabels_df.head()

Unnamed: 0,Form No,Average,score_label
0,1,3.666667,4
1,2,2.666667,3
2,3,4.0,4
3,4,3.0,3
4,5,2.333333,2


In [26]:
raw_scoreLabels_df['score_label'].value_counts()

3    599
2    596
1    451
4    319
5     35
Name: score_label, dtype: int64

In [27]:
train_hwScore_df, val_hwScore_df, test_hwScore_df = prep_train_val_test_dfs(raw_scoreLabels_df, 'score_label')

In [28]:
train_hwScore_df['score_label'].value_counts()

3    32086
2    31949
1    23865
4    16950
5     1847
Name: score_label, dtype: int64

In [29]:
val_hwScore_df['score_label'].value_counts()

2    7948
3    7882
1    5798
4    4028
5     420
Name: score_label, dtype: int64

In [30]:
test_hwScore_df['score_label'].value_counts()

3    9963
2    9954
1    7458
4    5369
5     588
Name: score_label, dtype: int64

In [31]:
# remove information not related to score_label
# errors='ignore' keeps this cell re-runnable: on a second run the columns are already gone
quality_only_columns = ['aesthetic_quality', 'probability_good']
train_hwScore_df = train_hwScore_df.drop(columns=quality_only_columns, errors='ignore')
val_hwScore_df = val_hwScore_df.drop(columns=quality_only_columns, errors='ignore')
test_hwScore_df = test_hwScore_df.drop(columns=quality_only_columns, errors='ignore')

# save prepared dataframes as csv file
# to_csv does not create missing folders, so make sure the target folder exists first
five_class_dir = os.path.join('..', 'dataset', 'prepared-datasets', '5_class')
os.makedirs(five_class_dir, exist_ok=True)
train_hwScore_df.to_csv(os.path.join(five_class_dir, 'scoreLabel_train.csv'), index=False)
val_hwScore_df.to_csv(os.path.join(five_class_dir, 'scoreLabel_val.csv'), index=False)
test_hwScore_df.to_csv(os.path.join(five_class_dir, 'scoreLabel_test.csv'), index=False)

  
  ## Split the sample images dataset to separate train, validation, test folders for 5 classes
  

In [32]:
prep_train_val_test_images(train_hwScore_df, val_hwScore_df, test_hwScore_df, '5_class')

166105
set() set() set()
images test-validation-train partition created


## Add an actual_prob_good column (aesthetic_score / 5) to the 2_class CSV files for binary classification

In [2]:
def fix_bin_classification_csv(df):
    '''
    Add an 'actual_prob_good' column holding aesthetic_score / 5.0.

    params- df: dataframe with an 'aesthetic_score' column (modified in place)
    returns- the same dataframe object, with the extra column
    '''
    # divide the whole column at once, then hand the values over as a plain list
    scaled_scores = (df['aesthetic_score'] / 5.0).tolist()
    df['actual_prob_good'] = scaled_scores

    return df

In [5]:
train_df = pd.read_csv(os.path.join('..', 'dataset', 'prepared-datasets', '2_class', 'aesthetics_train.csv'))
train_df = fix_bin_classification_csv(train_df)
train_df.head()

Unnamed: 0,image_file_name,aesthetic_score,aesthetic_quality,probability_good,actual_prob_good
0,02_0002_0_23_1016_1067_1.png,1.333333,bad,0,0.266667
1,02_0002_0_23_1016_1730_1.png,1.333333,bad,0,0.266667
2,02_0002_0_24_1016_1542_1.png,2.0,bad,0,0.4
3,01_0001_1_16_0916_0320_1.png,4.0,good,1,0.8
4,02_0002_0_21_1016_0988_1.png,1.333333,bad,0,0.266667


In [7]:
val_df = pd.read_csv(os.path.join('..', 'dataset', 'prepared-datasets', '2_class', 'aesthetics_val.csv'))
val_df = fix_bin_classification_csv(val_df)
val_df.head()

Unnamed: 0,image_file_name,aesthetic_score,aesthetic_quality,probability_good,actual_prob_good
0,01_0001_0_15_0916_0422_1.png,3.666667,good,1,0.733333
1,01_0001_1_17_0916_0332_1.png,4.0,good,1,0.8
2,02_0002_0_18_1016_1585_1.png,2.0,bad,0,0.4
3,02_0002_0_20_1016_0777_1.png,2.333333,bad,0,0.466667
4,02_0002_0_10_1116_1994_1.png,1.0,bad,0,0.2


In [8]:
test_df = pd.read_csv(os.path.join('..', 'dataset', 'prepared-datasets', '2_class', 'aesthetics_test.csv'))
test_df = fix_bin_classification_csv(test_df)
test_df.head()

Unnamed: 0,image_file_name,aesthetic_score,aesthetic_quality,probability_good,actual_prob_good
0,02_0002_0_20_1016_1268_1.png,1.0,bad,0,0.2
1,01_0001_0_18_0916_0260_1.png,3.333333,good,1,0.666667
2,02_0002_0_21_1016_1119_1.png,3.666667,good,1,0.733333
3,01_0001_0_19_0916_1895_1.png,3.0,good,1,0.6
4,02_0002_0_22_1016_1496_1.png,1.333333,bad,0,0.266667


In [9]:
train_df.to_csv(os.path.join('..', 'dataset', 'prepared-datasets', '2_class', 'aesthetics_train.csv'), index=False)
val_df.to_csv(os.path.join('..', 'dataset', 'prepared-datasets', '2_class', 'aesthetics_val.csv'), index=False)
test_df.to_csv(os.path.join('..', 'dataset', 'prepared-datasets', '2_class', 'aesthetics_test.csv'), index=False)