This notebook takes the icdar training data and generates a csv file with writer,same_text,isEng,train,file_name,male columns (file_name is the absolute path)

In [88]:
#imports
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import joblib
from datetime import datetime
import random

In [89]:
#test functions
def male_counts(sex_df):
    # Get the counts of each unique value in the "male" column
    male_counts = sex_df['male'].value_counts(dropna=False)

    # Print the counts
    print("Number of times 'male' is 0:", male_counts.get(0, 0))
    print("Number of times 'male' is 1:", male_counts.get(1, 0))
    print("Number of times 'male' is something else:", len(sex_df) - male_counts.get(0, 0) - male_counts.get(1, 0))
def check_if_both(train_df, column_name='same_text'):
    # Group by writer and check if both same_text=1 and same_text=0 are present
    writer_groups = train_df.groupby('writer')[column_name].nunique()

    # Filter writers that do not have both same_text=1 and same_text=0
    writers_missing_both = writer_groups[writer_groups != 2]

    if writers_missing_both.empty:
        print(f"All writers have both {column_name}=1 and {column_name}=0.")
    else:
        print(f"The following writers do not have {column_name}=1 and {column_name}=0")
        print(writers_missing_both)
def check_randomization(train_df):
    # Get the number of rows where train == 1
    train_1_count = train_df[train_df['train'] == 1].shape[0]

    # Calculate the fraction
    train_1_fraction = train_1_count / train_df.shape[0]

    print(f"Number of rows where train == 1: {train_1_count}")
    print(f"Fraction of rows where train == 1: {train_1_fraction:.2f}")
def check_grouping(train_df):
    # Group by writer and check if the train column has a constant value
    constant_train_check = train_df.groupby('writer')['train'].nunique()

    # Find writers where the train column is not constant
    non_constant_writers = constant_train_check[constant_train_check > 1]

    if non_constant_writers.empty:
        print("The train column is constant for all writers.")
    else:
        print("The train column is not constant for the following writers:")
        print(non_constant_writers)
def check_occurrences(train_df):
    # Count the occurrences of each unique writer value
    writer_counts = train_df['writer'].value_counts()

    # Check if all writers have exactly 4 occurrences
    if (writer_counts == 4).all():
        print("Each unique writer value occurs on exactly 4 rows.")
    else:
        print("Some writers do not occur exactly 4 times.")
        print(writer_counts[writer_counts != 4])
def check_title_association(train_df):
    random_numbers = random.sample(range(1, 282*4+1), 10)
    for n in random_numbers:
        print(n)
        print(train_df['file_name'][n])
        print(train_df['writer'][n],train_df['isEng'][n], train_df['same_text'][n])
        print('-------------')
def check_sex_association(train_df,sex_df):
    random_numbers = random.sample(range(1, 283), 10)
    for n in random_numbers:
        print(n)
        print(train_df[train_df['writer'] == n][['writer','male']])
        print(sex_df[sex_df['writer'] == n][['writer','male']])
        print('-------------')
def check_if_seed(train_df):
    train_0_writers = train_df[train_df['train'] == 0]['writer'].unique().tolist()
    train_1_writers = train_df[train_df['train'] == 1]['writer'].unique().tolist()
    return train_0_writers, train_1_writers


In [128]:
# Set the random seed for reproducibility
seed=42
np.random.seed(seed)

In [129]:
data_PATH="D:\\download\\PD project\\datasets\\ICDAR 2013 - Gender Identification Competition Dataset"
image_PATH=data_PATH+"\\unzipped"
source_path="D:\\burtm\\Visual_studio_code\\PD_related_projects"

In [130]:
sex_df = pd.read_csv(os.path.join(data_PATH, "train_answers.csv"),delimiter=',')
sex_df.head(15)

Unnamed: 0,writer,male
0,1,0
1,2,0
2,3,0
3,4,0
4,5,1
5,6,1
6,7,1
7,8,1
8,9,0
9,10,1


In [131]:
male_counts(sex_df)

Number of times 'male' is 0: 143
Number of times 'male' is 1: 139
Number of times 'male' is something else: 0


In [132]:
# Set the probability of being 0
p_train = 0.9
N=282

# Create a dataframe with writer column from 1 to 282
writers_df = pd.DataFrame({'writer': np.arange(1, N+1)})

# Add a train column that is randomly 0 or 1 with probability p of being 0
writers_df['train'] = np.random.choice([0, 1], size=len(writers_df), p=[1-p_train, p_train])

In [133]:
val,train = check_if_seed(writers_df)
print(val)
print('val should be (previous run): \n [7, 11, 30, 33, 38, 43, 57, 59, 69, 73, 78, 84, 99, 101, 110, 129, 146, 149, 165, 169, 172, 191, 202, 206, 209, 223, 238, 245, 260])')

[7, 11, 30, 33, 38, 43, 57, 59, 69, 73, 78, 84, 99, 101, 110, 129, 146, 149, 165, 169, 172, 191, 202, 206, 209, 223, 238, 245, 260]
val should be (previous run): 
 [7, 11, 30, 33, 38, 43, 57, 59, 69, 73, 78, 84, 99, 101, 110, 129, 146, 149, 165, 169, 172, 191, 202, 206, 209, 223, 238, 245, 260])


In [134]:
folder_names = [folder for folder in os.listdir(image_PATH) if os.path.isdir(os.path.join(image_PATH, folder))]
# Extract the X part from the folder names
x_values = [int(folder.split('_')[0]) for folder in folder_names]

# Sort both lists based on the X values
sorted_indices = sorted(range(len(x_values)), key=lambda k: x_values[k])
folder_names = [folder_names[i] for i in sorted_indices]
x_values = [x_values[i] for i in sorted_indices]
print(folder_names)

['1_50', '51_100', '101_150', '151_200', '201_250', '251_300']


In [137]:
# Loop through each directory and collect image file paths for labeled images only
image_dirs = [os.path.join(image_PATH, folder) for folder in folder_names]
writers = []
isEng = []
same_text = []
file_names = []

for image_dir in image_dirs:
    for f in os.listdir(image_dir):
        if f.endswith('.jpg'):
            base_name = os.path.splitext(f)[0]  # Remove extension
            parts = base_name.split('_')

            if len(parts) != 2:
                continue  # Skip files that don't follow the expected pattern

            index, version = parts

            if int(version)>2:
                isEng.append(1)
            else:
                isEng.append(0)
            if int(version)%2==0:
                same_text.append(1)
            else:
                same_text.append(0)
            file_names.append(os.path.join(image_dir,f))
            writers.append(int(index))

# Create a dataframe from the extracted index and version values
train_file_df = pd.DataFrame({'writer': writers, 'isEng': isEng, 'same_text': same_text,'file_name':file_names})

# Display the dataframe
print(train_file_df['writer'].nunique())

'''print(train_file_df['writer'].min())
print(train_file_df['writer'].max())
# Check which writer values are missing in the interval 1-300
all_writers = set(range(1, 301))
present_writers = set(train_file_df['writer'].unique())
missing_writers = all_writers - present_writers

print(f"Missing writers in the interval 1-300: {sorted(missing_writers)}")'''

check_if_both(train_file_df,column_name='same_text')
check_if_both(train_file_df, column_name='isEng')

300
All writers have both same_text=1 and same_text=0.
All writers have both isEng=1 and isEng=0.


In [138]:
train_df = train_file_df[train_file_df['writer']<=N]
train_df = train_df.merge(sex_df, on=['writer'], how='left')
train_df = train_df.merge(writers_df, on=['writer'], how='left')
# Display the updated dataframe
train_df.head(10)

Unnamed: 0,writer,isEng,same_text,file_name,male,train
0,1,0,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
1,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
2,1,1,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
3,1,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
4,2,0,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
5,2,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
6,2,1,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
7,2,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
8,3,0,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
9,3,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1


In [139]:
train_df['index'] = train_df.index

# Display the updated dataframe
train_df.head(10)

Unnamed: 0,writer,isEng,same_text,file_name,male,train,index
0,1,0,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,0
1,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,1
2,1,1,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,2
3,1,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,3
4,2,0,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,4
5,2,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,5
6,2,1,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,6
7,2,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,7
8,3,0,0,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,8
9,3,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1,9


In [140]:
check_sex_association(train_df,sex_df)

113
     writer  male
448     113     0
449     113     0
450     113     0
451     113     0
     writer  male
112     113     0
-------------
185
     writer  male
736     185     1
737     185     1
738     185     1
739     185     1
     writer  male
184     185     1
-------------
269
      writer  male
1072     269     0
1073     269     0
1074     269     0
1075     269     0
     writer  male
268     269     0
-------------
278
      writer  male
1108     278     0
1109     278     0
1110     278     0
1111     278     0
     writer  male
277     278     0
-------------
33
     writer  male
128      33     0
129      33     0
130      33     0
131      33     0
    writer  male
32      33     0
-------------
181
     writer  male
720     181     1
721     181     1
722     181     1
723     181     1
     writer  male
180     181     1
-------------
262
      writer  male
1044     262     0
1045     262     0
1046     262     0
1047     262     0
     writer  male
261     262 

In [141]:
check_title_association(train_df)

239
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\51_100\0060_4.jpg
60 1 1
-------------
951
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\201_250\0238_4.jpg
238 1 1
-------------
141
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\1_50\0036_2.jpg
36 0 1
-------------
1023
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\251_300\0256_4.jpg
256 1 1
-------------
883
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\201_250\0221_4.jpg
221 1 1
-------------
956
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\201_250\0240_1.jpg
240 0 0
-------------
731
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\151_200\0183_4.jpg
183 1 1
-------------
444
D:\download\PD

In [142]:
male_counts(train_df)
check_if_both(train_df, column_name='same_text')
check_if_both(train_df, column_name='isEng') 
check_randomization(train_df)
check_grouping(train_df)
check_occurrences(train_df)


Number of times 'male' is 0: 572
Number of times 'male' is 1: 556
Number of times 'male' is something else: 0
All writers have both same_text=1 and same_text=0.
All writers have both isEng=1 and isEng=0.
Number of rows where train == 1: 1012
Fraction of rows where train == 1: 0.90
The train column is constant for all writers.
Each unique writer value occurs on exactly 4 rows.


In [143]:
import json

def get_base_metadata(filepath):
    stats = os.stat(filepath)
    return {
        "full_path": os.path.abspath(filepath),
        "size_bytes": stats.st_size,
        "created": datetime.fromtimestamp(stats.st_ctime).isoformat(),
        "modified": datetime.fromtimestamp(stats.st_mtime).isoformat(),
        "accessed": datetime.fromtimestamp(stats.st_atime).isoformat()
    }

def load_log(path):
    if os.path.exists(path):
        with open(path, 'r') as f:
            return json.load(f)
    return {}

def save_log(data, path):
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)

def add_or_update_file(filepath, log_path, custom_metadata=None):
    """
    Adds or updates a file's metadata entry, including custom metadata.
    """
    if not os.path.isfile(filepath):
        print(f"File not found: {filepath}")
        return
    
    filename = os.path.basename(filepath)
    log = load_log(log_path)

    base_meta = get_base_metadata(filepath)
    entry = log.get(filename, {})

    # Combine existing metadata, new base, and new custom metadata
    entry.update(base_meta)
    if custom_metadata:
        entry.update(custom_metadata)

    log[filename] = entry
    save_log(log, log_path)
    print(f"Updated log for {filename}")

def read_metadata(filepath, log_path):
    """
    Adds or updates a file's metadata entry, including custom metadata.
    """
    if not os.path.isfile(filepath):
        print(f"File not found: {filepath}")
        return
    
    filename = os.path.basename(filepath)
    log = load_log(log_path)

    entry = log.get(filename, None)
    if entry:
        print(f"Metadata for {filename}:")
        for key, value in entry.items():
            print(f"{key}: {value}")
    else:
        print(f"No metadata found for {filename}")


In [144]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = os.path.join(source_path, "outputs", "preprocessed_data")
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

output_file = os.path.join(output_dir, f"icdar_train_df_{timestamp}.csv")
train_df.to_csv(output_file, index=False)



print(f"Dataframe saved to {output_file}")

Dataframe saved to D:\burtm\Visual_studio_code\PD_related_projects\outputs\preprocessed_data\icdar_train_df_20250514_175905.csv


In [146]:
# Example usage:
LOG_FILE = output_dir+"\\file_metadata_log.json"
print(f"Log file path: {LOG_FILE}")
print(f"Output file path: {output_file}")
add_or_update_file(
    output_file, LOG_FILE,
    custom_metadata={
        "seed": seed,
        "description": '''dataframe with the following columns: writer, language, same_text, isEng, train, filename, index; 
        Each row is one of the original dataset image files. I have simplified the code, previously it was unnecessarily complicated''' 
    }
)

Log file path: D:\burtm\Visual_studio_code\PD_related_projects\outputs\preprocessed_data\file_metadata_log.json
Output file path: D:\burtm\Visual_studio_code\PD_related_projects\outputs\preprocessed_data\icdar_train_df_20250514_175905.csv
Updated log for icdar_train_df_20250514_175905.csv


In [147]:
read_metadata(
    output_file,
    log_path=LOG_FILE
)

Metadata for icdar_train_df_20250514_175905.csv:
full_path: D:\burtm\Visual_studio_code\PD_related_projects\outputs\preprocessed_data\icdar_train_df_20250514_175905.csv
size_bytes: 147733
created: 2025-05-14T17:59:05.299873
modified: 2025-05-14T17:59:05.604248
accessed: 2025-05-14T17:59:05.604248
seed: 42
description: dataframe with the following columns: writer, language, same_text, isEng, train, filename, index; 
        Each row is one of the original dataset image files. I have simplified the code, previously it was unnecessarily complicated
