This notebook takes the icdar training data and generates a csv file with writer,same_text,isEng,train,file_name,male columns (file_name is the absolute path)

In [88]:
#imports
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import joblib
from datetime import datetime
import random

In [89]:
#test functions
def male_counts(sex_df):
    # Get the counts of each unique value in the "male" column
    male_counts = sex_df['male'].value_counts(dropna=False)

    # Print the counts
    print("Number of times 'male' is 0:", male_counts.get(0, 0))
    print("Number of times 'male' is 1:", male_counts.get(1, 0))
    print("Number of times 'male' is something else:", len(sex_df) - male_counts.get(0, 0) - male_counts.get(1, 0))
def check_if_both(train_df, column_name='same_text'):
    # Group by writer and check if both same_text=1 and same_text=0 are present
    writer_groups = train_df.groupby('writer')[column_name].nunique()

    # Filter writers that do not have both same_text=1 and same_text=0
    writers_missing_both = writer_groups[writer_groups != 2]

    if writers_missing_both.empty:
        print(f"All writers have both {column_name}=1 and {column_name}=0.")
    else:
        print(f"The following writers do not have {column_name}=1 and {column_name}=0")
        print(writers_missing_both)
def check_randomization(train_df):
    # Get the number of rows where train == 1
    train_1_count = train_df[train_df['train'] == 1].shape[0]

    # Calculate the fraction
    train_1_fraction = train_1_count / train_df.shape[0]

    print(f"Number of rows where train == 1: {train_1_count}")
    print(f"Fraction of rows where train == 1: {train_1_fraction:.2f}")
def check_grouping(train_df):
    # Group by writer and check if the train column has a constant value
    constant_train_check = train_df.groupby('writer')['train'].nunique()

    # Find writers where the train column is not constant
    non_constant_writers = constant_train_check[constant_train_check > 1]

    if non_constant_writers.empty:
        print("The train column is constant for all writers.")
    else:
        print("The train column is not constant for the following writers:")
        print(non_constant_writers)
def check_occurrences(train_df):
    # Count the occurrences of each unique writer value
    writer_counts = train_df['writer'].value_counts()

    # Check if all writers have exactly 4 occurrences
    if (writer_counts == 4).all():
        print("Each unique writer value occurs on exactly 4 rows.")
    else:
        print("Some writers do not occur exactly 4 times.")
        print(writer_counts[writer_counts != 4])
def check_title_association(train_df):
    random_numbers = random.sample(range(1, 282*4+1), 10)
    for n in random_numbers:
        print(n)
        print(train_df['file_name'][n])
        print(train_df['writer'][n],train_df['isEng'][n], train_df['same_text'][n])
        print('-------------')
def check_sex_association(train_df,sex_df):
    random_numbers = random.sample(range(1, 283), 10)
    for n in random_numbers:
        print(n)
        print(train_df[train_df['writer'] == n][['writer','male']])
        print(sex_df[sex_df['writer'] == n][['writer','male']])
        print('-------------')
def check_if_seed(train_df):
    train_0_writers = train_df[train_df['train'] == 0]['writer'].unique().tolist()
    train_1_writers = train_df[train_df['train'] == 1]['writer'].unique().tolist()
    return train_0_writers, train_1_writers


In [90]:
# Set the random seed for reproducibility
seed=42
np.random.seed(seed)

In [91]:
data_PATH="D:\\download\\PD project\\datasets\\ICDAR 2013 - Gender Identification Competition Dataset"
image_PATH=data_PATH+"\\unzipped"
source_path="D:\\burtm\\Visual_studio_code\\PD_related_projects"
train_df_complete = pd.read_csv(os.path.join(data_PATH, "train\\train.csv"))

In [92]:
train_df_complete.head(10)

Unnamed: 0,writer,page_id,language,same_text,tortuosityHist10[0],tortuosityHist10[1],tortuosityHist10[2],tortuosityHist10[3],tortuosityHist10[4],tortuosityHist10[5],...,directions_hist1a2a3a4a5a6a7a8a9a10_220[210],directions_hist1a2a3a4a5a6a7a8a9a10_220[211],directions_hist1a2a3a4a5a6a7a8a9a10_220[212],directions_hist1a2a3a4a5a6a7a8a9a10_220[213],directions_hist1a2a3a4a5a6a7a8a9a10_220[214],directions_hist1a2a3a4a5a6a7a8a9a10_220[215],directions_hist1a2a3a4a5a6a7a8a9a10_220[216],directions_hist1a2a3a4a5a6a7a8a9a10_220[217],directions_hist1a2a3a4a5a6a7a8a9a10_220[218],directions_hist1a2a3a4a5a6a7a8a9a10_220[219]
0,1,1,Arabic,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002457,0.002633,0.002698,0.002929,0.003014,0.003225,0.003572,0.003957,0.004232,0.004804
1,1,2,Arabic,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002498,0.002641,0.002836,0.002999,0.003246,0.003456,0.003709,0.003994,0.004308,0.004706
2,1,3,English,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.00209,0.00244,0.002831,0.003078,0.003438,0.003732,0.0041,0.004329,0.004687,0.004879
3,1,4,English,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002435,0.002823,0.003117,0.003441,0.003788,0.004056,0.004285,0.004527,0.004843,0.005085
4,2,1,Arabic,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002465,0.002724,0.003079,0.003305,0.003425,0.003403,0.00345,0.003626,0.003977,0.004547
5,2,2,Arabic,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002535,0.002808,0.002942,0.003151,0.003247,0.003246,0.003433,0.003516,0.003787,0.004377
6,2,3,English,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002887,0.003271,0.003569,0.003793,0.004049,0.004214,0.004339,0.004598,0.004874,0.005211
7,2,4,English,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002946,0.003295,0.003569,0.003846,0.004083,0.004329,0.004566,0.00485,0.005092,0.005397
8,3,1,Arabic,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002245,0.002325,0.002383,0.002515,0.002725,0.003046,0.003304,0.003629,0.004037,0.004564
9,3,2,Arabic,1,0.92875,0.004071,0.001876,0.0,0.018031,0.025427,...,0.002179,0.002267,0.002385,0.00255,0.002723,0.002836,0.003049,0.003374,0.003757,0.004354


In [93]:
sex_df = pd.read_csv(os.path.join(data_PATH, "train_answers.csv"),delimiter=',')
sex_df.head(15)

Unnamed: 0,writer,male
0,1,0
1,2,0
2,3,0
3,4,0
4,5,1
5,6,1
6,7,1
7,8,1
8,9,0
9,10,1


In [94]:
male_counts(sex_df)

Number of times 'male' is 0: 143
Number of times 'male' is 1: 139
Number of times 'male' is something else: 0


In [95]:
selected_columns = ['writer', 'language', 'same_text']
train_df = train_df_complete[selected_columns]

# Convert the 'language' column to binary columns
language_dummies = pd.get_dummies(train_df['language'], drop_first=True)

# Add the binary column to the dataframe
train_df['isEng'] = language_dummies.iloc[:, 0].astype(int)

# Drop the original 'language' column
train_df.drop(columns=['language'], inplace=True)

# Display the updated dataframe
train_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['isEng'] = language_dummies.iloc[:, 0].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(columns=['language'], inplace=True)


Unnamed: 0,writer,same_text,isEng
0,1,0,0
1,1,1,0
2,1,0,1
3,1,1,1
4,2,0,0
5,2,1,0
6,2,0,1
7,2,1,1
8,3,0,0
9,3,1,0


In [96]:
check_if_both(train_df)
check_if_both(train_df, column_name='isEng')

All writers have both same_text=1 and same_text=0.
All writers have both isEng=1 and isEng=0.


In [97]:
# Set the probability of being 0
p_train = 0.9
N=282

# Create a dataframe with writer column from 1 to 282
writers_df = pd.DataFrame({'writer': np.arange(1, N+1)})

# Add a train column that is randomly 0 or 1 with probability p of being 0
writers_df['train'] = np.random.choice([0, 1], size=len(writers_df), p=[1-p_train, p_train])

# Merge with the train_df dataframe on the writer column
train_df = train_df.merge(writers_df, on='writer', how='left')

# Display the dataframe
train_df.head()

Unnamed: 0,writer,same_text,isEng,train
0,1,0,0,1
1,1,1,0,1
2,1,0,1,1
3,1,1,1,1
4,2,0,0,1


In [98]:
val,train = check_if_seed(train_df)
print(val)
print('val should be (previous run): \n [7, 11, 30, 33, 38, 43, 57, 59, 69, 73, 78, 84, 99, 101, 110, 129, 146, 149, 165, 169, 172, 191, 202, 206, 209, 223, 238, 245, 260])')

[7, 11, 30, 33, 38, 43, 57, 59, 69, 73, 78, 84, 99, 101, 110, 129, 146, 149, 165, 169, 172, 191, 202, 206, 209, 223, 238, 245, 260]
val should be (previous run): 
 [7, 11, 30, 33, 38, 43, 57, 59, 69, 73, 78, 84, 99, 101, 110, 129, 146, 149, 165, 169, 172, 191, 202, 206, 209, 223, 238, 245, 260])


In [99]:
check_randomization(train_df)

Number of rows where train == 1: 1012
Fraction of rows where train == 1: 0.90


In [100]:
check_grouping(train_df)

The train column is constant for all writers.


In [101]:
check_occurrences(train_df)

Each unique writer value occurs on exactly 4 rows.


In [102]:
folder_names = [folder for folder in os.listdir(image_PATH) if os.path.isdir(os.path.join(image_PATH, folder))]
# Extract the X part from the folder names
x_values = [int(folder.split('_')[0]) for folder in folder_names]

# Sort both lists based on the X values
sorted_indices = sorted(range(len(x_values)), key=lambda k: x_values[k])
folder_names = [folder_names[i] for i in sorted_indices]
x_values = [x_values[i] for i in sorted_indices]
print(folder_names)

['1_50', '51_100', '101_150', '151_200', '201_250', '251_300']


In [103]:
# Loop through each directory and collect image file paths for labeled images only
image_dirs = [os.path.join(image_PATH, folder) for folder in folder_names]
writers = []
isEng = []
same_text = []
file_names = []

for image_dir in image_dirs:
    for f in os.listdir(image_dir):
        if f.endswith('.jpg'):
            base_name = os.path.splitext(f)[0]  # Remove extension
            parts = base_name.split('_')

            if len(parts) != 2:
                continue  # Skip files that don't follow the expected pattern

            index, version = parts

            if int(version)>2:
                isEng.append(1)
            else:
                isEng.append(0)
            if int(version)%2==0:
                same_text.append(1)
            else:
                same_text.append(0)
            file_names.append(os.path.join(image_dir,f))
            writers.append(int(index))

# Create a dataframe from the extracted index and version values
train_file_df = pd.DataFrame({'writer': writers, 'isEng': isEng, 'same_text': same_text,'file_name':file_names})

# Display the dataframe
print(train_file_df['writer'].nunique())

'''print(train_file_df['writer'].min())
print(train_file_df['writer'].max())
# Check which writer values are missing in the interval 1-300
all_writers = set(range(1, 301))
present_writers = set(train_file_df['writer'].unique())
missing_writers = all_writers - present_writers

print(f"Missing writers in the interval 1-300: {sorted(missing_writers)}")'''

check_if_both(train_file_df,column_name='same_text')
check_if_both(train_file_df, column_name='isEng')

300
All writers have both same_text=1 and same_text=0.
All writers have both isEng=1 and isEng=0.


In [104]:
check_occurrences(train_df)

Each unique writer value occurs on exactly 4 rows.


In [105]:
# Merge the file_name column to the train_df dataframe based on the writer column
train_df = train_df.merge(train_file_df, on=['isEng','writer','same_text'], how='left')

# Display the updated dataframe
train_df.head(10)

Unnamed: 0,writer,same_text,isEng,train,file_name
0,1,0,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...
1,1,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...
2,1,0,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...
3,1,1,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...
4,2,0,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...
5,2,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...
6,2,0,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...
7,2,1,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...
8,3,0,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...
9,3,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...


In [106]:
check_title_association(train_df)

1046
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\251_300\0262_3.jpg
262 1 0
-------------
1018
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\251_300\0255_3.jpg
255 1 0
-------------
953
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\201_250\0239_2.jpg
239 0 1
-------------
266
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\51_100\0067_3.jpg
67 1 0
-------------
445
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\101_150\0112_2.jpg
112 0 1
-------------
47
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\1_50\0012_4.jpg
12 1 1
-------------
132
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\1_50\0034_1.jpg
34 0 0
-------------
928
D:\download\PD pro

In [107]:
train_df = train_df.merge(sex_df, on=['writer'], how='left')

# Display the updated dataframe
train_df.head(10)

Unnamed: 0,writer,same_text,isEng,train,file_name,male
0,1,0,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0
1,1,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0
2,1,0,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0
3,1,1,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0
4,2,0,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0
5,2,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0
6,2,0,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0
7,2,1,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0
8,3,0,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0
9,3,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0


In [108]:
train_df['index'] = train_df.index

# Display the updated dataframe
train_df.head(10)

Unnamed: 0,writer,same_text,isEng,train,file_name,male,index
0,1,0,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,0
1,1,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,1
2,1,0,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,2
3,1,1,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,3
4,2,0,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,4
5,2,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,5
6,2,0,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,6
7,2,1,1,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,7
8,3,0,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,8
9,3,1,0,1,D:\download\PD project\datasets\ICDAR 2013 - G...,0,9


In [109]:
check_sex_association(train_df,sex_df)

145
     writer  male
576     145     1
577     145     1
578     145     1
579     145     1
     writer  male
144     145     1
-------------
18
    writer  male
68      18     0
69      18     0
70      18     0
71      18     0
    writer  male
17      18     0
-------------
26
     writer  male
100      26     1
101      26     1
102      26     1
103      26     1
    writer  male
25      26     1
-------------
123
     writer  male
488     123     1
489     123     1
490     123     1
491     123     1
     writer  male
122     123     1
-------------
260
      writer  male
1036     260     1
1037     260     1
1038     260     1
1039     260     1
     writer  male
259     260     1
-------------
248
     writer  male
988     248     0
989     248     0
990     248     0
991     248     0
     writer  male
247     248     0
-------------
206
     writer  male
820     206     1
821     206     1
822     206     1
823     206     1
     writer  male
205     206     1
------------

In [110]:
check_title_association(train_df)

642
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\151_200\0161_3.jpg
161 1 0
-------------
369
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\51_100\0093_2.jpg
93 0 1
-------------
860
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\201_250\0216_1.jpg
216 0 0
-------------
418
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\101_150\0105_3.jpg
105 1 0
-------------
226
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\51_100\0057_3.jpg
57 1 0
-------------
29
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\1_50\0008_2.jpg
8 0 1
-------------
1055
D:\download\PD project\datasets\ICDAR 2013 - Gender Identification Competition Dataset\unzipped\251_300\0264_4.jpg
264 1 1
-------------
1097
D:\download\PD pr

In [111]:
male_counts(train_df)
check_if_both(train_df, column_name='same_text')
check_if_both(train_df, column_name='isEng') 
check_randomization(train_df)
check_grouping(train_df)
check_occurrences(train_df)


Number of times 'male' is 0: 572
Number of times 'male' is 1: 556
Number of times 'male' is something else: 0
All writers have both same_text=1 and same_text=0.
All writers have both isEng=1 and isEng=0.
Number of rows where train == 1: 1012
Fraction of rows where train == 1: 0.90
The train column is constant for all writers.
Each unique writer value occurs on exactly 4 rows.


In [112]:
import json

def get_base_metadata(filepath):
    stats = os.stat(filepath)
    return {
        "full_path": os.path.abspath(filepath),
        "size_bytes": stats.st_size,
        "created": datetime.fromtimestamp(stats.st_ctime).isoformat(),
        "modified": datetime.fromtimestamp(stats.st_mtime).isoformat(),
        "accessed": datetime.fromtimestamp(stats.st_atime).isoformat()
    }

def load_log(path):
    if os.path.exists(path):
        with open(path, 'r') as f:
            return json.load(f)
    return {}

def save_log(data, path):
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)

def add_or_update_file(filepath, log_path, custom_metadata=None):
    """
    Adds or updates a file's metadata entry, including custom metadata.
    """
    if not os.path.isfile(filepath):
        print(f"File not found: {filepath}")
        return
    
    filename = os.path.basename(filepath)
    log = load_log(log_path)

    base_meta = get_base_metadata(filepath)
    entry = log.get(filename, {})

    # Combine existing metadata, new base, and new custom metadata
    entry.update(base_meta)
    if custom_metadata:
        entry.update(custom_metadata)

    log[filename] = entry
    save_log(log, log_path)
    print(f"Updated log for {filename}")

def read_metadata(filepath, log_path):
    """
    Adds or updates a file's metadata entry, including custom metadata.
    """
    if not os.path.isfile(filepath):
        print(f"File not found: {filepath}")
        return
    
    filename = os.path.basename(filepath)
    log = load_log(log_path)

    entry = log.get(filename, None)
    if entry:
        print(f"Metadata for {filename}:")
        for key, value in entry.items():
            print(f"{key}: {value}")
    else:
        print(f"No metadata found for {filename}")


In [114]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = os.path.join(source_path, "outputs", "preprocessed_data")
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

output_file = os.path.join(output_dir, f"icdar_train_df_{timestamp}.csv")
train_df.to_csv(output_file, index=False)



print(f"Dataframe saved to {output_file}")

Dataframe saved to D:\burtm\Visual_studio_code\PD_related_projects\outputs\preprocessed_data\icdar_train_df_20250514_165047.csv


In [115]:
# Example usage:
LOG_FILE = output_dir+"\\file_metadata_log.json"
add_or_update_file(
    output_file, LOG_FILE,
    custom_metadata={
        "seed": seed,
        "description": '''dataframe with the following columns: writer, language, same_text, isEng, train, filename, index; 
        Each row is one of the original dataset image files. This is the first version in which I have checked every operation done on the dataframe.''' 
    }
)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
read_metadata(
    output_file,
    log_path=LOG_FILE
)