This notebook takes the icdar training data and generates a csv file with writer,same_text,isEng,train,file_name,male columns (file_name is the absolute path)

In [1]:
#imports
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import joblib
from datetime import datetime
import random

In [2]:
#test functions
def male_counts(sex_df):
    # Get the counts of each unique value in the "male" column
    male_counts = sex_df['male'].value_counts(dropna=False)

    # Print the counts
    print("Number of times 'male' is 0:", male_counts.get(0, 0))
    print("Number of times 'male' is 1:", male_counts.get(1, 0))
    print("Number of times 'male' is something else:", len(sex_df) - male_counts.get(0, 0) - male_counts.get(1, 0))
def check_if_both(train_df, column_name='same_text'):
    # Group by writer and check if both same_text=1 and same_text=0 are present
    writer_groups = train_df.groupby('writer')[column_name].nunique()

    # Filter writers that do not have both same_text=1 and same_text=0
    writers_missing_both = writer_groups[writer_groups != 2]

    if writers_missing_both.empty:
        print(f"All writers have both {column_name}=1 and {column_name}=0.")
    else:
        print(f"The following writers do not have {column_name}=1 and {column_name}=0")
        print(writers_missing_both)
def check_randomization(train_df):
    # Get the number of rows where train == 1
    train_1_count = train_df[train_df['train'] == 1].shape[0]

    # Calculate the fraction
    train_1_fraction = train_1_count / train_df.shape[0]

    print(f"Number of rows where train == 1: {train_1_count}")
    print(f"Fraction of rows where train == 1: {train_1_fraction:.2f}")
def check_grouping(train_df):
    # Group by writer and check if the train column has a constant value
    constant_train_check = train_df.groupby('writer')['train'].nunique()

    # Find writers where the train column is not constant
    non_constant_writers = constant_train_check[constant_train_check > 1]

    if non_constant_writers.empty:
        print("The train column is constant for all writers.")
    else:
        print("The train column is not constant for the following writers:")
        print(non_constant_writers)
def check_occurrences(train_df):
    # Count the occurrences of each unique writer value
    writer_counts = train_df['writer'].value_counts()

    # Check if all writers have exactly 4 occurrences
    if (writer_counts == 4).all():
        print("Each unique writer value occurs on exactly 4 rows.")
    else:
        print("Some writers do not occur exactly 4 times.")
        print(writer_counts[writer_counts != 4])
def check_title_association(train_df):
    random_numbers = random.sample(range(1, 282*4+1), 10)
    for n in random_numbers:
        print(n)
        print(train_df['file_name'][n])
        print(train_df['writer'][n],train_df['isEng'][n], train_df['same_text'][n])
        print('-------------')
def check_sex_association(train_df,sex_df):
    random_numbers = random.sample(range(1, 283), 10)
    for n in random_numbers:
        print(n)
        print(train_df[train_df['writer'] == n][['writer','male']])
        print(sex_df[sex_df['writer'] == n][['writer','male']])
        print('-------------')
def check_if_seed(train_df):
    train_0_writers = train_df[train_df['train'] == 0]['writer'].unique().tolist()
    train_1_writers = train_df[train_df['train'] == 1]['writer'].unique().tolist()
    return train_0_writers, train_1_writers


In [17]:
# Set the random seed for reproducibility
seed=42
np.random.seed(seed)

In [18]:
data_PATH="D:\\download\\PD project\\datasets\\ICDAR 2013 - Gender Identification Competition Dataset"
image_PATH=data_PATH+"\\unzipped"
source_path="D:\\burtm\\Visual_studio_code\\PD_related_projects"
train_df = pd.read_csv(os.path.join(data_PATH, "train\\train.csv"))

In [19]:
train_df.head(10)

Unnamed: 0,writer,page_id,language,same_text,tortuosityHist10[0],tortuosityHist10[1],tortuosityHist10[2],tortuosityHist10[3],tortuosityHist10[4],tortuosityHist10[5],...,directions_hist1a2a3a4a5a6a7a8a9a10_220[210],directions_hist1a2a3a4a5a6a7a8a9a10_220[211],directions_hist1a2a3a4a5a6a7a8a9a10_220[212],directions_hist1a2a3a4a5a6a7a8a9a10_220[213],directions_hist1a2a3a4a5a6a7a8a9a10_220[214],directions_hist1a2a3a4a5a6a7a8a9a10_220[215],directions_hist1a2a3a4a5a6a7a8a9a10_220[216],directions_hist1a2a3a4a5a6a7a8a9a10_220[217],directions_hist1a2a3a4a5a6a7a8a9a10_220[218],directions_hist1a2a3a4a5a6a7a8a9a10_220[219]
0,1,1,Arabic,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002457,0.002633,0.002698,0.002929,0.003014,0.003225,0.003572,0.003957,0.004232,0.004804
1,1,2,Arabic,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002498,0.002641,0.002836,0.002999,0.003246,0.003456,0.003709,0.003994,0.004308,0.004706
2,1,3,English,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.00209,0.00244,0.002831,0.003078,0.003438,0.003732,0.0041,0.004329,0.004687,0.004879
3,1,4,English,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002435,0.002823,0.003117,0.003441,0.003788,0.004056,0.004285,0.004527,0.004843,0.005085
4,2,1,Arabic,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002465,0.002724,0.003079,0.003305,0.003425,0.003403,0.00345,0.003626,0.003977,0.004547
5,2,2,Arabic,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002535,0.002808,0.002942,0.003151,0.003247,0.003246,0.003433,0.003516,0.003787,0.004377
6,2,3,English,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002887,0.003271,0.003569,0.003793,0.004049,0.004214,0.004339,0.004598,0.004874,0.005211
7,2,4,English,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002946,0.003295,0.003569,0.003846,0.004083,0.004329,0.004566,0.00485,0.005092,0.005397
8,3,1,Arabic,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.002245,0.002325,0.002383,0.002515,0.002725,0.003046,0.003304,0.003629,0.004037,0.004564
9,3,2,Arabic,1,0.92875,0.004071,0.001876,0.0,0.018031,0.025427,...,0.002179,0.002267,0.002385,0.00255,0.002723,0.002836,0.003049,0.003374,0.003757,0.004354


In [20]:
sex_df = pd.read_csv(os.path.join(data_PATH, "train_answers.csv"),delimiter=',')
sex_df.head(15)

Unnamed: 0,writer,male
0,1,0
1,2,0
2,3,0
3,4,0
4,5,1
5,6,1
6,7,1
7,8,1
8,9,0
9,10,1


In [21]:
# Convert the 'language' column to binary columns
language_dummies = pd.get_dummies(train_df['language'], drop_first=True)

# Add the binary column to the dataframe
train_df['isEng'] = language_dummies.iloc[:, 0].astype(int)

# Drop the original 'language' column
train_df.drop(columns=['language'], inplace=True)

# Display the updated dataframe
train_df.head(10)

Unnamed: 0,writer,page_id,same_text,tortuosityHist10[0],tortuosityHist10[1],tortuosityHist10[2],tortuosityHist10[3],tortuosityHist10[4],tortuosityHist10[5],tortuosityHist10[6],...,directions_hist1a2a3a4a5a6a7a8a9a10_220[211],directions_hist1a2a3a4a5a6a7a8a9a10_220[212],directions_hist1a2a3a4a5a6a7a8a9a10_220[213],directions_hist1a2a3a4a5a6a7a8a9a10_220[214],directions_hist1a2a3a4a5a6a7a8a9a10_220[215],directions_hist1a2a3a4a5a6a7a8a9a10_220[216],directions_hist1a2a3a4a5a6a7a8a9a10_220[217],directions_hist1a2a3a4a5a6a7a8a9a10_220[218],directions_hist1a2a3a4a5a6a7a8a9a10_220[219],isEng
0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002633,0.002698,0.002929,0.003014,0.003225,0.003572,0.003957,0.004232,0.004804,0
1,1,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002641,0.002836,0.002999,0.003246,0.003456,0.003709,0.003994,0.004308,0.004706,0
2,1,3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00244,0.002831,0.003078,0.003438,0.003732,0.0041,0.004329,0.004687,0.004879,1
3,1,4,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002823,0.003117,0.003441,0.003788,0.004056,0.004285,0.004527,0.004843,0.005085,1
4,2,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002724,0.003079,0.003305,0.003425,0.003403,0.00345,0.003626,0.003977,0.004547,0
5,2,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002808,0.002942,0.003151,0.003247,0.003246,0.003433,0.003516,0.003787,0.004377,0
6,2,3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003271,0.003569,0.003793,0.004049,0.004214,0.004339,0.004598,0.004874,0.005211,1
7,2,4,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003295,0.003569,0.003846,0.004083,0.004329,0.004566,0.00485,0.005092,0.005397,1
8,3,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002325,0.002383,0.002515,0.002725,0.003046,0.003304,0.003629,0.004037,0.004564,0
9,3,2,1,0.92875,0.004071,0.001876,0.0,0.018031,0.025427,0.003687,...,0.002267,0.002385,0.00255,0.002723,0.002836,0.003049,0.003374,0.003757,0.004354,0


In [22]:
# Set the probability of being 0
p_train = 0.9
N=282

# Create a dataframe with writer column from 1 to 282
writers_df = pd.DataFrame({'writer': np.arange(1, N+1)})

# Add a train column that is randomly 0 or 1 with probability p of being 0
writers_df['train'] = np.random.choice([0, 1], size=len(writers_df), p=[1-p_train, p_train])

# Merge with the train_df dataframe on the writer column
train_df = train_df.merge(writers_df, on='writer', how='left')

# Display the dataframe
train_df.head()

Unnamed: 0,writer,page_id,same_text,tortuosityHist10[0],tortuosityHist10[1],tortuosityHist10[2],tortuosityHist10[3],tortuosityHist10[4],tortuosityHist10[5],tortuosityHist10[6],...,directions_hist1a2a3a4a5a6a7a8a9a10_220[212],directions_hist1a2a3a4a5a6a7a8a9a10_220[213],directions_hist1a2a3a4a5a6a7a8a9a10_220[214],directions_hist1a2a3a4a5a6a7a8a9a10_220[215],directions_hist1a2a3a4a5a6a7a8a9a10_220[216],directions_hist1a2a3a4a5a6a7a8a9a10_220[217],directions_hist1a2a3a4a5a6a7a8a9a10_220[218],directions_hist1a2a3a4a5a6a7a8a9a10_220[219],isEng,train
0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002698,0.002929,0.003014,0.003225,0.003572,0.003957,0.004232,0.004804,0,1
1,1,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002836,0.002999,0.003246,0.003456,0.003709,0.003994,0.004308,0.004706,0,1
2,1,3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002831,0.003078,0.003438,0.003732,0.0041,0.004329,0.004687,0.004879,1,1
3,1,4,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003117,0.003441,0.003788,0.004056,0.004285,0.004527,0.004843,0.005085,1,1
4,2,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003079,0.003305,0.003425,0.003403,0.00345,0.003626,0.003977,0.004547,0,1


In [23]:
val,train = check_if_seed(train_df)
print(val)
print('val should be (previous run): \n [7, 11, 30, 33, 38, 43, 57, 59, 69, 73, 78, 84, 99, 101, 110, 129, 146, 149, 165, 169, 172, 191, 202, 206, 209, 223, 238, 245, 260])')

[7, 11, 30, 33, 38, 43, 57, 59, 69, 73, 78, 84, 99, 101, 110, 129, 146, 149, 165, 169, 172, 191, 202, 206, 209, 223, 238, 245, 260]
val should be (previous run): 
 [7, 11, 30, 33, 38, 43, 57, 59, 69, 73, 78, 84, 99, 101, 110, 129, 146, 149, 165, 169, 172, 191, 202, 206, 209, 223, 238, 245, 260])


In [24]:
train_df = train_df.merge(sex_df, on=['writer'], how='left')

# Display the updated dataframe
train_df.head(10)

Unnamed: 0,writer,page_id,same_text,tortuosityHist10[0],tortuosityHist10[1],tortuosityHist10[2],tortuosityHist10[3],tortuosityHist10[4],tortuosityHist10[5],tortuosityHist10[6],...,directions_hist1a2a3a4a5a6a7a8a9a10_220[213],directions_hist1a2a3a4a5a6a7a8a9a10_220[214],directions_hist1a2a3a4a5a6a7a8a9a10_220[215],directions_hist1a2a3a4a5a6a7a8a9a10_220[216],directions_hist1a2a3a4a5a6a7a8a9a10_220[217],directions_hist1a2a3a4a5a6a7a8a9a10_220[218],directions_hist1a2a3a4a5a6a7a8a9a10_220[219],isEng,train,male
0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002929,0.003014,0.003225,0.003572,0.003957,0.004232,0.004804,0,1,0
1,1,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002999,0.003246,0.003456,0.003709,0.003994,0.004308,0.004706,0,1,0
2,1,3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003078,0.003438,0.003732,0.0041,0.004329,0.004687,0.004879,1,1,0
3,1,4,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003441,0.003788,0.004056,0.004285,0.004527,0.004843,0.005085,1,1,0
4,2,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003305,0.003425,0.003403,0.00345,0.003626,0.003977,0.004547,0,1,0
5,2,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003151,0.003247,0.003246,0.003433,0.003516,0.003787,0.004377,0,1,0
6,2,3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003793,0.004049,0.004214,0.004339,0.004598,0.004874,0.005211,1,1,0
7,2,4,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003846,0.004083,0.004329,0.004566,0.00485,0.005092,0.005397,1,1,0
8,3,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002515,0.002725,0.003046,0.003304,0.003629,0.004037,0.004564,0,1,0
9,3,2,1,0.92875,0.004071,0.001876,0.0,0.018031,0.025427,0.003687,...,0.00255,0.002723,0.002836,0.003049,0.003374,0.003757,0.004354,0,1,0


In [25]:
train_df['index'] = train_df.index

# Display the updated dataframe
train_df.head(10)

Unnamed: 0,writer,page_id,same_text,tortuosityHist10[0],tortuosityHist10[1],tortuosityHist10[2],tortuosityHist10[3],tortuosityHist10[4],tortuosityHist10[5],tortuosityHist10[6],...,directions_hist1a2a3a4a5a6a7a8a9a10_220[214],directions_hist1a2a3a4a5a6a7a8a9a10_220[215],directions_hist1a2a3a4a5a6a7a8a9a10_220[216],directions_hist1a2a3a4a5a6a7a8a9a10_220[217],directions_hist1a2a3a4a5a6a7a8a9a10_220[218],directions_hist1a2a3a4a5a6a7a8a9a10_220[219],isEng,train,male,index
0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003014,0.003225,0.003572,0.003957,0.004232,0.004804,0,1,0,0
1,1,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003246,0.003456,0.003709,0.003994,0.004308,0.004706,0,1,0,1
2,1,3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003438,0.003732,0.0041,0.004329,0.004687,0.004879,1,1,0,2
3,1,4,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003788,0.004056,0.004285,0.004527,0.004843,0.005085,1,1,0,3
4,2,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003425,0.003403,0.00345,0.003626,0.003977,0.004547,0,1,0,4
5,2,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003247,0.003246,0.003433,0.003516,0.003787,0.004377,0,1,0,5
6,2,3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004049,0.004214,0.004339,0.004598,0.004874,0.005211,1,1,0,6
7,2,4,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004083,0.004329,0.004566,0.00485,0.005092,0.005397,1,1,0,7
8,3,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002725,0.003046,0.003304,0.003629,0.004037,0.004564,0,1,0,8
9,3,2,1,0.92875,0.004071,0.001876,0.0,0.018031,0.025427,0.003687,...,0.002723,0.002836,0.003049,0.003374,0.003757,0.004354,0,1,0,9


In [26]:
check_sex_association(train_df,sex_df)

255
      writer  male
1016     255     0
1017     255     0
1018     255     0
1019     255     0
     writer  male
254     255     0
-------------
137
     writer  male
544     137     1
545     137     1
546     137     1
547     137     1
     writer  male
136     137     1
-------------
91
     writer  male
360      91     1
361      91     1
362      91     1
363      91     1
    writer  male
90      91     1
-------------
253
      writer  male
1008     253     0
1009     253     0
1010     253     0
1011     253     0
     writer  male
252     253     0
-------------
154
     writer  male
612     154     1
613     154     1
614     154     1
615     154     1
     writer  male
153     154     1
-------------
187
     writer  male
744     187     1
745     187     1
746     187     1
747     187     1
     writer  male
186     187     1
-------------
206
     writer  male
820     206     1
821     206     1
822     206     1
823     206     1
     writer  male
205     206     1

In [27]:
male_counts(train_df)
check_if_both(train_df, column_name='same_text')
check_if_both(train_df, column_name='isEng') 
check_randomization(train_df)
check_grouping(train_df)
check_occurrences(train_df)


Number of times 'male' is 0: 572
Number of times 'male' is 1: 556
Number of times 'male' is something else: 0
All writers have both same_text=1 and same_text=0.
All writers have both isEng=1 and isEng=0.
Number of rows where train == 1: 1012
Fraction of rows where train == 1: 0.90
The train column is constant for all writers.
Each unique writer value occurs on exactly 4 rows.


In [28]:
import json

def get_base_metadata(filepath):
    stats = os.stat(filepath)
    return {
        "full_path": os.path.abspath(filepath),
        "size_bytes": stats.st_size,
        "created": datetime.fromtimestamp(stats.st_ctime).isoformat(),
        "modified": datetime.fromtimestamp(stats.st_mtime).isoformat(),
        "accessed": datetime.fromtimestamp(stats.st_atime).isoformat()
    }

def load_log(path):
    if os.path.exists(path):
        with open(path, 'r') as f:
            return json.load(f)
    return {}

def save_log(data, path):
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)

def add_or_update_file(filepath, log_path, custom_metadata=None):
    """
    Adds or updates a file's metadata entry, including custom metadata.
    """
    if not os.path.isfile(filepath):
        print(f"File not found: {filepath}")
        return
    
    filename = os.path.basename(filepath)
    log = load_log(log_path)

    base_meta = get_base_metadata(filepath)
    entry = log.get(filename, {})

    # Combine existing metadata, new base, and new custom metadata
    entry.update(base_meta)
    if custom_metadata:
        entry.update(custom_metadata)

    log[filename] = entry
    save_log(log, log_path)
    print(f"Updated log for {filename}")

def read_metadata(filepath, log_path):
    """
    Adds or updates a file's metadata entry, including custom metadata.
    """
    if not os.path.isfile(filepath):
        print(f"File not found: {filepath}")
        return
    
    filename = os.path.basename(filepath)
    log = load_log(log_path)

    entry = log.get(filename, None)
    if entry:
        print(f"Metadata for {filename}:")
        for key, value in entry.items():
            print(f"{key}: {value}")
    else:
        print(f"No metadata found for {filename}")


In [29]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = os.path.join(source_path, "outputs", "preprocessed_data")
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

output_file = os.path.join(output_dir, f"icdar_train_df_KAGGLE_{timestamp}.csv")
train_df.to_csv(output_file, index=False)



print(f"Dataframe saved to {output_file}")

Dataframe saved to D:\burtm\Visual_studio_code\PD_related_projects\outputs\preprocessed_data\icdar_train_df_KAGGLE_20250514_181737.csv


In [30]:
# Example usage:
LOG_FILE = output_dir+"\\file_metadata_log.json"
print(f"Log file path: {LOG_FILE}")
print(f"Output file path: {output_file}")
add_or_update_file(
    output_file, LOG_FILE,
    custom_metadata={
        "seed": seed,
        "description": '''dataframe with the following columns: writer, language, same_text, isEng, train, index and kaggle features; 
        This can be used to train FE models''' 
    }
)

Log file path: D:\burtm\Visual_studio_code\PD_related_projects\outputs\preprocessed_data\file_metadata_log.json
Output file path: D:\burtm\Visual_studio_code\PD_related_projects\outputs\preprocessed_data\icdar_train_df_KAGGLE_20250514_181737.csv
Updated log for icdar_train_df_KAGGLE_20250514_181737.csv


In [31]:
read_metadata(
    output_file,
    log_path=LOG_FILE
)

Metadata for icdar_train_df_KAGGLE_20250514_181737.csv:
full_path: D:\burtm\Visual_studio_code\PD_related_projects\outputs\preprocessed_data\icdar_train_df_KAGGLE_20250514_181737.csv
size_bytes: 53948612
created: 2025-05-14T18:17:37.961378
modified: 2025-05-14T18:17:45.372099
accessed: 2025-05-14T18:17:45.372099
seed: 42
description: dataframe with the following columns: writer, language, same_text, isEng, train, index and kaggle features; 
        This can be used to train FE models
