In [61]:
import glob
import os

import pandas as pd
# Do not truncate long cell contents (e.g. file paths) when frames are displayed
pd.set_option('display.max_colwidth', None)

### Read raw data

In [64]:
# Load the raw table; the path is relative to the notebook's working directory.
raw_csv_path = 'data.csv'
df = pd.read_csv(raw_csv_path)

In [66]:
# Prepend the image directory to the file names.
# Rows that already carry the prefix are left alone, so re-running this cell
# does not prepend the directory a second time.
prefix_to_add = 'images_clipped_buffered/'
already_prefixed = df['file_name'].str.startswith(prefix_to_add, na=False)
df['file_name'] = df['file_name'].where(already_prefixed, prefix_to_add + df['file_name'])

# Flag every row whose file name occurs more than once (per the original note,
# images associated with more than one bounding box). keep=False marks all
# occurrences, not only the repeats.
is_duplicated = df.duplicated(subset=['file_name'], keep=False)

# Rows whose image appears several times, plus an independent copy of them
duplicate_rows = df[is_duplicated]
duplicate_df = duplicate_rows.copy()

# Rows whose image appears exactly once (original row labels are preserved)
df_filtered = df[~is_duplicated].copy()

# Report the sizes of the two frames
print("Original DataFrame:")
print(len(df_filtered))
print("\nDuplicate DataFrame:")
print(len(duplicate_df))

In [68]:
# Get the augmented images and path info
def find_augmented_images(directory_prefix):
    """List the .jpg files stored two directory levels below ``directory_prefix``.

    ``glob`` performs no tilde expansion, so a leading ``~`` is expanded to the
    user's home directory first; left unexpanded it would be treated as a
    literal directory named ``~``. The result is sorted so that the order of
    the files (and of anything derived from them) is reproducible.

    Parameters
    ----------
    directory_prefix : str
        Directory path ending in a path separator; may start with ``~``.

    Returns
    -------
    list of str
        Matching file paths in sorted order (empty if nothing matches).
    """
    # Without recursive=True each `**` behaves like `*`, so this matches
    # exactly two sub-directory levels below the prefix.
    return sorted(glob.glob(f"{os.path.expanduser(directory_prefix)}**/**/*.jpg"))


# For each label class: where the augmented images live on disk (`extras_*_prefix`),
# the relative directory written into the `file_name` column (`*_prefix`),
# and the list of augmented image paths found (`extras_*`).
extras_completeness_prefix = '~/data/images_clipped_buffered_augmented_completeness/'
completeness_prefix = 'images_clipped_buffered_augmented_completeness/'
extras_completeness = find_augmented_images(extras_completeness_prefix)

extras_condition_prefix = '~/data/images_clipped_buffered_augmented_condition/'
condition_prefix = 'images_clipped_buffered_augmented_condition/'
extras_condition = find_augmented_images(extras_condition_prefix)

extras_material_prefix = '~/data/images_clipped_buffered_augmented_material/'
material_prefix = 'images_clipped_buffered_augmented_material/'
extras_material = find_augmented_images(extras_material_prefix)

extras_use_prefix = '~/data/images_clipped_buffered_augmented_use/'
use_prefix = 'images_clipped_buffered_augmented_use/'
extras_use = find_augmented_images(extras_use_prefix)

extras_security_prefix = '~/data/images_clipped_buffered_augmented_security/'
security_prefix = 'images_clipped_buffered_augmented_security/'
extras_security = find_augmented_images(extras_security_prefix)


#### Add the augmented images
Add a new row to the dataframe for every augmented image that exists, copying all fields from the row of its original, un-augmented counterpart.

In [None]:
# Repeat this for each class
# For every augmented image, copy the row of its un-augmented original in
# `df_filtered` and point the copy's `file_name` at the augmented file.
# NOTE(review): the rows are appended to `df_filtered`, while the weighting and
# saving cells below operate on `df` — confirm which frame is intended.

# Number of trailing characters stripped from an augmented file name before
# '.jpg' is re-appended — presumably a tag such as "_0.jpg"; verify against the
# naming scheme used by the augmentation step.
AUG_SUFFIX_LEN = 6

# Row label of the first occurrence of each file name: one pass over the frame
# instead of a full boolean scan per augmented image.
first_index_by_name = {}
for row_label, name in df_filtered['file_name'].items():
    first_index_by_name.setdefault(name, row_label)

# File names already present, so re-running this cell does not append the same
# augmented rows a second time.
existing_names = set(first_index_by_name)

source_labels = []    # row labels of the originals to copy
augmented_names = []  # file_name value for each copied row
for e in extras_security:
    # Keep only the last three path components of the augmented file
    eb = '/'.join(e.split('/')[-3:])
    original_name = f"{prefix_to_add}{eb[:-AUG_SUFFIX_LEN] + '.jpg'}"
    augmented_name = f"{security_prefix}{eb}"

    if original_name not in first_index_by_name:
        print("No matching rows found for", original_name)
    elif augmented_name not in existing_names:
        source_labels.append(first_index_by_name[original_name])
        augmented_names.append(augmented_name)
        existing_names.add(augmented_name)

# Append all new rows in a single concat: growing the frame row by row is
# quadratic and sends every row through an object-dtype Series.
if source_labels:
    new_rows = df_filtered.loc[source_labels].copy()
    new_rows['file_name'] = augmented_names
    df_filtered = pd.concat([df_filtered, new_rows], ignore_index=True)

print(f"Added {len(source_labels)} augmented rows")

#### Weight the rows by label

In [70]:
# Create new column, setting all to 1 to start
# NOTE(review): this adds the column to `df`, whereas the augmented rows are
# appended to `df_filtered` in the cell above — confirm which frame is meant
# to carry the weights.
df["weights"] = 1

Selectively up-weight the classes that need to be sampled more often.

In [None]:
# Label value whose rows should be sampled more often
target_status = 'secured'

# Weight assigned to those rows. 1 leaves them at the default set earlier;
# raise it (2, 3, ...) depending on how sparse the class is.
target_weight = 1

# Boolean mask of the rows whose 'security' column equals the target value
is_target = df['security'] == target_status
filtered_rows = df[is_target]

# Set 'weights' for exactly the masked rows. Using the mask rather than the
# index labels of `filtered_rows` stays correct even if row labels repeat.
df.loc[is_target, 'weights'] = target_weight

# Show the distinct weights now present as a quick sanity check
print(df['weights'].unique())

In [73]:
# Write the weighted table to CSV in the working directory.
# NOTE(review): this saves `df`; as far as this notebook shows, the augmented
# rows were appended to `df_filtered` only, so they would not be in this file —
# confirm which frame should be written.
# NOTE(review): with the default arguments the row index is written as an
# unnamed first column; pass index=False if that column is not wanted.
df.to_csv('data_security_augmented.csv')