In [None]:
import pandas as pd

# Read both sheets. The workbook is opened once through pd.ExcelFile so the
# file is not parsed a second time, and the handle is closed when the
# `with` block exits.
with pd.ExcelFile('photo_repeat_group.xlsx') as workbook:
    df_photos = pd.read_excel(workbook, sheet_name='photo_repeat_group')
    df_tree_numbers = pd.read_excel(workbook, sheet_name='tree number to urls match')

# Look at the first few rows of both dataframes to understand the structure
print("Tree numbers sheet first few rows:")
print(df_tree_numbers[['submission_id', 'tree_number']].head(10))

# "\n" puts a blank line between the two previews. (A bare backslash followed
# by a line break inside the literal is a line continuation and prints
# nothing.)
print("\nPhoto repeat group sheet first few rows:")
print(df_photos[['parent_index', 'tree_url']].head(10))

In [None]:
# Distinct parent_index values from the photo sheet (Series.unique keeps
# order of first appearance).
unique_indices = df_photos['parent_index'].unique()
print("\nNumber of unique parent indices:", len(unique_indices))

# Create a mapping dictionary based on the sequence of appearance in the
# tree_numbers sheet: the n-th new (submission_id, tree_number) key is paired
# with the n-th distinct parent_index.
# NOTE(review): this is purely positional, so it presumably relies on both
# sheets listing trees in the same order -- confirm against the data.
mapping = {}
current_parent_index_idx = 0

for _, row in df_tree_numbers.iterrows():
    if current_parent_index_idx >= len(unique_indices):
        # Every parent index has been handed out; any later keys stay
        # unmapped, so there is nothing left to do in this loop.
        break
    submission_tree_key = (row['submission_id'], row['tree_number'])
    if submission_tree_key not in mapping:
        mapping[submission_tree_key] = unique_indices[current_parent_index_idx]
        current_parent_index_idx += 1

# Function to get URLs for a specific submission_id and tree_number
def get_urls_for_tree(submission_id, tree_number, key_to_parent=None, photos=None):
    """Return the comma-joined photo URLs for one tree, or None if it has none.

    Args:
        submission_id: Submission identifier, first half of the lookup key.
        tree_number: Tree number, second half of the lookup key.
        key_to_parent: Optional dict mapping ``(submission_id, tree_number)``
            to a ``parent_index`` value. Defaults to the module-level
            ``mapping``.
        photos: Optional DataFrame with ``parent_index`` and ``tree_url``
            columns. Defaults to the module-level ``df_photos``.

    Returns:
        A single string with the matching ``tree_url`` values joined by
        commas, or ``None`` when the key is unmapped or no usable URL exists.
    """
    if key_to_parent is None:
        key_to_parent = mapping
    if photos is None:
        photos = df_photos

    parent_idx = key_to_parent.get((submission_id, tree_number))
    if parent_idx is None:
        return None

    matching = photos.loc[photos['parent_index'] == parent_idx, 'tree_url']
    # Blank spreadsheet cells load as NaN, and str.join raises TypeError on
    # anything that is not a string: drop the gaps and coerce the rest.
    urls = matching.dropna().astype(str).tolist()
    return ','.join(urls) if urls else None

# Create the tree_urls column: one comma-joined URL string (or None) per row,
# looked up by that row's (submission_id, tree_number) pair.
df_tree_numbers['tree_urls'] = df_tree_numbers.apply(
    lambda row: get_urls_for_tree(row['submission_id'], row['tree_number']),
    axis=1
)

# Save the updated dataframe (index=False keeps the row index out of the sheet)
output_filename = 'tree_numbers_with_sequential_mapping.xlsx'
df_tree_numbers.to_excel(output_filename, index=False)

# Print the first few mappings to verify. "\n" separates this section from
# earlier output. (A bare backslash followed by a line break inside the
# literal is a line continuation and prints nothing.)
print("\nFirst 10 mappings:")
for (submission_id, tree_number), parent_idx in list(mapping.items())[:10]:
    print(f"Submission {submission_id}, Tree {tree_number} -> Parent index {parent_idx}")

print("\nSample of updated dataframe:")
print(df_tree_numbers.head())