### Notebook 3.0: Network Extraction(`03_0_network_extraction.ipynb`)

- [x] extract parent-child relationships


In [1]:
# import
import sqlite3
import pandas as pd
import numpy as np
import regex as re
import networkx as nx
from tqdm import tqdm
import igraph as ig
import infomap
import time

In [2]:
# import database table
conn = sqlite3.connect('daylilies.db')
df = pd.read_sql_query("SELECT * FROM daylilies", conn)
conn.close()

# Display basic information about the dataframe
print("DataFrame Shape:", df.shape)
display(df.head())

DataFrame Shape: (101406, 21)


Unnamed: 0,url,name,hybridizer,year,scape_height,bloom_size,bloom_season,ploidy,foliage_type,fragrance,...,bud_count,branches,seedling_num,color_description,parentage,image_url,form,sculpting,notes,rebloom
0,https://daylilydatabase.org/detail.php?id=1959...,Fairest Gwendol,Kirchhoff-D.,2022,28.0,5.5,Early,Tetraploid,Evergreen,Very Fragrant,...,14.0,3.0,08-38Pi,"Clear, coral pink with deeply beveled midribs ...",(sdlg × (sdlg x Clark Gable)),https://daylilydatabase.org/AHSPhoto/F/Fairest...,Double 99%,Pleated,,1
1,https://daylilydatabase.org/detail.php?id=2014...,Margaret Gellatly Memorial,Gellatly,2024,28.0,7.0,Extra Early,Diploid,Dormant,Fragrant,...,20.0,4.0,TALxYPS01,"Yellow with red stippled, broken overlay, recu...",(Tequila and Lime × Striped Chameleon),https://daylilydatabase.org/AHSPhoto/M/Margare...,Unusual FormCrispate,Cristated,,1
2,https://daylilydatabase.org/detail.php?id=1982...,Parallax Effect,Turk-Harmon,2023,18.0,6.5,Midseason,Tetraploid,Semi-Evergreen,Fragrant,...,10.0,4.0,19-K1-iv,"Lilac purple with faded lilac sepals, yellow t...",(Aqua Tech × Neutron Star),https://daylilydatabase.org/AHSPhoto/P/Paralla...,Unusual FormCrispate,Pleated,,1
3,https://daylilydatabase.org/detail.php?id=1788...,L'Excentrique du Lac,Niquet,2015,38.0,8.0,Midseason,Tetraploid,Semi-Evergreen,Fragrant,...,12.0,2.0,11-25 A,"red self, twisted sepals, lime to green throat",(Rolling Raven × La Vie en Rose du Lac),https://daylilydatabase.org/AHSPhoto/L/lexcent...,Unusual FormCrispate,Pleated,,0
4,https://daylilydatabase.org/detail.php?id=1821...,Papa,Rocheleau-J.,2016,30.0,6.0,Midseason,Tetraploid,Semi-Evergreen,Fragrant,...,20.0,3.0,10201,"red burgundy with darker burgundy eye, white e...",(Thunder and Lightning × Grand Cru),https://daylilydatabase.org/AHSPhoto/P/papa.jpg,Unusual FormCrispate,Pleated,,0


### Extract Parent Child Relationships

Using pattern matching, extract known parent names from parentage field

In [None]:
def clean_variety_name(name):
    """
    Clean variety names by removing extra whitespace and quotes.
    
    Args:
        name: String variety name to clean, or NaN/None value
    
    Returns:
        str: Cleaned variety name with normalized whitespace and quotes removed,
             or empty string if input is None/NaN
    """
    if not name or pd.isna(name):
        return ""
    return re.sub(r'\s+', ' ', name).strip().strip("'")

def extract_variety_matches(parentage, pattern):
    """
    Extract variety matches with their positions from parentage string.
    
    Args:
        parentage: String containing parentage information to search
        pattern: Compiled regex pattern object to match against
    
    Returns:
        list: List of tuples (match_text, start_pos, end_pos) for each match found,
              or None if parentage is not a string
    """
    if isinstance(parentage, str):
        matches = pattern.finditer(parentage)
        return [(match.group(), match.start(), match.end()) for match in matches]
    return []

def process_parentage_relationships(df, chunk_size=3000):
    """
    Process daylily parentage relationships to extract parent-child variety connections.
    
    Args:
        df: pandas DataFrame with 'name' and 'parentage' columns
        chunk_size: Integer size for processing variety names in chunks (default 3000)
    
    Returns:
        pandas.DataFrame: DataFrame with 'source' (parent) and 'target' (child) columns
                         representing unique parent-child relationships found in parentage data
    """
    start_time = time.time()
    
    # set of cleaned variety names
    variety_set = set(df['name'].apply(clean_variety_name))
    print(f"Found {len(variety_set)} unique variety names")
    
    # chunks
    variety_chunks = [list(variety_set)[i:i + chunk_size] 
                     for i in range(0, len(variety_set), chunk_size)]
    print(f"Processing {len(variety_chunks)} chunks of size {chunk_size}")
    
    # regex patterns for each chunk
    patterns = [re.compile(r'\b(?:' + '|'.join(map(re.escape, chunk)) + r')\b', 
                         flags=re.IGNORECASE)
               for chunk in variety_chunks]
    
    # results DataFrame
    all_matches = pd.DataFrame(columns=['source_variety', 'matched_variety', 'start', 'end'])
    matches_found = 0
    
    # tqdm progress bar for each row
    print("\nExtracting parent-child relationships...")
    for _, row in tqdm(df.iterrows(), total=len(df), 
                      desc="Processing varieties", 
                      unit="varieties"):
        if pd.isna(row['parentage']):
            continue
            
        child_name = clean_variety_name(row['name'])
        current_matches = []
        
        # check each pattern chunk for matches
        for pattern in patterns:
            matches = extract_variety_matches(row['parentage'], pattern)
            for match in matches:
                if match[0] != child_name:  # Avoid self-references
                    current_matches.append({
                        'source_variety': child_name,
                        'matched_variety': match[0],
                        'start': match[1],
                        'end': match[2]
                    })
        
        # check for non-overlapping matches
        if current_matches:
            matches_df = pd.DataFrame(current_matches)
            matches_df['length'] = matches_df['end'] - matches_df['start']
            
            # Sort by length and position
            sorted_matches = matches_df.sort_values(
                by=['length', 'start'], 
                ascending=[False, True]
            )
            
            # only keep non-overlapping matches
            non_overlapping = []
            for _, match in sorted_matches.iterrows():
                if all(match['start'] >= m['end'] or match['end'] <= m['start'] 
                      for m in non_overlapping):
                    non_overlapping.append({
                        'start': match['start'],
                        'end': match['end']
                    })
                    # add to final results
                    all_matches = pd.concat([
                        all_matches,
                        pd.DataFrame([{
                            'source_variety': match['source_variety'],
                            'matched_variety': match['matched_variety'],
                            'start': match['start'],
                            'end': match['end']
                        }])
                    ], ignore_index=True)
                    matches_found += 1
    
    # make final parent-child relationships df
    edges = pd.DataFrame({
        'source': all_matches['matched_variety'],  # parent
        'target': all_matches['source_variety']    # child
    }).drop_duplicates()
    
    total_time = time.time() - start_time
    print(f"\nProcessing completed in {total_time:.2f} seconds")
    print(f"Total matches found: {matches_found}")
    print(f"Unique relationships: {len(edges)}")
    
    return edges

if __name__ == "__main__":
    # initialiazing function call 
    source_target_df = process_parentage_relationships(df)
    
    # network statistics
    print("\nNetwork Statistics:")
    print(f"Total relationships: {len(source_target_df)}")
    print(f"Unique parents: {source_target_df['source'].nunique()}")
    print(f"Unique children: {source_target_df['target'].nunique()}")


Found 101406 unique variety names
Processing 34 chunks of size 3000

Extracting parent-child relationships...


Processing varieties: 100%|██████████| 101406/101406 [13:59<00:00, 120.76varieties/s]



Processing completed in 844.79 seconds
Total matches found: 140234
Unique relationships: 138352

Network Statistics:
Total relationships: 138352
Unique parents: 25526
Unique children: 69592


In [4]:
# check output from extraction
print("shape:", source_target_df.shape)

display(source_target_df.head(50))

# top parents
top_parents = source_target_df['source'].value_counts().head(10)
print("Top Parents:")
for parent, count in top_parents.items():
    print(f"{parent}: {count} children")

shape: (138352, 2)


Unnamed: 0,source,target
0,Clark Gable,Fairest Gwendol
1,Striped Chameleon,Margaret Gellatly Memorial
2,Tequila and Lime,Margaret Gellatly Memorial
3,Neutron Star,Parallax Effect
4,Aqua Tech,Parallax Effect
5,La Vie en Rose du Lac,L'Excentrique du Lac
6,Rolling Raven,L'Excentrique du Lac
7,Thunder and Lightning,Papa
8,Grand Cru,Papa
9,Bill Norris,My Brother Eddie


Top Parents:
Ed Brown: 488 children
Ida's Magic: 454 children
Rose F. Kennedy: 420 children
Lavender Blue Baby: 373 children
J.T. Davis: 359 children
Wedding Band: 353 children
Strawberry Candy: 340 children
Barbara Mitchell: 322 children
Admiral's Braid: 322 children
Dance Ballerina Dance: 312 children


In [None]:
# Save source-target df
output_path = 'data/daylily_parent_child_network.csv'


# save to CSV with no index column added
source_target_df.to_csv(output_path, index=False)
print(f"Saved network relationships to: {output_path}")
print(f"Number of relationships saved: {len(source_target_df)}")

Saved network relationships to: data/daylily_parent_child_network.csv
Number of relationships saved: 138352


In [6]:

# connect to db
conn = sqlite3.connect('daylilies.db')

# save df to new table
table_name = 'parent_child_relationships'

try:
    # create & save table
    source_target_df.to_sql(table_name, conn, if_exists='replace', index=False)
    
    print(f"saved to table: {table_name}")

except Exception as e:
    print(f"Error saving to database: {e}")

finally:
    # close connection
    conn.close()

saved to table: parent_child_relationships
