# Description:

This Jupyter notebook cleans and converts multivalued place fields into FAST format with faceting:
 - Uses an input CSV with hierarchical place name values split into multiple columns for Country, State, City, Region
 - Splits the pipe-delimited place strings into lists.
 - Combines the parsed values into FAST composite terms.
 - For rows with values under States, uses the format (State--City, State--Region) 
 - For rows without States, uses country-centric composites (Country--City, Country--Region)
 - For rows lacking both Country and State but with Regions, includes each Region alone.
 - Appends standalone facets in order: State (if any), then Country (if any).
 - Replaces 'St.' abbreviations with 'Saint' in terms.
 - Deduplicates while preserving order and outputs pipe-delimited FAST string.



In [None]:
# Cell 1: Imports and helper functions
import os
import pandas as pd

def parse_multivalued(val):
    """
    Turn one pipe-delimited cell value into a list of trimmed, non-empty strings.

    Missing values (as detected by ``pd.isna``) and blank strings yield an
    empty list. Any other value is converted with ``str()`` before splitting
    on '|'.
    """
    if pd.isna(val):
        return []
    # A blank or whitespace-only cell splits into empty pieces, which the
    # filter below discards, so no separate emptiness check is needed.
    pieces = (piece.strip() for piece in str(val).split('|'))
    return [piece for piece in pieces if piece]

def generate_fast_terms(places_dict):
    """
    Build the composite FAST terms for one record.

    ``places_dict`` maps the facet names 'Country', 'State', 'City' and
    'Region' to lists of values; a missing facet is treated as empty.

    States lead the composites whenever at least one is present; countries
    are used only when there are no states. Each leading value is paired,
    in order, with every city and then every region as "Lead--Child".
    When there are neither cities nor regions, the leading values are
    returned on their own. With no states and no countries the result is
    an empty list.
    """
    states = places_dict.get('State', [])
    # Countries are only a fallback for records that carry no state at all.
    parents = states if states else places_dict.get('Country', [])

    # Cities come before regions under each parent.
    children = [*places_dict.get('City', []), *places_dict.get('Region', [])]

    if not children:
        # Nothing to pair with: each parent stands alone.
        return list(parents)

    return [f"{parent}--{child}" for parent in parents for child in children]




In [None]:
# Cell 2: File paths
# Point data_dir at the folder that holds the input CSV; the output CSV
# is written to the same folder.
data_dir = '.'
input_csv, output_csv = (
    os.path.join(data_dir, filename)
    for filename in ('umedia.csv', 'places_fast.csv')
)



In [None]:
# Cell 3: Load the CSV
# Read the file at input_csv into df, the DataFrame the following cells
# scan for facet columns and iterate over row by row.
df = pd.read_csv(input_csv)




In [None]:
# Cell 4: Identify facet columns
# Map each facet name to every column in df whose name begins with that
# facet name, keeping the columns in their DataFrame order.
facet_cols = {
    facet: [name for name in df.columns if name.startswith(facet)]
    for facet in ('Country', 'State', 'City', 'Region')
}



In [None]:
# Cell 5: Parse, generate FAST, insert facets, and handle 'St.' replacement
#
# For every input row this builds one output record holding the row's ID and
# a pipe-delimited FAST string made of, in order:
#   1. the composite terms from generate_fast_terms,
#   2. each State on its own,
#   3. each Country on its own,
#   4. each Region on its own, only when the row has no Country and no State.
records = []
for _idx, row in df.iterrows():
    # Gather the values of all columns belonging to each facet, dropping
    # repeats while keeping first-seen order.
    places = {}
    for facet, cols in facet_cols.items():
        vals = []
        for col in cols:
            vals.extend(parse_multivalued(row.get(col, '')))
        places[facet] = list(dict.fromkeys(vals))

    states = places.get('State', [])
    countries = places.get('Country', [])

    # Composites first, then the standalone State and Country facets.
    final_terms = list(generate_fast_terms(places))
    final_terms.extend(states)
    final_terms.extend(countries)

    # Without a country or a state no composite can be formed, so the
    # regions are emitted by themselves.
    if not countries and not states:
        final_terms.extend(places.get('Region', []))

    # Expand 'St.' BEFORE deduplicating: terms that differ only in the
    # abbreviation (e.g. 'St. Paul' and 'Saint Paul') become identical after
    # the replacement and must collapse into a single entry.
    final_terms = [term.replace('St.', 'Saint') for term in final_terms]

    # Deduplicate while preserving order
    final_terms = list(dict.fromkeys(final_terms))

    records.append({
        'ID': row['ID'],
        'FAST': '|'.join(final_terms)
    })

# Naming the columns keeps the ID/FAST header even when the input has no rows.
fast_df = pd.DataFrame(records, columns=['ID', 'FAST'])
# fast_df.head()



In [None]:
# Cell 6: Save output CSV
# Write fast_df to output_csv; index=False leaves the DataFrame's row index
# out of the file. Then print a confirmation message with the output path.
fast_df.to_csv(output_csv, index=False)
print(f'FAST terms with facets written to {output_csv}')