# Site Scaffold Generator

This script generates site and data directories from templates, replacing placeholder tags
with values from a CSV file.

Required Setup:
-------------
1. Create a 'sites.csv' file in the same directory as this notebook with the following columns:
   - YEAR: The year of the event (e.g., 2025)
   - NAME: URL-friendly name of the event (e.g., ucla, and NOT "SICSS-ucla")
   - LOCATION: Physical location of the event (e.g., Berkeley California)
   - START_DATE: Event start date in format "Month Day" (e.g., "August 1")
   - END_DATE: Event end date in format "Month Day" (e.g., "August 15")
   - HOST: hosting institution / department (e.g., The UCLA Social Media Research Lab)

The order of the columns does not matter, as long as the first row of the file is a header naming each column. Populate the file with the data from the form sent to organizers, or enter the data manually. Either way, this should still save you a bit of time!

Note: if you are creating the first site(s) for a given year, you will have to go to the _config.yml file in the base directory of the repo, and update the "current_year" parameter so that the locations appear in the locations page and have a tab.

Note 2: if a site has adjusted their naming from year to year, but you still want it to appear as a continuation of that site here, you may need to adjust the name you have in your csv file to match the old one, or go back and rename folders / change the md and yml where necessary if you notice later.

Example CSV row:
2025,ucla,"Berkeley, California",August 1,August 15,The UCLA Social Media Research Lab

The script will:
- Create year folders if they don't exist
- Copy templates from .20XX_template/[[NAME]]/ to /YEAR/NAME/
- Copy templates from /_data/.template/[[NAME]]/ to /_data/YEAR/NAME/
- Replace all [[TAG]] patterns in .md and .yml files with corresponding CSV values
- Attempt to find and apply a header / location browse image from previous years of the same location

Usage:
    Run the cells below, in order.

In [41]:
import pandas as pd
from pathlib import Path
import shutil
import re

In [55]:
def clean_path_name(name):
    """Return a lowercase, ASCII-only slug of *name* for directories and links.

    The value is converted with ``str()`` and lowercased, a fixed set of
    accented Latin letters is mapped to its unaccented counterpart, and every
    remaining character outside ``a-z`` / ``0-9`` becomes a single ``-``
    (runs of such characters are NOT collapsed).

    Args:
        name: Site name; any object accepted by ``str()``.

    Returns:
        The slug as a ``str``.
    """
    # Accented letters grouped under the plain letter that replaces them.
    accents_by_letter = {
        'a': 'äáàâã',
        'c': 'ç',
        'e': 'éèêë',
        'i': 'íìîï',
        'n': 'ñ',
        'o': 'öóòôõ',
        'u': 'üúùû',
        'y': 'ýÿ',
    }
    translation = {
        ord(accented): plain
        for plain, group in accents_by_letter.items()
        for accented in group
    }

    # Lowercase first so that uppercase accented letters hit the table too.
    slug = str(name).lower().translate(translation)
    return re.sub(r'[^a-z0-9]', '-', slug)

def find_most_recent_image(root_dir, current_year, location_name, earliest_year=2017):
    """Find the most recent header image used by a location in previous years.

    Walks backwards from ``current_year - 1`` down to ``earliest_year`` and,
    for each year, looks for an ``image:`` line first in
    ``_data/<year>/<location_name>/location.yml`` and then in
    ``<year>/<location_name>/index.md``. Only the first ``image:`` line of a
    file is considered; it is accepted unless it refers to the ``tbd.jpg``
    placeholder.

    Args:
        root_dir: Repository root as a ``pathlib.Path``.
        current_year: Year being scaffolded; any value ``int()`` accepts.
        location_name: Directory name of the site (already path-cleaned).
        earliest_year: Oldest year to search, inclusive. Defaults to 2017,
            the last year the search has always reached.

    Returns:
        The image value as written in the older file (stripped), or
        ``'/assets/images/tbd.jpg'`` when no earlier image is found.

    Raises:
        ValueError: If ``current_year`` cannot be converted to ``int``.
    """
    # ``[ \t]*`` and ``\S`` instead of ``\s*(.+)``: ``\s`` also matches
    # newlines, so an empty ``image:`` line used to capture whatever stood on
    # the NEXT line and return it as the image path.
    image_line = re.compile(r'^image:[ \t]*(\S.*)$', re.MULTILINE)

    for year in range(int(current_year) - 1, earliest_year - 1, -1):
        # Data file takes precedence over the site page within the same year.
        candidates = (
            root_dir / '_data' / str(year) / location_name / 'location.yml',
            root_dir / str(year) / location_name / 'index.md',
        )
        for candidate in candidates:
            if not candidate.exists():
                continue
            with open(candidate, 'r', encoding='utf-8') as f:
                found = image_line.search(f.read())
            if found and 'tbd.jpg' not in found.group(1):
                return found.group(1).strip()

    # Nothing usable in any previous year: fall back to the placeholder.
    return '/assets/images/tbd.jpg'

def _set_yaml_field(content, field, value):
    """Rewrite every line starting with ``field:`` in *content* to ``field: value``."""
    # ``[ \t]*`` instead of ``\s*``: ``\s`` matches newlines, so an empty
    # field used to swallow the following line (possibly the closing ``---``).
    pattern = rf'^{re.escape(field)}:[ \t]*.*$'
    # A callable replacement is inserted verbatim; a plain string would have
    # its backslashes interpreted as regex escapes / group references.
    return re.sub(pattern, lambda _match: f'{field}: {value}', content,
                  flags=re.MULTILINE)


def _fill_tag(content, tag, value):
    """Substitute every ``[[tag]]`` placeholder (case-insensitive) with *value*."""
    pattern = r'\[\[' + re.escape(str(tag)) + r'\]\]'
    return re.sub(pattern, lambda _match: str(value), content,
                  flags=re.IGNORECASE)


def replace_tags_in_file(file_path, replacements):
    """Fill the ``[[TAG]]`` placeholders of one scaffold file, in place.

    The file is read, rewritten in memory and written back as UTF-8.

    * ``.yml`` files: ``location.yml`` gets its ``image:`` line set to the
      most recent image of the same location. ``hero.yml`` gets
      ``location: SICSS-<NAME>`` and all tags except ``NAME`` filled in.
      Every other YAML file (``location.yml`` included) gets
      ``link: <path name>``, ``title: <NAME>``, ``[[NAME]]`` replaced by the
      path name and every other tag replaced by its value.
    * other files (markdown): ``index.md`` gets its ``image:`` line updated.
      Inside the leading ``---`` front matter ``title:`` is set to ``NAME``,
      ``partner_site:`` to the path name and ``[[NAME]]`` to the path name.
      Afterwards every tag in the whole file is replaced by its value.

    Args:
        file_path: ``pathlib.Path`` of the file to rewrite.
        replacements: Mapping of tag name to value. Must contain ``'NAME'``,
            and ``'YEAR'`` when the file is ``location.yml`` or ``index.md``.

    Raises:
        KeyError: If a required key is missing from ``replacements``.
        OSError: If the file cannot be read or written.
    """
    # The repository root is the parent of the notebook's working directory.
    root_dir = Path.cwd().parent

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    name = replacements['NAME']
    # Path-friendly name used for links and directories.
    path_name = clean_path_name(name)

    # Only these two files carry an ``image:`` line this function updates, so
    # the scan through previous years is skipped for everything else.
    previous_image = None
    if file_path.name in ('location.yml', 'index.md'):
        previous_image = find_most_recent_image(
            root_dir, replacements['YEAR'], path_name)

    if file_path.suffix == '.yml':
        if file_path.name == 'location.yml':
            content = _set_yaml_field(content, 'image', previous_image)

        if file_path.name == 'hero.yml':
            # The hero shows the original (un-slugged) site name.
            content = _set_yaml_field(content, 'location', f'SICSS-{name}')
            for tag, value in replacements.items():
                if tag != 'NAME':  # NAME was handled through the location line
                    content = _fill_tag(content, tag, value)
        else:
            content = _set_yaml_field(content, 'link', path_name)
            for tag, value in replacements.items():
                if tag == 'NAME':
                    # Titles keep the original name; every other occurrence
                    # of the tag gets the path-friendly form.
                    content = _set_yaml_field(content, 'title', value)
                    content = _fill_tag(content, tag, path_name)
                else:
                    content = _fill_tag(content, tag, value)
    else:
        if file_path.name == 'index.md':
            content = _set_yaml_field(content, 'image', previous_image)

        def fill_front_matter(match):
            front_matter = match.group(0)
            front_matter = _set_yaml_field(front_matter, 'title', name)
            front_matter = _set_yaml_field(front_matter, 'partner_site', path_name)
            # Any NAME tag left in the front matter is a path, not a title.
            return _fill_tag(front_matter, 'NAME', path_name)

        # Without re.MULTILINE ``^`` anchors at the start of the file, so only
        # a leading front-matter block is rewritten.
        content = re.sub(r'^---\n.*?---', fill_front_matter, content,
                         flags=re.DOTALL)

        # The body uses the original values, NAME included.
        for tag, value in replacements.items():
            content = _fill_tag(content, tag, value)

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

def generate_scaffolds(csv_path):
    """Create the site and data scaffolds for every row of a CSV file.

    For each row the year folders ``<root>/<YEAR>`` and ``<root>/_data/<YEAR>``
    are created if needed, the year-level ``index.md`` is created from the
    template when it does not exist yet, and the ``[[NAME]]`` template
    directories are copied to ``<root>/<YEAR>/<name>`` and
    ``<root>/_data/<YEAR>/<name>``. Placeholders in the copied ``.md`` and
    ``.yml`` files are then filled in. ``<root>`` is the parent of the current
    working directory.

    Rows whose site or data directory already exists are skipped with a
    warning, as are rows with an empty ``YEAR`` or ``NAME``. Existing files
    are never overwritten.

    Args:
        csv_path: Path of the CSV file; its first row must be a header that
            contains at least ``YEAR`` and ``NAME``.

    Raises:
        KeyError: If the ``YEAR`` or ``NAME`` column is missing.
        FileNotFoundError: If the CSV file or a template is missing.
    """
    # Read everything as text: type inference would turn a YEAR column with a
    # gap into floats ("2025.0") and render empty cells as the string "nan".
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False).fillna('')

    missing = [col for col in ('YEAR', 'NAME') if col not in df.columns]
    if missing:
        raise KeyError(
            f"{csv_path} is missing required column(s): {', '.join(missing)}")

    root_dir = Path.cwd().parent
    template_dir = root_dir / '.20XX_template'
    data_template_dir = root_dir / '_data' / '.template'

    for record in df.to_dict(orient='records'):
        # One replacement per CSV column, keyed by the column (tag) name.
        replacements = {col: str(value).strip() for col, value in record.items()}
        year = replacements['YEAR']
        name = replacements['NAME']

        # An empty YEAR would resolve to the repository root itself.
        if not year or not name:
            print(f"Warning: Skipping row with empty YEAR or NAME: {record}")
            continue

        path_name = clean_path_name(name)

        year_dir = root_dir / year
        year_dir.mkdir(exist_ok=True)
        (root_dir / '_data' / year).mkdir(parents=True, exist_ok=True)

        # Only a year without an index page gets one; re-running the notebook
        # must not replace a page that already exists (and may have been edited).
        index_target = year_dir / 'index.md'
        if not index_target.exists():
            shutil.copy2(template_dir / 'index.md', index_target)
            replace_tags_in_file(index_target, replacements)

        target_dir = year_dir / path_name
        data_target_dir = root_dir / '_data' / year / path_name

        if target_dir.exists() or data_target_dir.exists():
            print(f"Warning: Skipping {year}/{path_name} - directory already exists")
            continue

        # Site scaffold first, then its data counterpart.
        shutil.copytree(template_dir / '[[NAME]]', target_dir)
        shutil.copytree(data_template_dir / '[[NAME]]', data_target_dir)

        for scaffold_dir in (target_dir, data_target_dir):
            for file_path in scaffold_dir.rglob('*'):
                if file_path.is_file() and file_path.suffix in ('.md', '.yml'):
                    replace_tags_in_file(file_path, replacements)

In [56]:
# Build the scaffolds for every row of sites.csv (relative path, so the file
# is looked up in the current working directory).
generate_scaffolds('sites.csv')



Test CSV file generation

In [5]:
# This cell can be used to generate testing data for this script.
# It provides one value per site for every column the setup notes list as
# required (YEAR, NAME, LOCATION, START_DATE, END_DATE, HOST).

data = {
    'YEAR': [2029, 2029, 2029, 2029, 2029],
    'NAME': ['winter-festival', 'spring-market', 'summer-fair', 'autumn-expo', 'holiday-bazaar'],
    'LOCATION': ['Central Park', 'Downtown Plaza', 'Riverside Park', 'Convention Center', 'Main Street'],
    'START_DATE': ['December 1', 'March 15', 'June 20', 'September 5', 'December 10'],
    'END_DATE': ['December 15', 'March 30', 'June 25', 'September 10', 'December 20'],
    'HOST': ['Parks Department', 'Downtown Business Association', 'Riverside Community Council',
             'Regional Trade Board', 'Main Street Merchants Guild'],
}

df_test = pd.DataFrame(data)
# df_test.to_csv('sites.csv', index=False)  # uncomment this if you want it to save 