# Shopping List Generator
* Read files scraped from YouTube
* Generate lists of items
* Deduplicate items
* Output a CSV with unique items and their descriptions

In [1]:
from pathlib import Path
from collections import defaultdict
import csv
import re

In [2]:

def get_files(data_dir: Path = Path('../data')) -> list[Path]:
    """
    Collect every regular file beneath a directory, in sorted order.

    Files whose name ends in ``~`` (editor backup files) are skipped.
    The search is recursive, so files in sub-directories are included.

    The result is sorted because directory traversal order depends on the
    filesystem; a stable order keeps everything derived from this list
    (which description is kept first for a URL, output row order)
    reproducible between runs.

    Args:
        data_dir: Directory to scan. Defaults to ``../data`` relative to the
            current working directory. A string path is accepted as well.

    Returns:
        Sorted list of paths to the files found.
    """
    root = Path(data_dir)
    return sorted(
        entry
        for entry in root.rglob('*')
        if entry.is_file() and not entry.name.endswith('~')
    )

def read_file(file: Path) -> str:
    """
    Return the entire contents of *file* as text, decoded as UTF-8.
    """
    with open(file, encoding='utf-8') as handle:
        return handle.read()


def parse_amazon_links_with_section(text: str) -> list[dict[str, str]]:
    """
    Extract Amazon links with descriptions.

    Every line is scanned for ``<description>: <url>`` pairs whose URL points
    at ``amzn.to`` or an ``amazon.*`` host.

    A line consisting only of ``<name>:`` opens a section. While a section is
    open, a description that starts with ``*`` is rewritten to
    ``"<section> for <item>"``. The first link whose description neither
    starts with ``*`` nor equals the section name closes the section.

    Sentence punctuation that trails a URL (``.``, ``,``, ``;`` ...) is
    removed before the URL is recorded, so a link written at the end of a
    sentence is still valid and deduplicates against the same link written
    without punctuation.

    Args:
        text: Raw text to scan.

    Returns:
        One ``{'description': ..., 'url': ...}`` dict per distinct URL, in
        order of first appearance.
    """
    # Section headings like "Luggage tag holders:"
    section_pattern = re.compile(r'^(?P<section>.+?):\s*$', flags=re.MULTILINE)

    # Description + amazon link
    link_pattern = re.compile(
        r'(?P<desc>[^:\n]+?):\s*(?P<url>https?://(?:www\.)?(?:amzn\.to|amazon\.[a-z.]+)[^\s)\]]+)',
        flags=re.IGNORECASE
    )

    # Leading bullets, byte-order marks and whitespace in a description
    bullet_pattern = re.compile(r'^[\*\-\u2022\u25CF\ufeff\s]+')

    # The URL character class accepts any non-space character, so prose
    # punctuation directly after a link ends up inside the match.
    trailing_punctuation = '.,;:!?\'"'

    results = []
    seen = set()
    current_section = None

    for line in text.splitlines():
        # Detect section header (e.g., "Luggage tag holders:")
        sec_match = section_pattern.match(line.strip())
        if sec_match:
            current_section = sec_match.group('section').strip()
            continue

        # Look for link matches in the line
        for m in link_pattern.finditer(line):
            desc = m.group('desc').strip()
            url = m.group('url').rstrip(trailing_punctuation)

            # Section handling runs before deduplication on purpose: a
            # repeated link still closes the section it interrupts.
            if current_section and desc.lower() != current_section.lower():
                if desc.startswith('*'):
                    desc = f"{current_section} for {desc.lstrip('*').strip()}"
                else:
                    current_section = None

            # Remove bullets/extra symbols and collapse runs of whitespace
            desc = bullet_pattern.sub('', desc)
            desc = re.sub(r'\s+', ' ', desc).strip()

            # Deduplicate on the cleaned URL
            if url in seen:
                continue
            seen.add(url)

            results.append({'description': desc, 'url': url})

    return results


def merge_items(items_collection: list[list[dict[str, str]]]) -> list[dict[str, str]]:
    """
    Merge several item lists into one list with a single entry per URL.

    Descriptions are grouped by URL. The first description seen for a URL is
    kept; when the URL occurred more than once across the input, the number
    of occurrences is appended as `` (N)``.

    Args:
        items_collection: Lists of ``{'description': ..., 'url': ...}`` dicts.

    Returns:
        One ``{'url': ..., 'description': ...}`` dict per distinct URL, in
        order of first appearance.
    """
    url2descs = defaultdict(list)
    for items in items_collection:
        for item in items:
            url2descs[item['url']].append(item['description'])

    return [
        {
            'url': url,
            'description': descs[0] + ('' if len(descs) == 1 else f' ({len(descs)})'),
        }
        for url, descs in url2descs.items()
    ]

def print_items(items, cutoff=None):
    """
    Print items as a numbered list of ``description -> url`` lines.

    Numbering starts at 1. When *cutoff* is truthy, printing stops once that
    many items have been printed; ``None`` or ``0`` prints everything.
    """
    for position, entry in enumerate(items, start=1):
        print(f"{position:2d}. {entry['description']} -> {entry['url']}")
        if cutoff and position >= cutoff:
            return

def get_items_collection():
    """
    Parse every data file and return one list of link items per file.
    """
    collection = []
    for path in get_files():
        contents = read_file(path)
        collection.append(parse_amazon_links_with_section(contents))
    return collection


def get_all_items():
    """
    Return the items from all data files merged into one entry per URL.
    """
    return merge_items(get_items_collection())


def write_items(output_file='../data/csv/items.csv'):
    """
    Write the merged items to a CSV file.

    The file has a header row and two columns, ``description`` and ``url``.
    An existing file at the destination is overwritten.

    Args:
        output_file: Destination path. Defaults to ``../data/csv/items.csv``
            relative to the current working directory. Missing parent
            directories are created.
    """
    items = get_all_items()
    destination = Path(output_file)
    # Opening a file for writing does not create its directory; without this
    # the first run on a fresh checkout fails with FileNotFoundError.
    destination.parent.mkdir(parents=True, exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with destination.open(mode='w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['description', 'url'])
        writer.writeheader()
        writer.writerows(items)

In [3]:
# Parse all data files and merge the extracted links into one entry per URL.
items = get_all_items()

In [4]:
# Preview the first four merged items.
print_items(items, cutoff=4)

 1. LEVEL 8 Carry on luggage -> https://amzn.to/47dDzhd
 2. Hanging toiletry bag (2) -> https://amzn.to/3Nb8fFY
 3. Packing cubes (2) -> https://amzn.to/3wF1syG
 4. Reef safe sunscreen -> https://amzn.to/3QstR4C


In [5]:
# Rebuild the merged item list and write it out as CSV.
write_items()