# Shopping List Generator
* Read files scraped from Youtube
* Generate lists of items
* Dedup items
* Output csv with unique items and their descriptions

In [1]:
from pathlib import Path
import re

In [2]:
def read_file(file: Path) -> str:
    with open(file, 'r', encoding='utf-8') as f:
        contents = f.read()
        return contents


def parse_amazon_links_with_section(text: str):
    """
    Extract Amazon links with descriptions.
    If sub-items are under a section, prepend the section name.
    """
    # Regex to detect section headings like "Luggage tag holders:"
    section_pattern = re.compile(r'^(?P<section>.+?):\s*$', flags=re.MULTILINE)

    # Regex to find description + amazon link
    link_pattern = re.compile(
        r'(?P<desc>[^:\n]+?):\s*(?P<url>https?://(?:www\.)?(?:amzn\.to|amazon\.[a-z.]+)[^\s)\]]+)',
        flags=re.IGNORECASE
    )

    results = []
    seen = set()
    current_section = None

    lines = text.splitlines()
    for line in lines:
        # Detect section header (e.g., "Luggage tag holders:")
        sec_match = section_pattern.match(line.strip())
        if sec_match:
            current_section = sec_match.group('section').strip()
            continue

        # Look for link matches in the line
        for m in link_pattern.finditer(line):
            desc = m.group('desc').strip()
            url = m.group('url').strip()

            # If inside a section and desc isn't the section itself, prepend
            if current_section and desc.lower() != current_section.lower():
                if desc.startswith('*'):
                    desc = f"{current_section} for {desc.lstrip('*').strip()}"
                else:
                    current_section = None

            # Remove bullets/extra symbols
            desc = re.sub(r'^[\*\-\u2022\u25CF\ufeff\s]+', '', desc)
            desc = re.sub(r'\s+', ' ', desc).strip()

            # Deduplicate
            if url in seen:
                continue
            seen.add(url)

            results.append({'description': desc, 'url': url})

    return results


In [3]:
data_dir = Path('../data')
files = [f for f in data_dir.rglob('*') if f.is_file() and not f.name.endswith("~")]
print(files)

[PosixPath('../data/youtube/bJqGx_JKpVY'), PosixPath('../data/youtube/ZSh3VHkll1M'), PosixPath('../data/youtube/IoCUhUSKj1g')]


In [4]:
text = read_file(data_dir / files[1])

In [5]:
items = parse_amazon_links_with_section(text)

In [6]:
for i, it in enumerate(items, 1):
    print(f"{i:2d}. {it['description']} -> {it['url']}")

 1. Waterproof phone case on lanyard -> https://amzn.to/3iXCuT6
 2. Lanyards -> https://amzn.to/448kKdU
 3. Hat clip -> https://amzn.to/3r78DPz
 4. Sun hat -> https://amzn.to/3rb522W
 5. Iwalk phone charger -> https://amzn.to/3XxltTq
 6. Portable power bank -> https://amzn.to/3pAjhhl
 7. Travel adapter -> https://amzn.to/3pp1kCtMini
 8. white noise machine -> https://amzn.to/44dg2eP
 9. No surge power bar -> https://amzn.to/3NCRu7d
10. Travel containers -> https://amzn.to/3PFlsuF
11. Insulated water bottle -> https://amzn.to/3PGFTrg
12. Collapsible water bottle -> https://amzn.to/3NziMft
13. Money belt -> https://amzn.to/3JBVOmP
14. Compression socks -> https://amzn.to/46K8fHh
15. Pressure relief ear plugs -> https://amzn.to/448l0cS
16. Pill box organizer -> https://amzn.to/3JCTFXW
17. Magnet hooks -> https://amzn.to/3wZgj77
18. Foldable hangers -> https://amzn.to/3JIfKUW
19. Carry on travel bag -> https://amzn.to/3NwzJHv
20. Nautical carry on bag -> https://amzn.to/2TH7L3T
21. Luggage