In [None]:
import duckdb
import gzip
import os


def peek_gz_with_duckdb(file_path, num_lines=5):
    """
    Peeks at the first few lines of a .gz file, first by reading it directly
    with ``gzip`` and then by querying it through DuckDB.

    Both previews are printed to stdout and nothing is returned. Failures are
    reported as printed messages instead of being raised.

    Args:
        file_path (str): The path to the .gz file.
        num_lines (int): The number of lines to peek at (default is 5).
    """
    try:
        # Coerce once: negative counts mean "show nothing", and a plain int is
        # safe to inline into the LIMIT clause below.
        num_lines = max(int(num_lines), 0)

        with gzip.open(file_path, "rt") as f:
            # zip() stops at whichever side runs out first, so a file shorter
            # than num_lines yields the lines it has instead of raising.
            peek_data = [line for _, line in zip(range(num_lines), f)]

        print("First few lines (manual read):")
        for line in peek_data:
            print(line.strip())
        if len(peek_data) < num_lines:
            print(f"Note: The file '{file_path}' has fewer than {num_lines} lines.")

        # Double any single quotes so the path cannot terminate the SQL string
        # literal early.
        escaped_path = str(file_path).replace("'", "''")
        query = f"""
            SELECT *
            FROM read_csv_auto('{escaped_path}', compression='gzip')
            LIMIT {num_lines};
        """

        # An in-memory DuckDB database cannot be opened read-only, so the
        # connection is opened in the default read-write mode.
        con = duckdb.connect(database=":memory:")
        try:
            result = con.execute(query).fetchall()
        finally:
            # Release the connection even when the query fails.
            con.close()

        print(f"\nFirst {num_lines} lines (DuckDB query):")
        for row in result:
            print(row)

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
    except gzip.BadGzipFile:
        print(f"Error: '{file_path}' does not appear to be a valid gzip file.")
    except Exception as e:
        # Best-effort helper: report anything else (e.g. DuckDB errors) and carry on.
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    # Example usage: preview the first three lines of the gzipped URL metadata CSV.
    # The path is relative to the notebook's working directory (same "../data"
    # layout the Parquet preview cell uses) so the cell does not depend on one
    # user's home directory.
    # NOTE(review): assumes the notebook is run from a sibling directory of data/ — confirm.
    peek_gz_with_duckdb(
        "../data/urls-meta-23-11-02.csv.gz",
        num_lines=3,
    )


In [None]:
import duckdb
import pandas as pd

def peak_parquet(file_path, num_rows=5):
    """
    Peeks at the contents of a Parquet file using DuckDB.

    The preview is printed to stdout and also returned. Any failure is
    reported as a printed message and results in ``None`` instead of an
    exception.

    Args:
        file_path (str): The path to the Parquet file.
        num_rows (int, optional): The number of rows to display. Defaults to 5.

    Returns:
        pandas.DataFrame: A Pandas DataFrame containing the first 'num_rows' of the file,
                          or None if the file is empty or cannot be read.
    """
    try:
        # A plain int is safe to inline into the LIMIT clause.
        row_limit = int(num_rows)
        # Double any single quotes so the path cannot terminate the SQL string
        # literal early.
        escaped_path = str(file_path).replace("'", "''")
        query = f"SELECT * FROM '{escaped_path}' LIMIT {row_limit}"

        con = duckdb.connect(database=':memory:', read_only=False)
        try:
            result = con.execute(query).fetchdf()
        finally:
            # Release the connection even when the query fails.
            con.close()

        if result.empty:
            print(f"The Parquet file '{file_path}' is empty.")
            return None

        print(f"Peeking at the first {num_rows} rows of '{file_path}':")
        print(result.to_string())
        return result

    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"Error peeking at '{file_path}': {e}")
        return None

if __name__ == '__main__':
    # Preview ../data/about_pages_3.parquet using the default row count.
    peak_parquet(file_path="../data/about_pages_3.parquet")

In [None]:
# URL path fragments for "about me"-style pages, grouped by language.
# Some entries appear in both an accented and an ASCII spelling.
_ABOUT_PATTERNS_BY_LANGUAGE = {
    "english": (
        "/about",
        "/about-me",
        "/bio",
        "/biography",
        "/me",
        "/who-am-i",
        "/about/me",
        "/hello",
        "/introduction",
        "/personal",
        "/profile",
        "/my-story",
        "/my-journey",
        "/about-the-author",
        "/meet-me",
    ),
    "spanish": (
        "/sobre-mi",
        "/quien-soy",
        "/acerca-de-mi",
        "/mi-biografia",
        "/mi-historia",
        "/biografia",
        "/presentacion",
        "/perfil",
        "/conoceme",
        "/hola",
    ),
    "german": (
        "/ueber-mich",
        "/über-mich",
        "/ich",
        "/meine-geschichte",
        "/biografie",
        "/das-bin-ich",
        "/steckbrief",
        "/vorstellung",
        "/hallo",
        "/personliches",
        "/persönliches",
    ),
    "french": (
        "/a-propos",
        "/qui-suis-je",
        "/biographie",
        "/ma-bio",
        "/mon-parcours",
        "/me-connaitre",
        "/me-connaître",
        "/presentation",
        "/présentation",
        "/bonjour",
        "/mon-histoire",
    ),
}

# Flat list in declaration order: English, Spanish, German, then French.
ABOUT_PATTERNS = [
    pattern
    for language_patterns in _ABOUT_PATTERNS_BY_LANGUAGE.values()
    for pattern in language_patterns
]

In [None]:


# Join every feed to the URL-metadata rows that share its host name.
#
# NOTE(review): `conn` is not created in this cell; it is assumed to be an open
# DuckDB connection on which `normalized_feeds` and `urls_meta` already exist — confirm.
#
# A single CREATE statement is issued (two statements creating the same table
# in one call can never both succeed), and OR REPLACE keeps the cell re-runnable.
# The SQL is a raw string so that `\.` and the `\1` backreference reach DuckDB
# unchanged; in a normal Python string `\1` is the control character \x01.
conn.execute(r"""
    CREATE OR REPLACE TABLE result AS
    SELECT
        nf.*,
        u.url AS matched_url,
        u.column2,
        u.column3
    FROM normalized_feeds nf
    LEFT JOIN urls_meta u
    ON (
        -- Compare host names only: drop an optional scheme and a leading "www."
        REGEXP_REPLACE(u.url, '^(?:https?://)?(?:www\.)?([^/]+).*', '\1') =
        REGEXP_REPLACE(nf.original_url, '^(?:https?://)?(?:www\.)?([^/]+).*', '\1')
    );
""")

# Write the results to a new parquet file
conn.execute("""
    COPY result TO 'merged_results.parquet' (FORMAT PARQUET);
""")

# Print some statistics about the join (the aggregate always yields exactly one row)
total_matches, unique_feeds, unique_matches = conn.execute("""
    SELECT 
        COUNT(*) as total_matches,
        COUNT(DISTINCT original_url) as unique_feeds,
        COUNT(DISTINCT matched_url) as unique_matches
    FROM result WHERE matched_url IS NOT NULL;
""").fetchone()
print(f"Total matches: {total_matches}")
print(f"Unique feeds with a match: {unique_feeds}")
print(f"Unique matched URLs: {unique_matches}")

# Close the connection
conn.close()