In [None]:
import duckdb
import gzip
import os


def peek_gz_with_duckdb(file_path, num_lines=5):
    """
    Peek at the start of a gzip-compressed CSV, first manually and then via DuckDB.

    The raw text lines are printed first, followed by the rows DuckDB parses
    from the same file. Problems are reported with ``print`` rather than
    raised, since this is an interactive inspection helper.

    Args:
        file_path (str): The path to the .gz file.
        num_lines (int): The number of lines to peek at (default is 5).
            Negative values are treated as 0.

    Returns:
        None. All output goes to stdout.
    """
    # Function-scope import so the helper stays self-contained in its cell.
    from itertools import islice

    limit = max(int(num_lines), 0)

    try:
        # DuckDB's CSV reader assumes UTF-8, so decode the same way here;
        # errors="replace" keeps a peek from dying on a stray bad byte.
        with gzip.open(file_path, "rt", encoding="utf-8", errors="replace") as f:
            # islice stops quietly at EOF, so a short file still shows the
            # lines it does have instead of raising StopIteration.
            peek_data = list(islice(f, limit))

        print("First few lines (manual read):")
        for line in peek_data:
            print(line.strip())
        if len(peek_data) < limit:
            print(f"Note: The file '{file_path}' has fewer than {num_lines} lines.")

        # Use DuckDB to query the first few lines (demonstration).
        # An in-memory database cannot be opened read-only, so use the default
        # read-write mode.
        con = duckdb.connect(database=":memory:")
        try:
            # Double any single quotes so the path is a valid SQL string literal.
            sql_path = str(file_path).replace("'", "''")
            query = f"""
                SELECT *
                FROM read_csv_auto('{sql_path}', compression='gzip')
                LIMIT {limit};
            """
            result = con.execute(query).fetchall()
        finally:
            # Release the connection even when the query fails.
            con.close()

        print(f"\nFirst {num_lines} lines (DuckDB query):")
        for row in result:
            print(row)

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
    except gzip.BadGzipFile:
        print(f"Error: '{file_path}' does not appear to be a valid gzip file.")
    except Exception as e:
        # Best-effort helper: report anything else (e.g. DuckDB errors)
        # instead of propagating it to the caller.
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    # Demo run against an existing local gzip-compressed CSV. Nothing is
    # created or deleted here; point the path at a real file before running.
    sample_gz_path = "/Users/christophersteinberg/Documents/GitHub/downthestreet/data/urls-meta-23-11-02.csv.gz"
    peek_gz_with_duckdb(sample_gz_path, num_lines=3)


In [1]:
import duckdb
import pandas as pd

def peak_parquet(file_path, num_rows=200):
    """
    Peeks at the contents of a Parquet file using DuckDB.

    Prints the total row count of the file, then the first ``num_rows`` rows.
    Errors are reported with ``print`` rather than raised.

    Args:
        file_path (str): The path to the Parquet file.
        num_rows (int, optional): The number of rows to display. Defaults to 200.

    Returns:
        pandas.DataFrame: A Pandas DataFrame containing the first 'num_rows' of the file,
                          or None if the result is empty or the file cannot be read.
    """
    try:
        con = duckdb.connect(database=':memory:', read_only=False)
        try:
            # Double any single quotes so the path is a valid SQL string literal.
            quoted_path = "'" + str(file_path).replace("'", "''") + "'"
            result = con.execute(
                f"SELECT * FROM {quoted_path} LIMIT {int(num_rows)}"
            ).fetchdf()
            total_length = con.execute(
                f"SELECT COUNT(*) FROM {quoted_path}"
            ).fetchdf()
        finally:
            # Release the connection even when one of the queries fails.
            con.close()

        print(total_length)

        if not result.empty:
            print(f"Peeking at the first {num_rows} rows of '{file_path}':")
            print(result.to_string())
            return result
        else:
            print(f"The Parquet file '{file_path}' is empty.")
            return None

    except Exception as e:
        # Best-effort helper: report the failure and signal it with None.
        print(f"Error peeking at '{file_path}': {e}")
        return None

if __name__ == '__main__':
    # Demo run: preview the parquet file using the default row limit.
    about_pages_path = "../data/about_pages.parquet"
    peak_parquet(about_pages_path)

   count_star()
0         12259
Peeking at the first 200 rows of '../data/about_pages.parquet':
                                                                                                               url
0                                                                                 https://philosophy.gmu.edu/about
1                                                            https://www.wbur.org/podcasts/anythingforselena/about
2                                                                                      https://www.twice.com/about
3                                                                                       https://www.tedleo.com/bio
4                                                    https://stmarksutah.org/homilyarchive/2022/06/02/presentation
5                                                                         https://www.ozarkactorstheatre.org/about
6                                                                               https://www.linuxba