# Text Extract Evaluation

Evaluate the text extraction process for the voting minutes PDFs.

For chunking, we want to extract the text from the PDFs and then chunk it into sections that can be evaluated for extraction into the license schema.

In [None]:
# Print the starting directory, then switch into the text-extract directory.
# After running this cell, we should be in the voting_minutes_txt directory.
# The cells below use paths relative to that directory ("./", "*.txt", "../...").
!echo $PWD
%cd ../voting_minutes_txt

In [None]:
# How many files are in the voting_minutes_txt directory?

# `ls -l` prints a leading "total ..." summary line, so subtract 1 from the line count.
# Note: this counts every entry that `ls -l` lists, not only the .txt files.
!expr $(ls -l | wc -l) - 1

In [None]:
# How many lines contain the string "Transactional Hearing"?
# These words indicate the beginning of the Transactional Hearing minutes.
# Note: `grep ... | wc -l` counts matching lines across all files, not matching files.

# (-i ignores case)
# (-H prints the filename)
!grep -iH "Transactional Hearing" *.txt | wc -l

# Since we have 139 files and 160 instances of the string "Transactional Hearing",
# this tells us that some files contain multiple instances of the string "Transactional Hearing"

In [None]:
# Which files do not contain the string "Transactional Hearing"?
# (-i ignores case; -L lists only the names of files with no matching line)
!grep -iL "Transactional Hearing" *.txt

# Only 2 files do not contain the string "Transactional Hearing"
# This is a good sign!

In [None]:
#!cat voting_minutes_2021-01-21.txt
#!cat voting_minutes_2021-04-20.txt

# The two non-conforming files were Emergency Licensed Premise Inspection Hearings
# and had no Transactional Hearing minutes


In [None]:
import os


def process_txt_files_in_dir(directory, function):
    """
    Apply ``function`` to every ``.txt`` entry directly inside ``directory``.

    Args:
        directory (str): directory to scan (not recursive).
        function (callable): called once per matching entry with its path,
            i.e. ``directory`` joined with the filename.

    Returns:
        dict: { filename: function(path), ... } keyed by the bare filename,
        in sorted filename order.
    """
    results = {}

    # Sort so the processing order does not depend on the filesystem listing order.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".txt"):
            # Join with the directory: passing the bare filename only works
            # when `directory` happens to be the current working directory.
            results[filename] = function(os.path.join(directory, filename))

    return results


def count_lines_in_file(file_path):
    """
    Tally how often each distinct non-blank line occurs in a UTF-8 text file.

    Lines are compared after stripping leading/trailing whitespace; lines
    that are empty after stripping are ignored.

    Args:
        file_path (str): path of the file to read.

    Returns:
        dict: { stripped_line: occurrences, ... } in first-seen order.
    """
    tally = {}

    with open(file_path, encoding="utf-8") as handle:
        for raw in handle:
            text = raw.strip()
            if text:  # blank / whitespace-only lines are not counted
                tally[text] = tally.get(text, 0) + 1

    return tally

In [None]:
import pandas as pd


def aggregate_line_counts(results, csv_path):
    """
    Build a dataframe with one row per unique line and the total occurrences
    across all files. Save to csv.

    Args:
        results (dict): { filename: { line: count, ... }, ... }
        csv_path (str): output CSV path

    Returns:
        pd.DataFrame: dataframe with columns ['unique_line','total_occurrences'],
        sorted by total_occurrences descending, then unique_line ascending.
        Empty ``results`` yields an empty dataframe with the same two columns.
    """
    # Accumulate totals across every file's per-line counts
    totals = {}
    for line_counts in results.values():
        for line, cnt in line_counts.items():
            totals[line] = totals.get(line, 0) + cnt

    # Name the columns explicitly so they exist even when there are no rows;
    # otherwise sorting an empty frame fails with KeyError.
    df = pd.DataFrame(
        list(totals.items()), columns=["unique_line", "total_occurrences"]
    )

    # Sort by total_occurrences desc, then unique_line asc
    df = df.sort_values(
        by=["total_occurrences", "unique_line"], ascending=[False, True]
    )

    # Save CSV
    df.to_csv(csv_path, index=False, encoding="utf-8")

    return df

In [None]:
# Example usage
# Count lines in every .txt file of the current directory ...
results = process_txt_files_in_dir("./", count_lines_in_file)
# ... total them across files, and write the totals to a CSV one level above the current directory.
df = aggregate_line_counts(results, csv_path="../aggregated_line_counts.csv")
# Preview the first rows (highest total_occurrences first).
print(df.head())

In [None]:
# Print every distinct line of one meeting's minutes together with its count.
results = process_txt_files_in_dir("./", count_lines_in_file)
print(len(results))

for key in results:
    if not key.startswith("voting_minutes_2024-06-06.txt"):
        continue

    print(key)
    line_counts = results[key]
    # Most frequent lines first; ties are broken alphabetically.
    sorted_lines = sorted(line_counts.items(), key=lambda item: (-item[1], item[0]))
    for line, count in sorted_lines:
        print(f"{line}: {count}")

In [None]:
# Per-line tally for a single file, printed as "<count>x: <line>".
file_path = "voting_minutes_2025-05-01.txt"
line_counts = count_lines_in_file(file_path)

# Two stable sorts: alphabetical by line first, then by count descending.
# Stability keeps the alphabetical order among lines with equal counts.
alphabetical = sorted(line_counts.items())
sorted_lines = sorted(alphabetical, key=lambda item: item[1], reverse=True)

for line, count in sorted_lines:
    print(f"{count}x: {line}")

In [None]:
def read_txt_files_in_dir(directory, function):
    """
    Apply ``function`` to every ``.txt`` entry directly inside ``directory``.

    Args:
        directory (str): directory to scan (not recursive).
        function (callable): called once per matching entry with its path,
            i.e. ``directory`` joined with the filename.

    Returns:
        dict: { filename: function(path), ... } keyed by the bare filename,
        in sorted filename order.
    """
    results = {}

    # Sort so the processing order does not depend on the filesystem listing order.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".txt"):
            # Join with the directory: passing the bare filename only works
            # when `directory` happens to be the current working directory.
            results[filename] = function(os.path.join(directory, filename))

    return results

In [None]:
# Example usage
directory_path = "."  # replace with your directory
# read_txt_files_in_dir requires the per-file callable as its second argument;
# calling it with only the directory raises TypeError.
line_data = read_txt_files_in_dir(directory_path, count_lines_in_file)
print(len(line_data))