# Module 2: Transcript Processing
- Step 1: Transcribe meeting audio
- Step 2: Repunctuate raw transcript and split into sentences
- Step 3: MANUAL sentence classification
- Step 4: Segment transcription based on manual sentence classification
- Step 5: MANUALLY match transcript segments to agenda segments of same meeting's agenda
- Step 6: Combine agenda, legislation, and transcript segments to form combined segments

Specify the week of meetings to process.

In [None]:
#### EDIT THIS
# First and last day of the week to process, written as "YYYYMMDD" strings.
# Both placeholders must be replaced with real dates: they are parsed with the
# "%Y%m%d" format and every later step keeps only meetings whose date lies
# between the two (both ends inclusive).
MONDAY_DATE = "YYYYMMDD"
FRIDAY_DATE = "YYYYMMDD"
#### EDIT THIS

In [None]:
from datetime import datetime

# Keep the raw date strings together and parse each end of the week once, so
# the later steps can compare meeting dates against START_DATE and END_DATE.
WEEK = (MONDAY_DATE, FRIDAY_DATE)
START_DATE = datetime.strptime(MONDAY_DATE, "%Y%m%d")
END_DATE = datetime.strptime(FRIDAY_DATE, "%Y%m%d")

## Step 1: Transcribe meeting audio

In [None]:
from pathlib import Path
import whisper
import regex as regex
from datetime import datetime

The folder `___input/audio/` should contain a `.wav` audio file for each city council meeting. Each `.wav` file should be named such that the first 8 characters are the meeting date in YYYYMMDD format.

In [None]:
# Folder holding the meeting recordings (`.wav` files); the path is relative
# to the working directory the notebook runs in.
INPUT_AUDIO_PATH = Path("../___input/audio/")
# Fail early, before any model is loaded, if the input folder is missing.
assert INPUT_AUDIO_PATH.exists()

The `.txt` files containing the raw transcripts will be saved to `_interim/transcript/`.

In [None]:
# Output folder for the transcript `.txt` files written by the transcription loop.
TRANSCRIPT_PATH = Path("../_interim/transcript/")
# Create the folder (and any missing parents); do nothing if it already exists.
TRANSCRIPT_PATH.mkdir(parents=True, exist_ok=True)

Transcribe the audio with OpenAI Whisper

In [None]:
# PunctuationModel is otherwise only imported in the Step 2 import cell, so it
# is imported here as well; without this, running the notebook from top to
# bottom fails with a NameError on the punct_model line.
from deepmultilingualpunctuation import PunctuationModel

# Whisper "large" model, used below to transcribe the meeting audio.
asr_model = whisper.load_model("large")
# Punctuation restoration model, used in Step 2 to repunctuate the transcripts.
punct_model = PunctuationModel()

In [None]:
# transcribe every recording whose file name dates it inside the requested week
for wav_path in INPUT_AUDIO_PATH.glob("*.wav"):

    # the first 8 characters of the file name are expected to be the meeting date
    meeting_name = wav_path.stem
    try:
        meeting_date = datetime.strptime(meeting_name[:8], "%Y%m%d")
    except ValueError:
        # skip, file name does not start with a valid date
        continue
    if not (START_DATE <= meeting_date <= END_DATE):
        # skip, meeting did not take place in specified week
        continue

    # run speech recognition on the recording
    transcription = asr_model.transcribe(str(wav_path), language="en")

    # save the raw transcript text under the same stem as the audio file
    output_path = TRANSCRIPT_PATH / f"{meeting_name}.txt"
    output_path.write_text(transcription["text"], encoding="utf-8")

## Step 2: Repunctuate raw transcript and split into sentences

In [None]:
import csv
import re
import pandas as pd
from pathlib import Path
import regex as regex
from deepmultilingualpunctuation import PunctuationModel
from datetime import datetime

The folder `_interim/transcript/` should now contain a `.txt` file with the transcribed audio of each meeting. The sentences of each transcript are saved to the folder `_interim/transcript_sentences/`, and the cleaned transcript text is saved back to the original `.txt` file.

In [None]:
# The transcript folder created in Step 1 must exist before its files can be cleaned.
assert TRANSCRIPT_PATH.exists()
# Output folder for the per-meeting sentence `.csv` files.
TRANSCRIPT_SENTENCES_PATH = Path("../_interim/transcript_sentences/")
# Create the folder (and any missing parents); do nothing if it already exists.
TRANSCRIPT_SENTENCES_PATH.mkdir(parents=True, exist_ok=True)

Remove punctuation, clean raw text, repunctuate, and then split into sentences.

In [None]:
def _clean_raw_transcript(raw):
    """Return the raw transcript text with punctuation breaks, stray symbols and fillers removed."""
    # drop bracketed spans
    raw = re.sub(r"\[.*?\]", "", raw)
    # remove punctuation-based segment endings (only where followed by a space)
    raw = raw.replace(". ", " ").replace("? ", " ").replace("! ", " ").replace(", ", " ")
    # keep only Latin letters, digits, punctuation and whitespace
    raw = regex.sub(r'[^\p{Latin}\d\p{P}\s]', '', raw)
    # remove the filler words "uh"/"um" together with a directly following comma or period
    raw = re.sub(r'\b(?:uh|um)+\b[,.]?\s*', '', raw, flags=re.IGNORECASE)
    # normalize whitespace
    raw = re.sub(r'\s+', ' ', raw)
    return raw.strip()


def _split_sentences(text):
    """Split text after every '.', '!' or '?' and keep a trailing part that has no terminator."""
    # the second alternative catches text after the last terminator, which the
    # first alternative alone would silently drop
    parts = re.findall(r'[^.!?]*[.!?]|[^.!?]+$', text)
    return [part.strip() for part in parts if part.strip()]


# sorted() takes a snapshot of the folder first, because the loop rewrites the
# same .txt files it iterates over
for raw_transcript_file in sorted(TRANSCRIPT_PATH.glob("*.txt")):

    # check if meeting took place in specified week
    date_str = raw_transcript_file.stem[:8]
    try:
        file_date = datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        # skip, file name does not start with a valid date
        continue
    if file_date < START_DATE or file_date > END_DATE:
        # skip, meeting did not take place in specified week
        continue

    # clean and remove punctuation
    with open(raw_transcript_file, "r", encoding="utf-8") as f:
        raw = f.read()
    clean = _clean_raw_transcript(raw)

    # repunctuate text; skip the model call when nothing is left after cleaning
    punctuated_text = punct_model.restore_punctuation(clean) if clean else ""

    # split into sentences
    sentences = _split_sentences(punctuated_text)

    # save sentences to .csv file, each with the "x" placeholder classification
    # NOTE: this overwrites an existing sentence file, so labels saved there by
    # the manual classification step are reset to "x" when this cell is re-run
    stem = raw_transcript_file.stem
    csv_output_path = TRANSCRIPT_SENTENCES_PATH / f"{stem}.csv"
    with open(csv_output_path, "w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["sentence", "classification"])
        for sentence in sentences:
            writer.writerow([sentence, "x"])

    # join the sentences into the cleaned transcript and save it over the raw one
    # (built from the in-memory list; re-reading the CSV just written is unnecessary)
    clean_transcript_file_path = TRANSCRIPT_PATH / f"{stem}.txt"
    with open(clean_transcript_file_path, "w", encoding="utf-8") as f:
        f.write(" ".join(sentences))


## Step 3: MANUAL transcript segmentation via sentence classification

In [None]:
import pandas as pd
from pathlib import Path

Now, the transcript sentences should be saved to the folder `_interim/transcript_sentences/`. 

In [None]:
# The sentence folder created in Step 2 must exist before labeling can start.
assert TRANSCRIPT_SENTENCES_PATH.exists()

In [None]:
# Interactive labeling of every sentence file of the requested week.
# Files are visited in sorted order so that an interrupted session continues
# with the same meeting on the next run.
quit_requested = False
for transcript_sentences_file in sorted(TRANSCRIPT_SENTENCES_PATH.glob("*.csv")):

    # check if meeting took place in specified week
    date_str = transcript_sentences_file.stem[:8]
    try:
        file_date = datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        # skip, file name does not start with a valid date
        continue
    if file_date < START_DATE or file_date > END_DATE:
        # skip, meeting did not take place in specified week
        continue

    df = pd.read_csv(transcript_sentences_file)

    # manual classification instructions
    print(f"\n==== {transcript_sentences_file.name} ====")
    print("Label each sentence as:")
    print("  b - beginning of new topic")
    print("  p - beginning of new public comment topic")
    print("  i - inside a topic")
    print("Type 'back' to go to the previous sentence.")
    print("Type 'quit' to stop early and save your progress.\n")

    # resume at the first sentence that still has the "x" placeholder (or no
    # label at all), so progress saved by an earlier 'quit' is not lost
    unlabeled = df["classification"].isna() | (df["classification"] == "x")
    idx = int(unlabeled.idxmax()) if unlabeled.any() else len(df)
    if idx == len(df):
        print("All sentences in this file are already labeled.")

    # manual classification loop
    while idx < len(df):
        sentence = df.loc[idx, "sentence"]
        current_label = df.loc[idx, "classification"]

        print(f"\n{idx + 1}/{len(df)}: {sentence}")
        # only show a label that was actually assigned, not the placeholder
        if pd.notna(current_label) and current_label != "x":
            print(f"current label: {current_label}")

        user_input = input("Label (b/p/i, or 'back', 'quit'): ").strip().lower()

        if user_input in {"b", "p", "i"}:
            df.at[idx, "classification"] = user_input
            idx += 1
        elif user_input == "back":
            if idx > 0:
                idx -= 1
            else:
                print("!!!! ALREADY AT BEGINNING")
        elif user_input == "quit":
            quit_requested = True
            break
        else:
            print("!!!! INVALID, PLEASE ENTER 'b', 'p', 'i', 'back', or 'quit'")

    # save classifications
    df.to_csv(transcript_sentences_file, index=False)

    if quit_requested:
        # 'quit' ends the whole session instead of jumping to the next meeting
        break
        

## Step 4: Segment transcript using manual classification of sentences

In [None]:
import pandas as pd
import csv
from pathlib import Path

The folder `_interim/transcript_sentences/` should have a `.csv` file for each meeting, containing a column for the sentences and another column for the manual classification. The segments will be saved to the folder `_interim/transcript_segments/`.

In [None]:
# The labeled sentence files are the input of this step.
assert TRANSCRIPT_SENTENCES_PATH.exists()
# Output folder for the per-meeting transcript segment `.csv` files.
TRANSCRIPT_SEGMENTS_PATH = Path("../_interim/transcript_segments/")
# Create the folder (and any missing parents); do nothing if it already exists.
TRANSCRIPT_SEGMENTS_PATH.mkdir(parents=True, exist_ok=True)

Combine adjacent sentences in the transcript into segments. The `b` and `p` labels denote beginning of new segments. The `i` label denotes a continuation of a segment. Do not include segments that start with `p` sentences in the final segment file. 

In [None]:
# Build the transcript segments of every meeting of the requested week from
# the manually labeled sentences.
for transcript_sentences_file in TRANSCRIPT_SENTENCES_PATH.glob("*.csv"):

    # check if meeting took place in specified week
    date_str = transcript_sentences_file.stem[:8]
    try:
        file_date = datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        # skip, file name does not start with a valid date
        continue
    if file_date < START_DATE or file_date > END_DATE:
        # skip, meeting did not take place in specified week
        continue

    # combine adjacent sentences into segments based on each sentence's classification
    df = pd.read_csv(transcript_sentences_file)
    # remove irrelevant rows (label 'f')
    df = df[df["classification"] != "f"].reset_index(drop=True)

    # skip any initial content before the first 'b' or 'p'
    segment_starts = df.index[df["classification"].isin(["b", "p"])]
    if segment_starts.empty:
        # without any 'b'/'p' label there is nothing to segment; previously
        # this case crashed when slicing with a NaN start index
        print(f"!!!! NO 'b' OR 'p' LABELS FOUND, SKIPPING: {transcript_sentences_file}")
        continue
    df = df.iloc[int(segment_starts[0]):].reset_index(drop=True)

    # rows that are neither 'b', 'p' nor 'i' (e.g. the "x" placeholder) are
    # still treated as segment starts below, so make that visible to the user
    unexpected = ~df["classification"].isin(["b", "p", "i"])
    if unexpected.any():
        print(
            f"!!!! {int(unexpected.sum())} SENTENCES WITHOUT A 'b'/'p'/'i' LABEL "
            f"(EACH STARTS ITS OWN SEGMENT) IN: {transcript_sentences_file}"
        )

    # do not include public comment segments
    segments = []
    current_sentences = []
    is_public_comment = False
    for label, sentence in zip(df["classification"], df["sentence"]):
        if label == "i":
            # continuation of the segment that is currently open
            current_sentences.append(sentence)
            continue

        # any other label closes the open segment; keep it unless it is public comment
        if current_sentences and not is_public_comment:
            segments.append(" ".join(current_sentences).strip())

        current_sentences = [sentence]
        is_public_comment = (label == "p")

    # handle final segment
    if current_sentences and not is_public_comment:
        segments.append(" ".join(current_sentences).strip())

    # save segments to .csv; -1 is the "not matched yet / no match" code
    segments_file_path = TRANSCRIPT_SEGMENTS_PATH / f"{transcript_sentences_file.stem}.csv"
    with open(segments_file_path, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["segment", "matched_agenda_segment"])
        for segment in segments:
            writer.writerow([segment, -1])

## Step 5: MANUALLY match transcript segment to corresponding agenda segment

In [None]:
import pandas as pd
from pathlib import Path

The folder `_interim/transcript_segments/` should have a `.csv` file for each meeting, containing a column for the segments of the transcript. The folder `_interim/agenda_segments/` should have a `.csv` for each meeting containing the agenda segments.

In [None]:
# The transcript segments written in Step 4 are the input of this step.
assert TRANSCRIPT_SEGMENTS_PATH.exists()
# Folder with the per-meeting agenda segment `.csv` files; it is only checked
# here, not created, so it has to exist already.
AGENDA_SEGMENTS_PATH = Path("../_interim/agenda_segments/")
assert AGENDA_SEGMENTS_PATH.exists()

Manually match each transcript segment to the closest aligned agenda segment from the same meeting's agenda.

In [None]:
# Interactive matching of every transcript segment of the requested week to
# an agenda segment of the same meeting.
for transcript_segment_file in TRANSCRIPT_SEGMENTS_PATH.glob("*.csv"):

    # check if meeting took place in specified week
    date_str = transcript_segment_file.stem[:8]
    try:
        file_date = datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        # skip, file name does not start with a valid date
        continue
    if file_date < START_DATE or file_date > END_DATE:
        # skip, meeting did not take place in specified week
        continue

    # the agenda segments live in a file with the same name; skip the meeting
    # with a message if it is missing (same handling as in Step 6)
    agenda_file = AGENDA_SEGMENTS_PATH / f"{transcript_segment_file.stem}.csv"
    if not agenda_file.exists():
        print(f"!!!! AGENDA FILE NOT FOUND FOR: {transcript_segment_file}")
        continue

    # manually match each transcript segment to the index of the closest aligned agenda segment from the same meeting's agenda
    agenda_df = pd.read_csv(agenda_file)
    transcript_df = pd.read_csv(transcript_segment_file)
    max_index = len(agenda_df) - 1
    for idx, row in transcript_df.iterrows():
        print(f"transcript segment {idx}: {row['segment']}")

        # keep asking until a valid index (or -1 for "no match") is entered
        while True:
            user_input = input(f"enter the index of the closest aligned agenda segment (0-{len(agenda_df)-1} inclusive). enter -1 if no match: ").strip()
            try:
                agenda_idx = int(user_input)
            except ValueError:
                # only a failed number conversion is handled here, so that
                # interrupting the cell still stops the prompt
                print("!!!! INVALID, PLEASE ENTER A WHOLE NUMBER")
                continue
            if -1 <= agenda_idx <= max_index:
                break
            print(f"!!!! INVALID, PLEASE ENTER A NUMBER FROM -1 TO {max_index}")

        transcript_df.loc[idx, "matched_agenda_segment"] = agenda_idx

    transcript_df.to_csv(transcript_segment_file, index=False)




## Step 6: Combine agenda segments with their matched legislation and manually matched transcript segments

In [None]:
import pandas as pd
from pathlib import Path

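The folder `_interim/transcript_segments/` should have a `.csv` file for each meeting, containing a column for the segments of the transcript and a column for the index of the manually matched agenda segment. The folder `_interim/agenda_segments/` should have a `.csv` for each meeting containing the agenda segments. The results are written back to the agenda segment `.csv` files as the columns `matched_transcript` and `combined_segment`.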

In [None]:
# Both input folders of this step must exist before the segments can be combined.
assert TRANSCRIPT_SEGMENTS_PATH.exists()
assert AGENDA_SEGMENTS_PATH.exists()

In [None]:
# For every meeting of the requested week, attach the matched transcript
# segments to their agenda segments and build the combined segment text.
for transcript_file in TRANSCRIPT_SEGMENTS_PATH.glob("*.csv"):

    # check if meeting took place in specified week
    date_str = transcript_file.stem[:8]
    try:
        file_date = datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        # skip, file name does not start with a valid date
        continue
    if file_date < START_DATE or file_date > END_DATE:
        # skip, meeting did not take place in specified week
        continue

    agenda_file = AGENDA_SEGMENTS_PATH / f"{transcript_file.stem}.csv"
    if not agenda_file.exists():
        print(f"!!!! AGENDA FILE NOT FOUND FOR: {transcript_file}")
        continue

    # for each transcript segment, assign it to its matched agenda segment's row
    agenda_df = pd.read_csv(agenda_file)
    transcript_df = pd.read_csv(transcript_file)

    agenda_df["matched_transcript"] = "NO_TRANSCRIPT"

    for idx, row in transcript_df.iterrows():

        # -1 is the "no match" code; a blank cell is treated the same way
        agenda_idx = row["matched_agenda_segment"]
        if pd.isna(agenda_idx) or agenda_idx == -1:
            continue

        # the value may come back from the CSV as a float; use a plain int as row label
        agenda_idx = int(agenda_idx)
        if agenda_idx not in agenda_df.index:
            print(f"!!!! MATCHED AGENDA SEGMENT {agenda_idx} OUT OF RANGE FOR: {transcript_file}")
            continue

        # several transcript segments may map to one agenda segment: append them, one per line
        if agenda_df.loc[agenda_idx, "matched_transcript"] == "NO_TRANSCRIPT":
            agenda_df.loc[agenda_idx, "matched_transcript"] = row["segment"] + "\n"
        else:
            agenda_df.loc[agenda_idx, "matched_transcript"] = agenda_df.loc[agenda_idx, "matched_transcript"] + row["segment"] + "\n"

    # create combined segments by combining agenda segments, matched legislation, and matched transcript segment
    agenda_df["combined_segment"] = "NO_SEGMENT"
    for agenda_idx, agenda_row in agenda_df.iterrows():

        # not in transcript, so can skip
        # (agenda_row is a single row, i.e. a Series, so it is indexed by column name only)
        if agenda_row["matched_transcript"] == "NO_TRANSCRIPT":
            continue

        combined_text = (
            "**Section of meeting agenda:**\n"
            f"{agenda_row.get('agenda_segment', 'NO_AGENDA')}\n\n"
            "**Section of meeting legislation:**\n"
            f"{agenda_row.get('matched_legislation', 'NO_LEGISLATION')}\n\n"
            "**Section of meeting transcript:**\n"
            f"{agenda_row.get('matched_transcript', 'NO_TRANSCRIPT')}\n\n"
        )

        agenda_df.loc[agenda_idx, "combined_segment"] = combined_text

    # save results back to the agenda segment file
    agenda_df.to_csv(agenda_file, index=False)

This concludes Module 2: Transcript Processing.