# Trial transcript yes/no analysis

This notebook reads trial transcript PDFs and, for each witness and each questioner, counts how many yes/no questions the witness is asked.

Authors: Chris Iyer, Miles Zoltak
Updated: 5/20/2024

Input:
- file path of folder containing transcript PDFs

Desired output:
- the number of yes/no questions and the total number of questions asked of each witness, broken down by questioner

Pipeline:
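- Extract text from each PDF (currently with pypdf's `extract_text()`, which reads the PDF's embedded text rather than performing OCR)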
- Identify start of each witness's questioning
- Identify questions, and who is asking them
- Classify questions as yes/no or not

# Data loading and processing

In [1]:
import os
dir_path = "example_transcripts"
# Keep only PDF files, in sorted order. Any other directory entry (hidden
# files, checkpoint folders, notes) would otherwise be handed to the PDF
# reader further down and fail there.
files = sorted(f for f in os.listdir(dir_path) if f.lower().endswith(".pdf"))

In [2]:
# Notebook shell commands: install the two third-party packages (pypdf, tqdm).
!pip install pypdf
!pip install tqdm

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
     ---------------------------------------- 0.0/290.4 kB ? eta -:--:--
     -------------------------------------- 290.4/290.4 kB 9.0 MB/s eta 0:00:00
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0


In [6]:
from pypdf import PdfReader
from tqdm import tqdm

# Open one PdfReader per transcript file, keyed by file name.
reader_dict = {}
for fname in files:
  reader_dict[fname] = PdfReader(os.path.join(dir_path, fname))

print('Processing PDFs to text...')
# For every transcript, collect the extracted text of each page in page order.
pages_dict = {}
progress = tqdm(reader_dict.items(), total=len(reader_dict))
for fname, reader in progress:
  pages_dict[fname] = [page.extract_text() for page in reader.pages]
print('finished!\n')

Processing PDFs to text...


100%|██████████| 33/33 [00:33<00:00,  1.00s/it]

finished!






# Cleaning transcript pages and identifying the start of each witness's questioning

In [None]:
import re

# Exploratory clean-up of a single page: the page at index 2 of `pages1`.
first_real_page = pages1[2]

# Drop a "ROUGH DRAFT" watermark sitting at the very start of the page text
# (together with any whitespace in front of it).
watermark_regex = r"^\s*ROUGH DRAFT"
first_real_page = re.sub(watermark_regex, '', first_real_page)

# Blank out every line that consists solely of digits (a bare line number).
# The line itself is kept, just emptied.
line_numbers_regex = r"^\d+$"
lines = first_real_page.split("\n")
processed_lines = [re.sub(line_numbers_regex, "", line) for line in lines]

# Reassemble the page, trim surrounding whitespace, and show the result.
first_real_page = "\n".join(processed_lines).strip()
print(first_real_page)

In [None]:
def remove_watermark(page):
  """Strip a "ROUGH DRAFT" watermark from the very start of a page.

  Only a watermark at the beginning of the page text is removed, along with
  any whitespace that precedes it. Occurrences later in the page are left
  untouched.

  Args:
    page: the text of one page.

  Returns:
    The page text without the leading watermark.
  """
  return re.sub(r"^\s*ROUGH DRAFT", '', page)

def remove_line_numbers(page):
  """Blank out the lines of a page that contain nothing but digits.

  A line made up solely of digits is replaced by an empty line (the line
  break is kept). Every other line, including one that merely starts with a
  number, is returned unchanged.

  Args:
    page: the text of one page, lines separated by "\\n".

  Returns:
    The page text with digit-only lines emptied.
  """
  digits_only = re.compile(r"^\d+$")
  return "\n".join(digits_only.sub("", row) for row in page.split("\n"))

def process_page(page):
  """Clean the text of a single transcript page.

  Applies remove_watermark, then remove_line_numbers, then trims leading and
  trailing whitespace from the result.

  Args:
    page: the text of one page.

  Returns:
    The cleaned page text.
  """
  without_watermark = remove_watermark(page)
  without_line_numbers = remove_line_numbers(without_watermark)
  return without_line_numbers.strip()

def process_transcript(transcript_pages, n_front_pages=2):
  """Clean every page of a transcript after skipping its leading pages.

  Args:
    transcript_pages: list of page texts, one string per page.
    n_front_pages: how many pages to drop from the start before cleaning.
      Defaults to 2, the value that used to be hard-coded here, so existing
      calls behave exactly as before.

  Returns:
    A list with one cleaned string (see process_page) per remaining page.

  Raises:
    ValueError: if n_front_pages is negative. A negative slice start would
      silently keep only the last pages instead of skipping leading ones.
  """
  if n_front_pages < 0:
    raise ValueError("n_front_pages must be non-negative")

  # drop the leading pages, then process each remaining page
  return [process_page(page) for page in transcript_pages[n_front_pages:]]

In [None]:
# Clean two transcripts (lists of page texts) with the same pipeline.
t1 = process_transcript(pages1)
t2 = process_transcript(pages2)

In [None]:
def extract_name(line):
  """Return the all-caps label that precedes the first colon of a line.

  Args:
    line: one line of transcript text.

  Returns:
    The text before the first ":" with surrounding whitespace removed, if
    that text contains at least one letter and all of its letters are
    upper case (e.g. "BY MS. FOG"). Otherwise None, including when the line
    has no colon.
  """
  label, colon, _ = line.partition(":")
  if not colon:
    return None

  label = label.strip()
  # str.isupper() needs at least one cased character, so prefixes with no
  # letters at all (the "10" in "10:30", or an empty prefix) are rejected
  # rather than reported as a speaker name.
  return label if label.isupper() else None

In [None]:
# Find the first page of t1 that has a line labelled "BY MS. FOG", then print
# that page followed by its index.
stop = False
for i, page in enumerate(t1):
  stop = any(extract_name(line) == "BY MS. FOG" for line in page.split("\n"))
  if stop:
    print(page)
    print(i)
    break

In [None]:
# Load a separate PDF and keep the lower-cased text of each of its pages.
pf_path_clean = "drive/My Drive/pdf_stuff/pulp_fiction_cleaner.pdf"
pf_reader_clean = PdfReader(pf_path_clean)
pf_pages_clean = []
for pdf_page in pf_reader_clean.pages:
  page_text = pdf_page.extract_text()
  pf_pages_clean.append(page_text.lower())

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the sequence-classification model are loaded from the
# same pretrained checkpoint identifier.
checkpoint_name = "shahrukhx01/question-vs-statement-classifier"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)


In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the classifier from one shared checkpoint identifier
checkpoint_name = "shahrukhx01/question-vs-statement-classifier"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)

def classify_text(text):
    """Return the classifier's probability that ``text`` is a question.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: the string to classify.

    Returns:
        A Python float: the softmax probability at class index 1 for the
        single input sequence.
    """
    # Tokenize the text. Truncation keeps over-long inputs within the
    # tokenizer's maximum length instead of feeding the model a sequence
    # longer than it accepts.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: no gradients are needed, so skip autograd tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # Turn the logits into probabilities across the classes
    probabilities = F.softmax(outputs.logits, dim=1)

    # NOTE(review): index 1 is taken to be the 'question' class — confirm
    # against the checkpoint's label mapping.
    probability_question = probabilities[0][1].item()

    return probability_question

In [None]:
questions = []
for page in pf_pages_clean[1:]:
  lines = page.split("\n")
