# Trial transcript yes/no analysis

This code will read trial transcript PDFs and for each witness (and each questioner) quantify how many yes/no questions that witness is asked.

Authors: Chris Iyer, Miles Zoltak
Updated: 5/21/2024

Input:
- file path of folder containing transcript PDFs (currently, this should be run separately for each case/trial)

Output:
- writes a text file containing, for each witness and each examiner, the number of yes/no questions and the total number of questions asked

NOTE: in some instances the PDF reader misses lines, so the results won't be 100% accurate. This code contains a couple of shortcuts for guessing information that was lost in the PDF reading process.

For example, look at `12RT.pdf` page 95 // loc 1721. Compare to `entire_transcript[1984825: 1984890]` or `lines[77314:77316]`--these are missing two lines between "DIRECT EXAMINATION" and "A. POLICE OFFICER WITH THE..."

In this example, the examiner is not identified, and the question asked is not identified. This is rare. But in these rare cases, we will guess who the examiner is, and try to infer whether it was a yes/no question from the answer.

# Install all dependencies

In [9]:
%pip install pypdf
%pip install tqdm

%pip install transformers
%pip install torch
%pip install openai

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Data loading and processing

In [10]:
import os
# Folder holding the transcript PDFs (run separately for each case/trial).
dir_path = "example_transcripts"
# Sorted by file name. Only PDFs are kept: every entry in this list is later
# handed to the PDF reader, so stray files (e.g. .DS_Store) must be skipped.
files = sorted(f for f in os.listdir(dir_path) if f.lower().endswith('.pdf'))

In [11]:
# Read all the PDFs into a huge string, and then split into a big list of lines
from pypdf import PdfReader
from tqdm import tqdm

print('Processing PDFs to text...')
page_texts = []
for file in tqdm(files, total=len(files)):
    reader = PdfReader(os.path.join(dir_path, file))
    # one entry per page, each terminated by a newline
    page_texts.extend(page.extract_text() + '\n' for page in reader.pages)

# the whole transcript as one string, and the same text as a list of lines
entire_transcript = ''.join(page_texts)
lines = entire_transcript.split('\n')
print('finished!\n')

Processing PDFs to text...


100%|██████████| 33/33 [00:46<00:00,  1.40s/it]

finished!






# Process transcript into witness statistics

In [12]:
############## HELPER FUNCTIONS ##############
import re

def clean_simple_line(line):
    """Return `line` reduced to upper-case letters and whitespace.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, ...) is removed; the result is upper-cased and stripped of
    leading/trailing whitespace.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', line)
    return letters_and_spaces.upper().strip()

def line_is_witness_identifier(lines, i):
    """Return True if line `i` looks like the name line that introduces a witness.

    A witness line is short (fewer than 6 space-separated words once cleaned)
    and is immediately followed by a line containing ' as a witness'
    (case-insensitive).
    """
    name_words = clean_simple_line(lines[i]).split(' ')
    if len(name_words) >= 6:
        return False
    if i >= len(lines) - 1:
        # last line of the transcript: there is no following line to inspect
        return False
    following = lines[i + 1].lower()
    return ' as a witness' in following

def who_presents_this_witness(lines, witness_line_i):
    """Guess which side called the witness introduced on line `witness_line_i`.

    Scans up to four lines after the witness line for keywords
    (case-insensitive). On each line 'people' is checked before
    'defense'/'defendant'.

    Returns:
        'people', 'defense', or 'unknown' if no keyword is found.
    """
    # Clamp the window to the end of the transcript so a witness line near the
    # last page cannot index past the list.
    window_end = min(witness_line_i + 5, len(lines))
    for j in range(witness_line_i + 1, window_end):
        lowered = lines[j].lower()
        if 'people' in lowered:
            return 'people'
        if 'defense' in lowered or 'defendant' in lowered:
            return 'defense'
    return 'unknown'

def line_is_examiner_identifier(line):
    """Return True if `line` looks like an examiner line such as "By Mr. Smith:".

    Digits (e.g. transcript line numbers) and surrounding whitespace are
    removed first; the colon is kept because it is part of the pattern.
    """
    stripped = re.sub(r'\d+', '', line).strip()
    if len(stripped.split(' ')) >= 6:
        return False
    if stripped[0:2].lower() != 'by':
        return False
    # non-empty here, since it starts with 'by'
    return stripped[-1] == ':'

def clean_examiner_name(examiner_line):
    """Extract the examiner's name from a line such as "BY MR. SMITH:".

    When the line has both a period and a colon, the name is the text between
    the first period and the first colon. Otherwise it is the first
    space-separated word that contains a colon. Either way the result is
    passed through clean_simple_line.
    """
    dot_at = examiner_line.find('.')
    colon_at = examiner_line.find(':')
    if dot_at != -1 and colon_at != -1:
        return clean_simple_line(examiner_line[dot_at:colon_at])

    words_with_colon = [word for word in examiner_line.split(' ') if ':' in word]
    return clean_simple_line(words_with_colon[0])

def line_is_examination_identifier(lines, i):
    """Return True if line `i` is a heading such as "DIRECT EXAMINATION" or "CROSS-EXAMINATION".

    The cleaned line must have fewer than 4 words, contain 'EXAMINATION' and
    either 'CROSS' or 'DIRECT', and be followed by an examiner line or a line
    starting with 'Q.' / 'A.'.
    """
    heading = clean_simple_line(lines[i])
    if len(heading.split()) >= 4 or 'EXAMINATION' not in heading:
        return False
    if 'CROSS' not in heading and 'DIRECT' not in heading:
        return False
    if i >= len(lines) - 1:
        # no following line to confirm that an examination actually starts here
        return False
    following = lines[i + 1]
    return line_is_examiner_identifier(following) or following.startswith(('Q.', 'A.'))

def is_answer(line):
    """Return True if `line`, ignoring surrounding whitespace, begins with 'A. '."""
    return line.strip()[:3] == 'A. '

def starts_question(text, current_examiner):
    """Return True if `text` begins a question.

    A question starts with a 'Q. ' marker or with the current examiner
    speaking (e.g. "SMITH:" appearing in the line).

    Args:
        text: one transcript line.
        current_examiner: cleaned examiner name, or '' when not yet known.
    """
    if 'Q. ' in text:
        return True
    # With no known examiner the speaker tag would collapse to ':' and every
    # line containing a colon (e.g. "THE COURT: ...") would look like a question.
    return bool(current_examiner) and current_examiner + ':' in text

from openai import OpenAI
def is_yes_no(question):
    """Return True if GPT classifies `question` as a yes/no question.

    Reads the OpenAI API key from 'key.txt' in the working directory and sends
    one chat-completion request per call.
    """
    with open('key.txt', 'r') as f:
        # strip(): a trailing newline in the key file would otherwise become part of the key
        KEY = f.read().strip()
    client = OpenAI(api_key=KEY)

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert at identifying yes/no questions, and at analyzing courtroom transcripts."},
            {"role": "user", "content": f"Is the following question a yes or no question? Respond with 'yes' or 'no':\n\nQuestion: {question}"}
        ]
    )
    # The reply may be None, and may come back as "Yes." or "Yes, it is." rather than
    # a bare "yes", so normalise it and match on the leading word.
    verdict = (completion.choices[0].message.content or '').strip().lower()
    return verdict.startswith('yes')



######################### NOT CURRENTLY USING ###################################################################

import torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def is_question_backup(text):
    """Classify `text` as question vs. statement with a pre-trained model.

    The tokenizer and model are loaded from the
    "shahrukhx01/question-vs-statement-classifier" checkpoint on every call.
    Returns True when the probability at index 1 is above 0.5; index 1 is
    taken to be the 'question' class, per the original comment in this file.
    """
    tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/question-vs-statement-classifier")
    classifier = AutoModelForSequenceClassification.from_pretrained("shahrukhx01/question-vs-statement-classifier")
    encoded = tokenizer(text, return_tensors="pt")
    logits = classifier(**encoded).logits
    question_probability = softmax(logits, dim=1)[0][1].item()
    return question_probability > 0.5


In [13]:
# There are some instances where the 'examiner identification' line ("BY MR. X:") isn't read
# properly by the pdf reader. For these, we need a default guess for who the examiner is.
# So, for each side (people/defense), we find the first direct examination of a witness
# presented by that side and save who the examiner is -- this is a good guess.

DEFAULT_EXAMINER_KEY = {'people': '', 'defense': ''}
found = {'people': False, 'defense': False}
for i in range(len(lines)):
    if line_is_witness_identifier(lines, i):
        side = who_presents_this_witness(lines, i)
        if side != 'unknown' and not found[side]:
            # search the next 200 lines for a direct exam heading; the first examiner
            # line at or after that heading gives the examiner ID
            direct_exam_found = False  # must start False, or any "BY ...:" line would be accepted
            for j, line in enumerate(lines[i:i+200]):
                if line_is_examination_identifier(lines, i+j) and 'DIRECT' in line.upper():
                    direct_exam_found = True
                if direct_exam_found and line_is_examiner_identifier(line):
                    DEFAULT_EXAMINER_KEY[side] = clean_examiner_name(line)
                    found[side] = True
                    break
    if found['people'] and found['defense']:
        break

print('Default examiner default guesses: ', DEFAULT_EXAMINER_KEY, '\nIf these look incorrect, please stop and revise.')


def guess_examiner(witness_side, current_examination):
    """Guess the examiner when the "BY ...:" line was lost in PDF extraction.

    On a direct examination the guess is the default examiner of the side that
    presents the witness; on a cross examination it is the default examiner of
    the other side.

    Args:
        witness_side: 'people' or 'defense'; any other value (e.g. 'unknown'
            or '') means the side could not be determined.
        current_examination: the cleaned examination heading.

    Returns:
        The guessed examiner name, or 'error: unknown examiner' when the side
        or the examination type is not recognised.
    """
    print('Examiner not found, guessing from previous records (this message should be rare).')
    if witness_side not in DEFAULT_EXAMINER_KEY:
        # Without a known side neither guess is meaningful (and a direct lookup would KeyError).
        return 'error: unknown examiner'

    examination = current_examination.upper()
    if 'DIRECT' in examination:
        return DEFAULT_EXAMINER_KEY[witness_side]
    if 'CROSS' in examination:
        other_side = [side for side in DEFAULT_EXAMINER_KEY if side != witness_side][0]
        return DEFAULT_EXAMINER_KEY[other_side]
    return 'error: unknown examiner'

Default examiner default guesses:  {'people': 'ARNOLD', 'defense': 'JAFFE'} 
If these look incorrect, please stop and revise.


In [14]:
# loop through lines and compile statistics
# name_to_stats maps witness name -> examiner name -> {'total_questions', 'yes_no_questions'}
name_to_stats = {}

current_witness = ''
current_witness_side = ''
current_examination = ''
current_examiner = ''

active_question = ''  # text of the question being accumulated; '' when none is open

idxs = []  # debugging aid: examination-heading line indices for one hard-coded witness

for i,line in enumerate(lines):
    if line_is_witness_identifier(lines, i):
        current_witness = clean_simple_line(line)
        current_witness_side = who_presents_this_witness(lines, i)
        name_to_stats.setdefault(current_witness, {})
        active_question = '' # just in case we get carried away

    elif line_is_examination_identifier(lines, i):
        current_examiner = ''
        current_examination = clean_simple_line(line)
        active_question = ''
        if current_witness == 'DESHAUNNA CODY THOMAS':
            idxs.append(i)

    elif line_is_examiner_identifier(line):
        current_examiner = clean_examiner_name(line)
        active_question = ''

    # when we hit an answer, I want the active_question to be everything since the last question
    elif starts_question(line, current_examiner):
        active_question = line # start adding to active_question

    elif is_answer(line):
        if current_examiner == '': # error in pdf reading: no examiner info
            current_examiner = guess_examiner(current_witness_side, current_examination)

        if '?' in active_question: # to rule out things like "Q. Good morning."
            # setdefault on the witness too: an answer can appear before any witness
            # line has been recognised, which would otherwise raise a KeyError
            examiner_stats = name_to_stats.setdefault(current_witness, {}).setdefault(
                current_examiner, {'total_questions': 0, 'yes_no_questions': 0}
            )
            examiner_stats['total_questions'] += 1
            if is_yes_no(active_question):
                examiner_stats['yes_no_questions'] += 1

        active_question = '' # reset

    elif active_question:
        # continuation of a multi-line question. The lines carry no trailing newline,
        # so join with a space to keep the words on either side of the break apart.
        # resets at every answer or special identifying line
        active_question += ' ' + line
        

# Output .txt file

In [None]:
from datetime import datetime
def get_unique_id(lines):
    """Return an identifier for this run, suitable for use in a file name.

    Looks for a case number ("NO. <number>") in the first 30 lines and returns
    'case-<number>'. If none is found, falls back to a timestamp of the form
    'date-YYYY-MM-DD_HH-MM'.
    """
    for text in lines[0:30]:
        if 'NO. ' in text: # case number
            case_number = text.split('NO. ')[1].strip()
            # the id ends up in a file name, so neutralise path separators and
            # other characters that file systems commonly reject
            case_number = re.sub(r'[\\/:*?"<>|]', '_', case_number)
            if case_number: # skip a dangling "NO. " with nothing after it
                return 'case-' + case_number
    return datetime.now().strftime('date-%Y-%m-%d_%H-%M')


# Build the report piece by piece and join once at the end.
output_parts = ['Witness Yes/No Question Statistics \n\n']

for name, values in name_to_stats.items():
    output_parts.append(f'Witness: {name}\n')
    for examiner, stats in values.items():
        yes_no = stats['yes_no_questions']
        total = stats['total_questions']
        output_parts.append(f'\tExaminer: {examiner}\n')
        output_parts.append(f'\t\t Yes/no questions: {yes_no}\n')
        output_parts.append(f'\t\t Total questions: {total}\n')

        # Check the zero case explicitly instead of using a bare except, which
        # would also hide unrelated errors (and appended '%' to the error text).
        if total:
            percentage = f'{round(yes_no / total * 100, 2)}%'
        else:
            percentage = 'error: no questions'
        output_parts.append(f'\t\t Yes/no percentage: {percentage}\n')
    output_parts.append('\n')

output_text = ''.join(output_parts)

# the file name carries the case number (or a timestamp) so runs don't overwrite each other
with open(f'yn_transcript_output_{get_unique_id(lines)}.txt', 'w', encoding='utf-8') as file:
    file.write(output_text)

# Debugging

In [None]:
# Known problem: some question/answer lines lose their "Q." / "A." marker when the pdf
# is read by this software. This scan collects the indices where two questions (i1) or
# two answers (i2) occur in a row with no court/objection line in between, which hints
# at a missing marker between them -- to be fixed later.

current = ''  # kind of the last marker seen: 'question', 'answer', or '' after an interruption
i1 = []       # question lines directly following another question
i2 = []       # answer lines directly following another answer
for i, line in enumerate(lines):
    head = line[0:10]  # markers are only looked for near the start of the line
    interrupted = 'THE COURT' in line or 'OBJECTION' in line.upper()
    if interrupted:
        current = ''
    if 'Q. ' in head:
        if current == 'question':
            i1.append(i)
        current = 'question'
    if 'A. ' in head:
        if current == 'answer':
            i2.append(i)
        current = 'answer'
print(len(i1), len(i2))

506 1244
