In [7]:
import csv
import re
import PyPDF4

In [8]:
# Directory the deposition transcript is read from
depopath = "./depos/"
# Directory the generated CSV is written to
csvpath = "./csvs/"

# Name of the transcript file inside `depopath` (.txt is read as text,
# anything else is handled as a PDF further down)
filename = "sample.pdf"

In [9]:
# Full path of the input transcript
filepath = "{}{}".format(depopath, filename)
# Output path stem: keeps the input's extension; '.csv' is appended when the
# CSV file is opened for writing
outpath = "{}{}".format(csvpath, filename)

In [10]:
# Load the transcript into `lines` (a list of strings, one per text line).
#   .txt (any letter case) -> read the file's lines directly
#   anything else          -> treat as a PDF and extract text page by page
# Both branches use `with` so the file handle is released even if reading or
# PDF parsing raises part-way through.
if filepath.split('.')[-1].lower() == "txt":
    with open(filepath, 'r') as depo:
        lines = depo.readlines()
else:
    lines = []

    with open(filepath, 'rb') as depo:
        pdfReader = PyPDF4.PdfFileReader(depo)

        # extracting text from each page
        for i in range(pdfReader.numPages):
            # ignore last 3 lines, per veritext standard (page #, veritext, phone #)
            lines.extend(pdfReader.getPage(i).extractText().splitlines()[:-3])

In [15]:
# All patterns below are regular expressions, written as raw strings so that
# backslash escapes reach the `re` module unchanged.

# Regex pattern for the HH:MM:SS timestamp to remove from each line
r = r'[0-9]{2}:[0-9]{2}:[0-9]{2}'

# "Highly Confidential Attorney's Eyes Only" statement (starts with '**')
# or 'Page ##' / 'Pages ##' match strings; matching lines are skipped
eyes = [r"\*\*", r"Page[s]* [0-9]+"]

# Questioner list, includes all speakers that are not answering.
# The last entry is the generic 'NAME:  ' speaker pattern.
qs = [r"Q\.  ", r"Q  ", r"[A-Z]+:  "]

# Responder list. The dot in 'A.' is escaped so it only matches a literal
# period (an unescaped '.' would also accept lines starting 'AT  ', 'AN  ', ...).
ans = [r"A\.  ", r"A  ", r"THE WITNESS:  ", r"THE DEPONENT:  "]

In [16]:
# Number of leading characters to drop from every line.
# Taken from the first line that contains "  1  " (presumably the line-number
# gutter of transcript line 1 -- confirm against the source format): the offset
# of that marker plus 3. Stays 0 when no line contains the marker.
leading_chars = 0

for line in lines:
    if "  1  " not in line:
        continue
    leading_chars = line.index("  1  ") + 3
    break

In [17]:
# Index of the current speaker inside a [Q, A] pair: 0 = question, 1 = answer
# (-1 until the first speaker label is seen)
cur_speaker = -1

# Becomes 1 once the first 'Q  ' / 'Q.  ' line is seen; nothing is captured before that
started = 0

# Result rows in the form [[Q, A], [Q, A], ...]; the A slot may stay empty
out_list = []

# Main loop: walk the transcript line by line
for line in lines:

    # Normalise the line: drop the leading gutter, remove any timestamp,
    # then trim surrounding whitespace
    line = re.sub(r, '', line[leading_chars:]).strip()

    # The first line that begins with 'Q  ' or 'Q.  ' switches capturing on
    # (the generic 'NAME:  ' pattern, qs[-1], is deliberately excluded here)
    if not started:
        if any(re.match(pattern, line) for pattern in qs[:-1]):
            started = 1

    # Lines that carry no content are dropped: blank lines, bare page numbers,
    # and confidentiality / 'Page ##' banners
    if line == "" or line.isdecimal():
        continue
    if any(re.match(pattern, line) for pattern in eyes):
        continue

    # Speaker detection. Answers are tested first so that 'THE WITNESS:  '
    # is not picked up by the generic ':  ' questioner pattern.
    if any(re.match(pattern, line) for pattern in ans):
        cur_speaker = 1
    elif any(re.search(pattern, line) for pattern in qs):
        cur_speaker = 0
        # A new questioner line opens a fresh [Q, A] pair once capturing is on
        if started:
            out_list.append(["", ""])

    # Accumulate the text under the current speaker of the latest pair
    if started:
        out_list[-1][cur_speaker] += line + " "

    # Stop before the index that follows the end of the deposition
    if "Deposition ended at" in line:
        break

In [18]:
# Write one CSV row per [Q, A] pair to '<outpath>.csv'.
# newline='' lets the csv module control line endings itself.
with open(outpath + '.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for qa_pair in out_list:
        writer.writerow(qa_pair)