Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-implement file reading methods + add unit tests #9

Merged
merged 1 commit into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 91 additions & 18 deletions anonipy/utils/file_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,114 @@
The file system utilities
"""

import os
import re
import json
from typing import Union, Any
from typing import Union

import textract
from docx import Document
from pypdf import PdfReader


# Define namespaces
WORD_NAMESPACES = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

# =====================================
# Helper functions
# =====================================


def text_decode(
    text: Union[bytes, str], decode: Union[str, bool] = True
) -> Union[str, bytes]:
    """Decode raw file content into text.

    Args:
        text: The content to decode. Bytes are decoded; a value that is
            already a ``str`` is returned unchanged because it has no
            ``decode`` method.
        decode: Either the name of the encoding to use, or a flag. Any truthy
            non-string value selects UTF-8; a falsy value disables decoding
            and returns ``text`` untouched.

    Returns:
        The decoded string, or ``text`` itself when decoding is disabled or
        not needed.
    """
    if not decode or isinstance(text, str):
        return text
    # A string names the encoding explicitly; every other truthy value
    # (``True`` included) falls back to UTF-8 instead of returning None.
    encoding = decode if isinstance(decode, str) else "utf-8"
    return text.decode(encoding)
def remove_extra_spaces(text: str) -> str:
    """Normalize the whitespace of a text.

    Leading and trailing whitespace is stripped, every run of spaces is
    collapsed into a single space, and every run of two or more newlines is
    collapsed into exactly two newlines (one blank line).

    Args:
        text: The text to clean.

    Returns:
        The text with the redundant spaces and blank lines removed.
    """
    text = text.strip()
    # collapse runs of spaces first, then runs of blank lines
    text = re.sub(r" +", " ", text)
    text = re.sub(r"\n{2,}", "\n\n", text)
    return text


def remove_page_numbers(text: str) -> str:
    """Drop the lines that contain nothing but a number.

    Every remaining line is stripped of its leading and trailing whitespace.
    Empty lines are kept.

    NOTE(review): any line made up solely of digits is treated as a page
    number, so a genuine numeric-only content line is removed as well.

    Args:
        text: The text of a single page.

    Returns:
        The text without the digit-only lines, joined with newlines.
    """
    # ``match`` already anchors at the start of the line, so one anchored
    # pattern covers everything the previous two-alternative pattern matched.
    page_number_pattern = re.compile(r"^\s*\d+\s*$")
    filtered_lines = [
        line.strip()
        for line in text.splitlines()
        if not page_number_pattern.match(line)
    ]
    return "\n".join(filtered_lines)


# =====================================
# PDF extractor
# =====================================


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF document.

    Each page is extracted with pypdf's ``layout`` extraction mode, cleaned
    with ``remove_page_numbers`` and ``remove_extra_spaces``, and the cleaned
    pages are joined with a single newline.

    Args:
        pdf_path: The path to the PDF file.

    Returns:
        The text of the whole document.
    """
    pdf_reader = PdfReader(pdf_path)

    pages_text = []
    for page in pdf_reader.pages:
        text = page.extract_text(extraction_mode="layout")
        text = remove_page_numbers(text)
        text = remove_extra_spaces(text)
        pages_text.append(text)
    document_text = "\n".join(pages_text)

    return document_text


def get_variable_name(var: Any) -> Union[str, None]:
    """Find the module-level name bound to an object.

    The lookup is by identity (``is``) over this module's globals; the first
    matching name is returned.

    Args:
        var: The object to look up.

    Returns:
        The name of the first global bound to ``var``, or ``None`` when no
        global refers to it.
    """
    for name, value in globals().items():
        if value is var:
            return name
    return None


# =====================================
# Word extractor
# =====================================


def _word_process_paragraph(p) -> str:
    """Return the text of a paragraph element.

    Args:
        p: The paragraph element; only its ``text`` attribute is read.

    Returns:
        The value of ``p.text``.
    """
    return p.text


def _word_process_table(t) -> str:
    """Flatten a table element into plain text.

    The paragraphs of a cell are joined with a space, the cells of a row are
    joined with a space, and the rows are joined with a newline.

    NOTE(review): the ``.//`` searches visit all descendants, so the content
    of a nested table would presumably be collected more than once - confirm
    if nested tables must be supported.

    Args:
        t: The table element, searched with the ``w:`` namespace prefix
            defined in ``WORD_NAMESPACES``.

    Returns:
        The text of the table, one line per row.
    """
    table_text = []
    for row in t.findall(".//w:tr", WORD_NAMESPACES):
        row_text = []
        for cell in row.findall(".//w:tc", WORD_NAMESPACES):
            cell_text = [p.text for p in cell.findall(".//w:p", WORD_NAMESPACES)]
            row_text.append(" ".join(cell_text))
        table_text.append(" ".join(row_text))
    return "\n".join(table_text)


def extract_text_from_word(doc_path: str) -> str:
    """Extract the text of a Word document.

    The children of the document body are visited in order, so paragraphs
    and tables keep their relative position. Any other kind of body element
    is ignored.

    Args:
        doc_path: The path to the Word file.

    Returns:
        The text of the paragraphs and tables, joined with newlines.
    """
    doc = Document(doc_path)
    content = []
    for element in doc.element.body:
        # NOTE(review): elements are recognized by the suffix of their
        # qualified tag, not by an exact tag comparison.
        if element.tag.endswith("p"):
            # element is a paragraph
            content.append(_word_process_paragraph(element))
        elif element.tag.endswith("tbl"):
            # element is a table
            content.append(_word_process_table(element))
    document_text = "\n".join(content)
    return document_text


# =====================================
# Main functions
# =====================================


def open_file(file_path: str, encode: Union[str, bool] = True) -> str:
    """Read the content of a file through textract.

    Args:
        file_path: The path to the file.
        encode: Passed to ``text_decode`` as its ``decode`` argument to
            control how the extracted content is decoded.

    Returns:
        The result of ``text_decode`` applied to the extracted content.
    """
    text = textract.process(file_path)
    text = text_decode(text, encode)
    return text
def open_file(file_path: str) -> str:
    """Read the text content of a file, based on its extension.

    The extension is compared case-insensitively. ``.pdf`` files go through
    ``extract_text_from_pdf``, ``.doc`` and ``.docx`` files go through
    ``extract_text_from_word``, and ``.txt`` files are read as UTF-8.

    Args:
        file_path: The path to the file.

    Returns:
        The text content of the file.

    Raises:
        ValueError: If the file extension is not supported.
    """
    _, file_extension = os.path.splitext(file_path)
    # normalize once; the original extension is kept for the error message
    extension = file_extension.lower()
    if extension == ".pdf":
        return extract_text_from_pdf(file_path)
    if extension in (".doc", ".docx"):
        return extract_text_from_word(file_path)
    if extension == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    raise ValueError(f"The file extension is not supported: {file_extension}")


def open_json(file_path: str) -> dict:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ lingua-language-detector
guidance==0.1.14
sentencepiece
# File readers
textract
pypdf
python-docx
# Monitoring
tqdm
Binary file added test/resources/example.docx
Binary file not shown.
Binary file added test/resources/example.pdf
Binary file not shown.
22 changes: 22 additions & 0 deletions test/resources/example.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Medical Record

Patient Name: John Doe
Date of Birth: 15-01-1985
Date of Examination: 20-05-2024
Social Security Number: 123-45-6789

Examination Procedure:

John Doe underwent a routine physical examination. The procedure included measuring vital signs
(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
test. The patient also reported occasional headaches and dizziness, prompting a neurological
assessment and an MRI scan to rule out any underlying issues.

Medication Prescribed:

Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg Take one tablet daily to manage high blood pressure.

Next Examination Date:

15-11-2024
67 changes: 67 additions & 0 deletions test/resources/example_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
WORD_TEXT = """\
Medical Record

Patient Name: John Doe
Date of Birth: 15-01-1985
Date of Examination: 20-05-2024
Social Security Number: 123-45-6789
Examination Procedure:
John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.

Medication Prescribed:
Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg Take one tablet daily to manage high blood pressure.

Next Examination Date:
15-11-2024
""".strip()

PDF_TEXT = """\
Medical Record

Patient Name: John Doe
Date of Birth: 15-01-1985
Date of Examination: 20-05-2024
Social Security Number: 123-45-6789

Examination Procedure:

John Doe underwent a routine physical examination. The procedure included measuring vital signs
(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
test. The patient also reported occasional headaches and dizziness, prompting a neurological
assessment and an MRI scan to rule out any underlying issues.

Medication Prescribed:

Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg Take one tablet daily to manage high blood pressure.

Next Examination Date:

15-11-2024
""".strip()

TXT_TEXT = """\
Medical Record

Patient Name: John Doe
Date of Birth: 15-01-1985
Date of Examination: 20-05-2024
Social Security Number: 123-45-6789

Examination Procedure:

John Doe underwent a routine physical examination. The procedure included measuring vital signs
(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
test. The patient also reported occasional headaches and dizziness, prompting a neurological
assessment and an MRI scan to rule out any underlying issues.

Medication Prescribed:

Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg Take one tablet daily to manage high blood pressure.

Next Examination Date:

15-11-2024
""".strip()
34 changes: 34 additions & 0 deletions test/test_file_system.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest

from anonipy.utils.file_system import open_file

# =====================================
# Helper functions
# =====================================

from test.resources.example_outputs import WORD_TEXT, PDF_TEXT, TXT_TEXT

resources = {
"word": "./test/resources/example.docx",
"pdf": "./test/resources/example.pdf",
"txt": "./test/resources/example.txt",
}

# =====================================
# Test File System
# =====================================


class TestFileSystem(unittest.TestCase):
    """Check that ``open_file`` returns the expected text for each resource."""

    def test_open_file_word(self):
        self.assertEqual(open_file(resources["word"]), WORD_TEXT)

    def test_open_file_pdf(self):
        self.assertEqual(open_file(resources["pdf"]), PDF_TEXT)

    def test_open_file_txt(self):
        self.assertEqual(open_file(resources["txt"]), TXT_TEXT)


if __name__ == "__main__":
    unittest.main()
Loading