# Demo: read in .docx and PDF files and check whether an LLM can classify a document

In [1]:
%pip install python-docx -q
%pip install PyMuPDF -q
%pip install transformers -q

## Methods to extract text from docs.

In [9]:
from docx import Document
import pymupdf

def get_docx_text(path):
    """
    Return the text of the .docx file at `path` as a single string.

    The text of every entry in the document's `paragraphs` is joined with
    newlines, and leading/trailing whitespace is stripped from the result.
    """
    document = Document(path)
    paragraph_texts = (para.text for para in document.paragraphs)
    return "\n".join(paragraph_texts).strip()


def get_pdf_text(path):
    """
    Return the plain text of the PDF at `path` as a single string.

    The text of each page (as returned by `page.get_text()`) is concatenated
    in page order with no separator added between pages.

    The document is closed even if reading a page raises, so the file handle
    is not leaked on error.
    """
    doc = pymupdf.open(path)
    try:
        # get_text() already returns a str; no encode/decode round trip needed.
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()

In [11]:
# Input locations. Change DATA_DIR to point the demo at a different folder.
DATA_DIR = '/content/drive/MyDrive/agata'
DOCX_PATH = f'{DATA_DIR}/aiml_analytics_charter_090821.docx'
PDF_PATH = f'{DATA_DIR}/20_FAM_301.2_1.pdf'

# Extract the raw text of both documents; later cells pass these to the model.
docx_text = get_docx_text(DOCX_PATH)
pdf_text = get_pdf_text(PDF_PATH)

In [4]:
import os
from getpass import getpass

import torch
from transformers import pipeline

# Hugging Face access token.
# SECURITY: never store a literal token in the notebook. Any token that was
# ever committed here must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
# The token is taken from the HF_TOKEN environment variable; if it is not set,
# prompt for it interactively (the input is not echoed or saved in the notebook).
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

# Build the text-generation pipeline used by the cells below.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Submit method

In [20]:
# System prompt sent with every request. The continuation lines keep their
# leading eight spaces so the text handed to the model is unchanged.
instructions = "\n".join((
    "Answer the user's question <Q> given a Statement of Work <C> and additional data <D>.",
    "        Use ONLY the information in <C> or <D> to answer <Q>. Do not include any additional text.",
    "        Do not include <Q> in your response.",
))

def get_prompt(question, context, other_info):
    """
    Build the two-message chat prompt for one question.

    Returns a list with a system message carrying the module-level
    `instructions` and a user message in which `question`, `context` and
    `other_info` are wrapped in <Q>, <C> and <D> tags respectively.
    All three arguments must be strings.
    """
    system_turn = {"role": "system", "content": instructions}
    tagged_parts = (
        "<Q>", question, "</Q>. ",
        "<C>", context, "</C>. ",
        "<D>", other_info, "</D>.",
    )
    user_turn = {"role": "user", "content": "".join(tagged_parts)}
    return [system_turn, user_turn]


def submit(question, context, other_info, max_new_tokens=1024):
    """
    Ask the model one question about the supplied documents and return its reply.

    Parameters:
        question: the user's question.
        context: text placed in the <C> section of the prompt.
        other_info: text placed in the <D> section of the prompt.
        max_new_tokens: upper bound on generated tokens, forwarded to the
            pipeline. Defaults to 1024.

    Returns:
        The `content` of the last message in the first output's
        `generated_text`.
    """
    messages = get_prompt(question, context, other_info)
    outputs = pipe(messages, max_new_tokens=max_new_tokens)
    # The last entry of generated_text is taken to be the newly generated
    # reply (assumes the chat-style output appends it after the prompt messages).
    reply = outputs[0]["generated_text"][-1]
    return reply["content"]

# Test

In [24]:
# Questions to ask about the loaded documents. Each one is submitted with the
# same context (docx_text) and additional data (pdf_text).
# Each literal originally opened with four quotes, which put a stray '"' at the
# start of every question sent to the model; that character is removed here.
questions = [
    "Can you summarize the statement of work for me in a few sentences?\n"
    "        Specifically I'd like to know what is being acquired. Which software from what company?",
    "Does the Statement of Work fall under NDAA 2019 definition of AI? Explain.",
    "Summarize the Statement of Work in 10 sentences.",
]

for number, question in enumerate(questions, start=1):
    response = submit(question, docx_text, pdf_text)
    print(f'RESPONSE {number}:')
    print(response)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RESPONSE 1:
The ADVAN program is acquiring software and services from the Department of State (DOS) to support the development and use of artificial intelligence (AI) and machine learning (ML) technologies across the Department of Homeland Security (DHS).


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


RESPONSE 2:
The Statement of Work falls under the NDAA 2019 definition of AI as it includes any artificial system that performs tasks under varying and unpredictable circumstances without significant human oversight, or that can learn from experience and improve performance when exposed to data sets, as defined in section 238(g) of the NDAA Fiscal Year 2019.
RESPONSE 3:
Here is a summary of the Statement of Work in 10 sentences:

The Department of Homeland Security (DHS) Office of the Chief Information Officer (OCIO) is launching the Artificial Intelligence/Machine Learning (AI/ML) Service Portal, also known as PAMS. The program aims to facilitate the development and use of AI, ML, and data analytics across DHS by providing a centralized hub of automated services. The program will provide custom AI, ML, and data analytic design and development, automated analytic services, and documentation, repositories, and tutorials. The ADVAN program will support the development of AI, ML, and data