# GC JADC2 POC

- Manually pull the **Description** of each **Goal** and **Objective** in [Section 2.2](./resources/DOD-DIGITAL-MODERNIZATION-STRATEGY-2019.PDF) into a [spreadsheet](./data/goals-and-objectives.csv)
- Use [`txtai`](https://github.com/neuml/txtai) for this prototype
- Use the answers to **Question R** as "queries" and inspect the results

## Goals & Objectives Format
Each DoD CIO goal is presented with the following component parts: 

- Goal:
  - A **Description** of the goal, including what it encompasses
  - The **Mission Impact** on the Department resulting from achievement of the objectives for that goal 
- Objective:
  - A **Description** that provides a rationale for the work and describes what the objective will accomplish
  - Each objective is further decomposed into the **Strategy Elements** that describe the specific, focused initiatives needed to accomplish that particular objective 


In [None]:
import pandas as pd
import re
import string
import json
from IPython.display import display, HTML

# When True, the parsing cell guarded by `if RESET:` re-reads the raw text
# export and rewrites the JSON/CSV files under ./data.
RESET = False

# Raise pandas' display limits so wide frames and long text cells are shown
# with far less truncation in notebook output.
for option_name, option_value in (
    ('display.max_columns', 999),
    ('display.max_colwidth', 999),
    ('display.expand_frame_repr', True),
):
    pd.set_option(option_name, option_value)

def pretty_print(df:pd.DataFrame):
    """Show *df* as an HTML table in the notebook output.

    Every literal backslash-n sequence in the generated markup is swapped for
    a ``<br>`` tag before the table is handed to IPython for display.
    """
    table_markup = df.to_html()
    table_markup = table_markup.replace("\\n", "<br>")
    rendered = HTML(table_markup)
    display(rendered)

def print_data_summary(df: pd.DataFrame, info: bool = False, num_rows: int = 5, seed: int = 0) -> None:
    """Print the shape of *df* and display a reproducible random sample of its rows.

    Args:
        df: Frame to summarise.
        info: When True, also print the ``DataFrame.info()`` report.
        num_rows: Maximum number of rows to sample; capped at the number of
            rows in *df* so that small frames do not raise.
        seed: ``random_state`` passed to ``DataFrame.sample`` so the same rows
            are shown on every run.
    """
    num_samples, dim = df.shape
    print(f"{num_samples = }, {dim = }")
    if info:
        # info() prints its report itself and returns None; wrapping it in
        # display() only added a stray "None" to the output.
        df.info()
    # sample() without replacement raises ValueError when asked for more rows
    # than the frame holds, so never request more than are available.
    sample_size = min(num_rows, num_samples)
    pretty_print(df.sample(sample_size, random_state=seed))

def clean_text(text:str) -> str:
    """Strip URLs, ASCII punctuation and line breaks from *text*.

    Steps run in this order:

    1. Remove ``http://`` / ``https://`` / ``www.`` links (up to the next
       whitespace character).
    2. Remove every character listed in ``string.punctuation``.
    3. Remove line-feed and carriage-return characters.

    Nothing is inserted in place of the removed text, so words that were
    separated only by a line break or punctuation mark end up joined.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw string: "\S" and "\." must reach the regex engine untouched; in a
    # plain literal they are invalid string escapes (SyntaxWarning on 3.12+).
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    # Drop "\r" as well as "\n" so Windows-style line endings leave no residue.
    text = re.sub(r"[\r\n]", "", text)
    return text

In [None]:
def _to_ascii(text: str) -> str:
    """Drop every non-ASCII character from *text* and trim surrounding whitespace."""
    return text.encode("ascii", errors="ignore").strip().decode("ascii")


def _parse_objectives(goal_body: str) -> list:
    """Split one goal's flattened text into ``{"name", "description"}`` dicts.

    An objective's description is the text between the end of its own
    "Objective N: ..." match and the start of the next one (or the end of the
    goal text for the last objective). Returns an empty list when the goal
    text contains no objective headings.
    """
    headings = re.findall(r"Objective \d+: [\w|( +)|-]+", goal_body, flags=re.DOTALL)
    parsed = []
    for position, heading in enumerate(headings):
        # The extra 2 skips the two characters right after the matched heading
        # (presumably a separator such as ". " -- verify against the text file).
        begin = goal_body.index(heading) + len(heading) + 2
        is_last = position + 1 == len(headings)
        end = None if is_last else goal_body.index(headings[position + 1])
        parsed.append({"name": heading, "description": _to_ascii(goal_body[begin:end])})
    return parsed


def _parse_goal(heading: str, goal_text: str) -> dict:
    """Build the ``{"name", "description", "objectives"}`` record for one goal.

    Args:
        heading: The matched "Goal N: <name>" heading.
        goal_text: Raw text between this heading and the next goal heading.

    Raises:
        ValueError: If no "Description: ... Mission Impact:" span is found.
    """
    # Collapse the section onto one line so the patterns can span what were
    # separate lines in the source text.
    body = re.sub(r"\n+", "", goal_text).strip()
    found = re.search(r"Description: (.*) Mission Impact:", body)
    if found is None:
        raise ValueError(f"No 'Description: ... Mission Impact:' section found for {heading!r}")
    return {
        "name": re.findall(r"Goal \d: ([\w ]+)", heading)[0],
        "description": _to_ascii(clean_text(found.group(1))),
        "objectives": _parse_objectives(body),
    }


# Rebuild the goals/objectives JSON and the flat CSV from the raw text export.
if RESET:
    with open("./data/goals-and-objectives.txt","r") as f:
        raw_text = f.read()

    goals = [g.strip() for g in re.findall(r"Goal \d: [\w ]+", raw_text)]
    print(goals)
    if not goals:
        raise ValueError("No 'Goal N: ...' headings found in ./data/goals-and-objectives.txt")

    # Each goal's section runs from the end of its heading to the start of the
    # next goal's heading; the last section runs to the end of the text.
    goal_starts = [raw_text.index(goal) for goal in goals]
    goal_ends = goal_starts[1:] + [len(raw_text)]
    final_list = [
        _parse_goal(goal, raw_text[begin + len(goal):end])
        for goal, begin, end in zip(goals, goal_starts, goal_ends)
    ]
    # Keyed by goal position; json.dump writes the integer keys as strings.
    final_dict = dict(enumerate(final_list))

    with open("./data/goals-and-objectives.json","w+") as f:
        json.dump(final_dict, f)

    # One row per (goal, objective) pair; "index" is the goal's position.
    data = pd.DataFrame(
        [
            (goal_idx, goal["name"], goal["description"], objective["name"], objective["description"])
            for goal_idx, goal in enumerate(final_list)
            for objective in goal["objectives"]
        ],
        columns=['index', 'goal_name', 'goal_description', 'objective_name', 'objective_description'],
    )
    data.to_csv("./data/dms-go-2019.csv", index=False)

    # Keep the notebook namespace free of parsing temporaries.
    del goal_starts, goal_ends

## Embeddings

### Load Goals and Objectives Descriptions for DoD

In [None]:
# Read the goal/objective table; its first column ("index") becomes the frame index.
strategy_csv_path = "./data/dms-go-2019.csv"
strategy_data = pd.read_csv(strategy_csv_path, index_col=0)

# Show the shape and a single sampled row as a quick sanity check.
print_data_summary(df=strategy_data, num_rows=1)

### Load Survey Capability Descriptions

In [None]:
# Load the survey export workbook.
survey_path = "./resources/JCAT Export NIPR (APR-15-2022).xlsx"
survey = pd.read_excel(survey_path)

# Keep only the fourth-from-last column and give it a descriptive name.
# NOTE(review): the column is picked by position, so a change in the export
# layout would silently select a different column -- confirm against the workbook.
capability_column = survey.columns[-4]
capabilities = pd.DataFrame(survey[capability_column])
capabilities.columns = ["capability_description"]

# Show the shape and a single sampled row as a quick sanity check.
print_data_summary(df=capabilities, num_rows=1)

In [None]:
from txtai.embeddings import Embeddings

# Build the txtai embeddings instance on top of the
# sentence-transformers "nli-mpnet-base-v2" model.
embeddings_config = {"path": "sentence-transformers/nli-mpnet-base-v2"}
embeddings = Embeddings(embeddings_config)

In [None]:
# Index every goal description, using its row position as the document id and
# leaving the third slot of each tuple as None.
# NOTE(review): the CSV written by the RESET cell holds one row per objective,
# so the same goal description can appear several times here -- confirm that
# indexing the duplicates is intended.
goal_descriptions = strategy_data.goal_description.tolist()
documents = [(uid, text, None) for uid, text in enumerate(goal_descriptions)]
embeddings.index(documents)