In [None]:
import pandas as pd
from txtai.embeddings import Embeddings
import re
import unicodedata
import string

def remove_punctuation(text:str) -> str:
    """Strip every ASCII punctuation character (``string.punctuation``) from ``text``."""
    # Lazily keep only the characters that are not punctuation, then stitch
    # the survivors back together into a single string.
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return ''.join(kept_chars)

def remove_accented_chars(text: str) -> str:
    """Reduce ``text`` to plain ASCII.

    The text is decomposed with NFKD normalisation so that an accented letter
    becomes its base letter followed by combining marks; every code point that
    is not ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # 'ignore' silently discards anything that cannot be encoded as ASCII.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_numbers(text: str) -> str:
    """Remove digits (and any other disallowed character) from ``text``.

    Only ASCII letters, whitespace and the punctuation ``. , ! ? / : ; " '``
    are kept; every other character, digits included, is deleted.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all disallowed characters removed.
    """
    # The upper-case range must be ``A-Z``. The previous ``A-z`` also spanned
    # the ASCII characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # were unintentionally kept.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)

def remove_special_characters(text: str) -> str:
    """Remove special characters from ``text``.

    Only ASCII letters, digits, whitespace and the punctuation
    ``. , ! ? / : ; " '`` are kept; every other character is deleted.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all disallowed characters removed.
    """
    # The upper-case range must be ``A-Z``. The previous ``A-z`` also spanned
    # the ASCII characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # special characters were unintentionally kept.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)

## Combine Goals and Objectives (with titles) into one paragraph and compare them

In [None]:
# Load the combined goals/objectives table. Rows are ordered by goal_id, then
# the index is reset so goal_id becomes a regular column and rows are numbered
# 0..n-1.
go = (
    pd.read_csv("./data/processed/goals-objectives-combined.csv", index_col="goal_id")
    .sort_index()
    .reset_index()
)
go

In [None]:
# Build an embeddings index over the combined goal/objective text.
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})

# Each document is (id, text, None), where the id is the row's position in
# `go`, so a search hit can be mapped back to its row by position.
goal_documents = [
    (position, paragraph, None)
    for position, paragraph in enumerate(go.goal_objective.tolist())
]
embeddings.index(goal_documents)

In [None]:
# Self-match check: use the third field of every `go` record as a query and
# record the goal_name of the top-ranked indexed document.
# NOTE(review): the 4-way unpack assumes `go` has exactly three columns after
# reset_index (goal_id, then presumably goal_name and goal_objective) — confirm.
match_rows = []
for _, query_id, query, query_description in go.to_records():
    # Search once per row and reuse the hit for both the results table and the
    # printed line (the search was previously executed twice per row).
    top_hit_id = embeddings.search(query, 1)[0][0]
    matched_goal_name = go.iloc[top_hit_id]['goal_name']
    match_rows.append((query_id, query, matched_goal_name, query_description))
    print(f"{query_id} | {query} | {matched_goal_name}")

# Column names are passed to the constructor so an empty `go` still produces a
# well-formed (empty) frame instead of failing on a column-count mismatch.
# NOTE(review): "resul_goalname" looks like a typo for "result_goalname"; it is
# kept unchanged because consumers of the CSV may rely on the existing header.
results = pd.DataFrame(
    match_rows,
    columns=["query_id", "query", "resul_goalname", "query_description"],
)
results.to_csv("./results/goal-objectives-combined-results.csv", index=False)

In [None]:
# Load the survey export and keep the two columns located fifth- and
# fourth-from-last in the sheet (selected by position, renamed just below).
survey = pd.read_excel("./resources/JCAT Export NIPR (APR-15-2022).xlsx")
# Explicit copy: `capabilities` is modified below and must not alias `survey`.
capabilities = survey[survey.columns[-5:-3]].copy()
capabilities.columns = ["general_comments", "capability_description"]

# Clean the description text: first drop special characters, then reduce
# accented characters to ASCII. Blank Excel cells are read as NaN, and the
# cleaning helpers raise TypeError on non-string input, so
# `na_action="ignore"` leaves missing values untouched instead of crashing.
capabilities["capability_description"] = (
    capabilities["capability_description"]
    .map(remove_special_characters, na_action="ignore")
    .map(remove_accented_chars, na_action="ignore")
)
capabilities.head()

In [None]:
# For every survey capability description, look up the single closest
# goal/objective document in the embeddings index and record the match
# together with its score.
alignment_rows = []
for _, _, capability in capabilities.to_records():
    # Only text can be used as a query; skip missing (NaN) descriptions.
    if not isinstance(capability, str):
        continue
    result_id, score = embeddings.search(capability, 1)[0]
    # Document ids were assigned by row position when the index was built, so
    # the matching row of `go` is looked up by position.
    goal_name, goal_objective = go.iloc[result_id][['goal_name', 'goal_objective']].tolist()
    alignment_rows.append((result_id, score, capability, goal_name, goal_objective))

results = pd.DataFrame(
    alignment_rows,
    columns=["result_id", "score", "capability_query", "goal_name", "goal_objective_description"],
)
results.to_csv("./results/survey-alignment.csv", index=False)
# The preview is the last expression so the notebook actually renders it; a
# bare `results.head()` in the middle of a cell is evaluated and discarded.
results.head()