In [1]:
!pip install -q -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# import required libraries and packages
import os
import glob
import pickle

import re
from pprint import pprint

import pandas as pd

from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA

from transformers import pipeline

In [3]:
# change working directory to root (the parent of the directory the kernel started in)
# A bare os.chdir('../') is relative to the *current* directory, so re-running the
# cell would climb one level higher every time. Remember the starting directory on
# the first run and always move to its parent, which makes the cell idempotent.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))
os.getcwd()

'/opt/app-root/src/ask_project_nexodus_docs'

## Create Validation Dataset

In [4]:
# Build the validation set: scan the markdown file line by line and collect the
# "**Q**" lines as questions and the "**A**" lines as reference answers.
with open('data/Network-Training-QAs.md', 'r', encoding='utf-8') as f:
    content = f.read()

content = content.split('\n')

# Matches the optional list number plus the bold question marker, e.g. "12. **Q**: ".
# Only this prefix is removed; digits that belong to the question text itself
# (as in "IPv4" or "port 22") are kept.
question_prefix = re.compile(r'(?:\d+\.\s*)?\*\*Q\*\*:\s*')

questions = []
answers = []

for line in content:
    if '**Q**' in line:
        # drop the question number and the **Q** marker
        questions.append(question_prefix.sub('', line, count=1).strip())
    elif '**A**' in line:
        # drop the **A** marker
        answers.append(line.replace('**A**: ', '').strip())

# zip() would silently truncate to the shorter list and misalign every following
# question/answer pair, so fail loudly if the file is not strictly paired.
if len(questions) != len(answers):
    raise ValueError(
        f"Found {len(questions)} questions but {len(answers)} answers in "
        "data/Network-Training-QAs.md; the file must contain one answer per question."
    )

nexodus_qa_df = pd.DataFrame(list(zip(questions, answers)), columns = ['questions', 'answers'])
display(nexodus_qa_df.head())

Unnamed: 0,questions,answers
0,What is the purpose of the `ifconfig` command ...,The `ifconfig` command is used to configure th...
1,How would you add a static IP to a Linux machine?,You can add a static IP to a Linux machine by ...
2,What is WireGuard?,WireGuard is an open-source VPN solution that ...
3,How would you install WireGuard on a Linux mac...,"Typically, you would use a package manager suc..."
4,How do you check the current IP address of you...,You can use the `ip addr show` command to disp...


## Generative QA

In [5]:
# Generative baseline: the model receives only the question (prefixed with
# 'question: '), with no retrieved context.
text2text_generator = pipeline("text2text-generation", model="declare-lab/flan-alpaca-large")

# One pipeline call per question; each call returns a list whose first item
# holds the generated string under 'generated_text'.
generated_answers = [
    text2text_generator('question: ' + query, min_length=5, max_length=50)[0]['generated_text']
    for query in nexodus_qa_df['questions']
]

nexodus_qa_df['generated_answers'] = generated_answers
display(nexodus_qa_df.head())

Unnamed: 0,questions,answers,generated_answers
0,What is the purpose of the `ifconfig` command ...,The `ifconfig` command is used to configure th...,The ifconfig command in Linux is used to confi...
1,How would you add a static IP to a Linux machine?,You can add a static IP to a Linux machine by ...,"To add a static IP to a Linux machine, you wil..."
2,What is WireGuard?,WireGuard is an open-source VPN solution that ...,WireGuard is a software application that prote...
3,How would you install WireGuard on a Linux mac...,"Typically, you would use a package manager suc...","To install WireGuard on a Linux machine, you w..."
4,How do you check the current IP address of you...,You can use the `ip addr show` command to disp...,"In Linux, you can check the current IP address..."


## Extractive QA

In [6]:
MD_DIR = 'data'

# Collect every markdown file under data/docs, at any depth.
# glob returns paths in arbitrary (filesystem) order, so sort them to keep the
# document order - and everything built from it - the same from run to run.
# The '*.md' pattern already restricts matches to markdown files, so no extra
# suffix check is needed.
md_files = sorted(glob.glob(os.path.join(MD_DIR, "docs/**/*.md"), recursive=True))

In [7]:
# Load each markdown file and keep the first Document returned by its loader.
docs = []
for md_path in md_files:
    docs.append(UnstructuredMarkdownLoader(md_path).load()[0])

# Split the documents on newlines into chunks with a target size of 1000
# characters and no overlap between neighbouring chunks.
splitter = CharacterTextSplitter(separator = "\n", chunk_size=1000, chunk_overlap=0)
doc_chunks = splitter.split_documents(docs)

# Report how many documents were loaded and how many chunks they produced.
print(f"Number documents: {len(docs)}")
print(f"Number chunks: {len(doc_chunks)}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Created a chunk of size 1971, which is longer than the specified 1000


Number documents: 41
Number chunks: 222


In [8]:
def prettify_chunks(doc_chunks):
    """Normalise chunk text and wrap each chunk in a new ``Document``.

    For every chunk the text is lower-cased and every character that is neither
    a word character nor whitespace is removed. The first line of the cleaned
    text becomes the ``section_title`` metadata entry; the remaining lines are
    joined with single spaces to form the new ``page_content``. The chunk's
    ``source`` metadata entry is copied over.

    NOTE(review): the first line is treated as the title for *every* chunk and
    is left out of ``page_content`` - confirm this is intended for chunks that
    do not start at a heading.

    Args:
        doc_chunks: iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (mapping containing a ``'source'`` key).

    Returns:
        list of ``Document`` objects, one per input chunk, in input order.
    """
    def _to_pretty_document(chunk):
        cleaned = re.sub(r'[^\w\s\n]', '', chunk.page_content.lower())
        title, *body_lines = cleaned.split('\n')
        return Document(
            page_content=' '.join(body_lines),
            metadata={'source': chunk.metadata['source'], 'section_title': title},
        )

    return [_to_pretty_document(chunk) for chunk in doc_chunks]

In [9]:
# Normalise every chunk with prettify_chunks; the result is what gets embedded below.
pretty_doc_chunks = prettify_chunks(doc_chunks)
# Show only the first five cleaned chunks as a quick visual check.
display(pretty_doc_chunks[:5])

[Document(page_content='this guide will walk you through getting your first devices connected via nexodus install and start the nexodus agent fedora centos stream supported versions fedora 38 x86_64 aarch64 centos stream 9 x86_64 aarch64 sh enable the copr repository and install the nexodus package sudo dnf copr enable russellbnexodus sudo dnf install nexodus start the nexodus service and set it to automatically start on boot sudo systemctl start nexodus sudo systemctl enable nexodus  edit etcsysconfignexodus if you plan to use a nexodus service other than httpstrynexodusio query the status of nexd and follow the instructions to register your device sh sudo nexctl nexd status brew for mac you can install the nexodus agent via homebrew sh brew tap nexodusionexodus brew install nexodus to start the nexd agent and also have it start automatically on boot run sh sudo brew services start nexodus', metadata={'source': 'data/docs/quickstart.md', 'section_title': 'quick start'}),
 Document(pag

In [10]:
# initialize embedder
# No model name is passed, so the default of the installed LangChain version is used.
embeddings = HuggingFaceEmbeddings()

# transform the cleaned doc chunks into embeddings and store them in a FAISS vector db
db = FAISS.from_documents(pretty_doc_chunks, embeddings)

# save db to local directory so it can be reloaded without re-embedding
db.save_local("models/nexodus_index.faiss")

In [11]:
# load db from local directory for inference
# Rebinds `db` to the index read back from the path written by save_local, using
# the same `embeddings` object that built it.
# NOTE(review): load_local presumably unpickles part of the saved index - only
# load index directories you created yourself; confirm against the LangChain docs.
db = FAISS.load_local("models/nexodus_index.faiss", embeddings)

In [13]:
# Extractive baseline: for every question take the single most similar chunk
# from the vector db and use its text verbatim as the answer.
best_matches = [
    db.similarity_search(query, k=1)[0]
    for query in nexodus_qa_df['questions']
]
extracted_answers = [match.page_content for match in best_matches]
# file each answer was taken from
source_titles = [match.metadata['source'] for match in best_matches]

# sanity check: both lists should have one entry per question
display(len(extracted_answers))
display(len(source_titles))

100

100

In [14]:
# Attach the retrieved chunk texts as a new column next to the reference answers.
nexodus_qa_df['extracted_answers'] = extracted_answers

# Preview the first rows to compare the columns side by side.
display(nexodus_qa_df.head())

Unnamed: 0,questions,answers,generated_answers,extracted_answers
0,What is the purpose of the `ifconfig` command ...,The `ifconfig` command is used to configure th...,The ifconfig command in Linux is used to confi...,ipprotocol ipv4 proto frompor...
1,How would you add a static IP to a Linux machine?,You can add a static IP to a Linux machine by ...,"To add a static IP to a Linux machine, you wil...",this guide will walk you through getting your ...
2,What is WireGuard?,WireGuard is an open-source VPN solution that ...,WireGuard is a software application that prote...,relay node the relay needs to have v6 forwardi...
3,How would you install WireGuard on a Linux mac...,"Typically, you would use a package manager suc...","To install WireGuard on a Linux machine, you w...",relay node the relay needs to have v6 forwardi...
4,How do you check the current IP address of you...,You can use the `ip addr show` command to disp...,"In Linux, you can check the current IP address...",ipprotocol ipv4 proto frompor...


## Abstractive QA

In [15]:
# Build one context string from the retrieved documents.
def provide_context(context):
    """Concatenate retrieved documents into a single context string.

    Each document's ``page_content`` is prefixed with ``"<P> "`` and the
    resulting pieces are joined with single spaces.

    Args:
        context: iterable of objects exposing a ``page_content`` attribute.

    Returns:
        str: the combined context; an empty string when ``context`` is empty.
    """
    return " ".join(f"<P> {doc.page_content}" for doc in context)

In [16]:
# Retrieve-then-read QA: fetch the three most similar chunks for each question,
# merge them into one context string and let a question-answering model pick the
# answer span from that context.
llm = "deepset/roberta-base-squad2"

# Build the pipeline once. Creating it inside the loop reloads the tokenizer and
# model for every single question, which is loop-invariant work.
# It gets its own name so it does not overwrite the text2text generation
# pipeline used for the generative answers.
qa_reader = pipeline(task='question-answering', tokenizer=llm, model=llm)

abstractive_answers = []

for question in nexodus_qa_df['questions']:
    top_3 = db.similarity_search(question, k=3)
    context = provide_context(top_3)
    # temperature / min_length / max_length are text-generation arguments and are
    # not passed here: they had no visible effect on this pipeline (it previously
    # returned answers as short as 'shell' with min_length=20).
    abstractive_answer = qa_reader(question=question, context=context)
    # each result is a dict with 'score', 'start', 'end' and 'answer'
    abstractive_answers.append(abstractive_answer)
abstractive_answers[:5]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[{'score': 0.00027977206627838314,
  'start': 579,
  'end': 629,
  'answer': 'all rules are applied only to the driver interface'},
 {'score': 7.717112748650834e-05,
  'start': 2466,
  'end': 2497,
  'answer': 'sudo ip link del wg0 osxwindows'},
 {'score': 0.11282513290643692,
  'start': 1339,
  'end': 1353,
  'answer': 'tunneling mode'},
 {'score': 1.7082244312405237e-06,
  'start': 660,
  'end': 665,
  'answer': 'shell'},
 {'score': 0.0008120706188492477,
  'start': 1226,
  'end': 1237,
  'answer': 'ip_protocol'}]

In [17]:
# Keep only the answer text from each pipeline result dict (score/start/end are dropped).
abstracted_answers = [x['answer'] for x in abstractive_answers]
# Attach the answers as a new column and preview the first rows.
nexodus_qa_df['abstracted_answers'] = abstracted_answers
display(nexodus_qa_df.head())

Unnamed: 0,questions,answers,generated_answers,extracted_answers,abstracted_answers
0,What is the purpose of the `ifconfig` command ...,The `ifconfig` command is used to configure th...,The ifconfig command in Linux is used to confi...,ipprotocol ipv4 proto frompor...,all rules are applied only to the driver inter...
1,How would you add a static IP to a Linux machine?,You can add a static IP to a Linux machine by ...,"To add a static IP to a Linux machine, you wil...",this guide will walk you through getting your ...,sudo ip link del wg0 osxwindows
2,What is WireGuard?,WireGuard is an open-source VPN solution that ...,WireGuard is a software application that prote...,relay node the relay needs to have v6 forwardi...,tunneling mode
3,How would you install WireGuard on a Linux mac...,"Typically, you would use a package manager suc...","To install WireGuard on a Linux machine, you w...",relay node the relay needs to have v6 forwardi...,shell
4,How do you check the current IP address of you...,You can use the `ip addr show` command to disp...,"In Linux, you can check the current IP address...",ipprotocol ipv4 proto frompor...,ip_protocol


In [18]:
# save df to csv file
# to_csv does not create missing parent directories, so make sure the results
# folder exists before writing (a no-op when it is already there).
os.makedirs('data/results', exist_ok=True)
nexodus_qa_df.to_csv('data/results/nexodus_qa_df.csv')