In [1]:
import os
import glob
from datetime import datetime
import dotenv

import langchain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# load secrets from .env into environment variables
dotenv.load_dotenv()

langchain.__version__


'0.0.154'

In [2]:
# models_obj = openai.Model.list()
# sorted([m['id'] for m in models_obj['data']])

# add pinecone
# do the csv thing

In [3]:
# # doesn't work, maybe see https://stackoverflow.com/questions/66990912/import-error-cannot-import-name-open-filename-from-pdfminer-utils
# hftcdir = '/Users/drucev/notebooks/llama2/llama_index/examples/vector_indices/hftc'
# documents = DirectoryLoader(hftcdir).load()


In [5]:
# Directory holding one plain-text file per saved email.
hftcdir = '/Users/drucev/projects/question_answering_over_docs/hftc'

# Sort the matches so the document order (and therefore everything built from
# it downstream) is the same on every run; glob order is otherwise arbitrary.
paths = sorted(glob.glob('%s/*' % hftcdir))

# Start from an empty list rather than None so that an empty or missing
# directory yields zero documents instead of a TypeError at len(documents).
documents = []
count = 0
for f in paths:
    count += 1
    # TextLoader(f).load() returns a list of documents for this file;
    # extend() flattens them into a single list.
    documents.extend(TextLoader(f).load())

# Number of files visited, followed by the number of documents loaded.
print(count)
len(documents)

21622


21622

In [6]:
# embedding_model = 'text-embedding-ada-002'   # default

# Stage 1: cut the loaded documents into chunks (chunk_size=1000, no overlap).
print(1, 'splitting', datetime.now())
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

# Stage 2: create the embeddings client used to vectorize the chunks.
print(2, 'getting embeddings', datetime.now())
embeddings = OpenAIEmbeddings()

# Stage 3: build the vector store that serves as the index and persist it
# under `persist_directory`. This is the slow step (minutes in the recorded run).
# NOTE(review): re-running this cell against an existing 'chromadb' directory
# presumably adds the same chunks again -- confirm before re-running.
print(3, 'creating index', datetime.now())
persist_directory = 'chromadb'
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)
vectordb.persist()

# Stage 4: wrap the index in a retriever doing similarity search with k=2.
print(4, 'instantiate retriever', datetime.now())
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)


1 splitting 2023-05-05 10:59:23.775477
2 getting embeddings 2023-05-05 10:59:24.340225


Using embedded DuckDB with persistence: data will be stored in: chromadb


3 creating index 2023-05-05 10:59:24.783059


FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

4 instantiate retriever 2023-05-05 11:03:39.111844


In [7]:
# Reopen the index stored in the local 'chromadb' directory instead of
# rebuilding it, then expose it through the same k=2 similarity retriever.
persist_directory = 'chromadb'
embeddings = OpenAIEmbeddings()
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)
retriever = vectordb.as_retriever(
    search_kwargs={"k": 2},
    search_type="similarity",
)


Using embedded DuckDB with persistence: data will be stored in: chromadb


FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

In [8]:
# Build the question-answering chain: a chat model plus the retriever,
# combined with the "stuff" chain type. Source documents are returned
# alongside each answer so they can be inspected afterwards.
qa_model = 'gpt-3.5-turbo-0301'
llm = ChatOpenAI(temperature=0.3, model_name=qa_model)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)


In [9]:
# Ask the chain about endpoint protection / EDR products and print the answer text.
query = "what are the names of some endpoint protection or endpoint detection and response or EDR products?"
result = qa(dict(query=query))
print(result['result'])


One endpoint protection product mentioned in the context is Crowdstrike. However, there are many other endpoint protection and EDR products available in the market, such as Carbon Black, SentinelOne, McAfee Endpoint Security, Symantec Endpoint Protection, Trend Micro Apex One, and many more.


In [10]:
# Ask the chain which Managed Service Providers are mentioned and print the answer text.
query = "what are the names of mentioned Managed Service Providers or MSPs?"
result = qa(dict(query=query))
print(result['result'])


The mentioned Managed Service Providers or MSPs are AlphaServe, CDI, and Agio.


In [11]:
# Ask the chain which MDRs / MSSPs are mentioned; the answer string is the
# cell's last expression, so the notebook displays it directly.
query = "what are the names of mentioned MDRs or MSSPs?"
result = qa(dict(query=query))
result['result']


'The context does not mention any specific MDRs or MSSPs by name. It provides criteria for selecting an MDR or MSSP, but it does not recommend any particular company.'

In [None]:
# result['source_documents'][0].page_content

In [None]:
# result['source_documents'][0].metadata

In [None]:
# !cat /Users/drucev/notebooks/llama2/llama_index/examples/vector_indices/hftc/20200805-iYejQDYtPG6DKsu3Qouy.txt
# result['source_documents'][1].metadata

## Grab emails from Gmail

In [None]:
from simplegmail import Gmail
from simplegmail.query import construct_query
from datetime import datetime

# Timestamp before the fetch; paired with the one printed at the end of the cell.
print(datetime.now())
gmail = Gmail()

# Labels on the account. Only the commented-out label-object lookup below
# reads this list; the query-based fetch does not.
labels = gmail.list_labels()

# Alternative approach: find the label object by name and fetch unread inbox
# messages carrying it.
# work_label = list(filter(lambda x: x.name == 'hedgefundtech', labels))[0]
# messages = gmail.get_unread_inbox(labels=[work_label])

# Query-based fetch: the only filter is the "hedgefundtech" label.
query_params = dict(labels=[["hedgefundtech"]])

messages = gmail.get_messages(
    query=construct_query(query_params),
)
print(len(messages))
print(datetime.now())

## Save emails

In [None]:
from datetime import datetime
import os
import random
import string
import re

# Write one text file per fetched message into `outdir`:
#   Date: <send date>
#   Subject: <subject with its leading characters stripped>
#   <blank line>
#   <first MAX_BODY_CHARS characters of the plain-text body>
# The sender is deliberately neither read nor saved.
outdir = 'hftc/'

# Number of leading subject characters to drop.
# NOTE(review): 16 == len('[hedgefundtech] '), so this presumably strips a
# list-tag prefix -- confirm; subjects without that exact prefix lose 16 characters.
SUBJECT_PREFIX_LEN = 16
# Only the beginning of each body is kept.
MAX_BODY_CHARS = 1000
# Alphabet and length of the random filename suffix.
FILENAME_ALPHABET = string.ascii_uppercase + string.ascii_lowercase + string.digits
FILENAME_SUFFIX_LEN = 20

# Create the output directory up front; otherwise every open() below fails
# when it is missing.
os.makedirs(outdir, exist_ok=True)

for message in messages:

    # Pre-set so the error report below can reference them even if the
    # failure happens before they are assigned.
    subject, senddate, date_object = None, None, None
    try:
        subject = message.subject[SUBJECT_PREFIX_LEN:] if message.subject else ''
        senddate = message.date
        date_object = datetime.strptime(senddate, '%Y-%m-%d %H:%M:%S%z')

        sendbody = message.plain[:MAX_BODY_CHARS] if message.plain else ''  # or message.html

        # Filename is <YYYYMMDD>-<random suffix>.txt. Year-month-day order
        # (previously year-day-month) makes names sort chronologically.
        rnd = ''.join(random.choices(FILENAME_ALPHABET, k=FILENAME_SUFFIX_LEN))
        filename = date_object.strftime('%Y%m%d') + '-' + rnd + '.txt'

        with open(os.path.join(outdir, filename), 'w') as outfile:
            outfile.write('Date: %s\n' % senddate)
            outfile.write('Subject: %s\n\n' % subject)
            outfile.write(sendbody)

    except Exception as e:
        # Best effort: skip a message that cannot be parsed or written, but
        # report which one it was rather than printing only the bare exception.
        print('skipped message (date=%r, subject=%r): %s' % (senddate, subject, e))
    