Heavily inspired by the DeepLearning.AI lesson on document loading:
https://learn.deeplearning.ai/langchain-chat-with-your-data/lesson/2/document-loading
I want to:
1. Get the contents of a git repo with lesson material (published via GitHub Pages).
2. Load the content files (for now only the markdown files) and build a vector store from them.
3. Be able to chat with the material and ask questions like: 'Tell me about the concept X ...', or 'Ask a question about ...'.

In [4]:
# These should already be handled by the requirements.txt
#!pip install python-dotenv
#!pip install pypdf 
#!pip install markdown
#!pip install openai
#!pip install GitPython
#!pip install bs4
#!pip list

In [1]:
import os
import sys

import openai

# Extend the module search path with the directory two levels up.
sys.path.append('../..')

from dotenv import find_dotenv, load_dotenv

# Load variables from the local .env file; the return value is discarded.
_ = load_dotenv(find_dotenv())

# Hand the key from the environment to the OpenAI client
# (this is None when OPENAI_API_KEY is not set).
openai.api_key = os.getenv('OPENAI_API_KEY')

In [2]:
import langchain

# Switch on LangChain's module-level debug and verbose flags for this session.
langchain.debug = True
langchain.verbose = True

In [3]:
from langchain.document_loaders import PyPDFLoader
# Load the sample PDF from the local `samples/` folder.
pdfLoader = PyPDFLoader("samples/tutorial_Class.pdf")
# `pages` is indexable; pages[0] is a Document carrying 'source' and 'page' metadata.
pages = pdfLoader.load()


In [9]:
# Alternative source: load a single markdown file straight from a URL.
from langchain.document_loaders import WebBaseLoader

# Point at the raw-content URL: the github.com/.../blob/... address serves
# GitHub's HTML page around the file rather than the markdown text itself.
webLoader = WebBaseLoader("https://raw.githubusercontent.com/basecamp/handbook/master/37signals-is-you.md")


In [8]:
# Show the first element of `pages` (loaded from the sample PDF) as the cell output.
pages[0]

Document(page_content='Stéphane Ducasse, Lukas Renggli, David C. Shaffer, Rick Zacconewith Michael DaviesDynamic WebDevelopmentwith', metadata={'source': 'samples/tutorial_Class.pdf', 'page': 0})

In [6]:
# Git Clone
import os
import subprocess

repo_url = "https://github.com/stasemsoft/softwarematerial.git"
local_dir = ".git_softwarematerial"

# `git clone` fails when the destination already exists and is not empty, so
# only clone when the checkout is missing. This keeps the cell re-runnable
# (Restart Kernel -> Run All) instead of raising CalledProcessError.
if not os.path.isdir(local_dir):
    subprocess.check_call(["git", "clone", repo_url, local_dir])


Cloning into '.git_softwarematerial'...


0

In [8]:
import pathlib

# Collect every *.md file below the cloned repository, at any depth.
repo_path = pathlib.Path(local_dir)
markdown_files = [*repo_path.glob("**/*.md")]

# Preview the first four matches.
markdown_files[:4]


[PosixPath('.git_softwarematerial/README.md'),
 PosixPath('.git_softwarematerial/objects.md'),
 PosixPath('.git_softwarematerial/docs/legacy/Explanation-Array-Lists.md'),
 PosixPath('.git_softwarematerial/docs/legacy/readme.md.md')]

In [9]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
import numpy as np

# Heading markers to split on, paired with the metadata label for each level:
# ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3").
headers_to_split_on = [("#" * level, f"Header {level}") for level in (1, 2, 3)]

# One embeddings client shared by all text in this notebook.
embedding = OpenAIEmbeddings()

# Splitter that cuts markdown text at the heading levels listed above.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)


def SplitMarkdownText(markdown_text, embeddings):
    """Split markdown text on its headers and embed every resulting chunk.

    Uses the module-level `markdown_splitter` to split the text and the
    module-level `embedding` client to embed each chunk's `page_content`.

    Args:
        markdown_text: The markdown source to split.
        embeddings: A list that is extended in place with one embedding
            vector per chunk.

    Returns:
        The list of chunks produced by `markdown_splitter.split_text`.
    """
    md_header_splits = markdown_splitter.split_text(markdown_text)
    print(f"1.{type(md_header_splits)}")
    for item in md_header_splits:
        print(f"2.{type(item)}")
        # Append to the caller's list (not to the embeddings client) and embed
        # the chunk's text rather than the chunk object itself.
        embeddings.append(embedding.embed_query(item.page_content))
    # Compare the first collected vector with the most recent one. A dot
    # product needs two vectors; the earlier `embeddings[:0]` slice was always
    # empty and made np.dot fail on any non-empty list.
    if len(embeddings) >= 2:
        print(np.dot(embeddings[0], embeddings[-1]))
    return md_header_splits


In [None]:
# Embed the first header chunk of each of the first three markdown files and
# print how similar each new vector is to the first one collected.
embeddings = []

for markdown_file in markdown_files[0:3]:
    # Explicit encoding so reading does not depend on the platform default.
    with open(markdown_file, "r", encoding="utf-8") as f:
        print("--file--------------------")
        content = f.read()
    # chunk = SplitMarkdownText(content,embeddings) # inlined here:
    # ---
    md_header_splits = markdown_splitter.split_text(content)
    if not md_header_splits:
        # No chunks for this file; indexing [0] would raise IndexError.
        continue
    aDoc = md_header_splits[0]
    embeddings.append(embedding.embed_query(aDoc.page_content))
    # A dot product needs two vectors. The earlier `embeddings[:0]` slice was
    # always empty, so np.dot failed as soon as the list was non-empty.
    if len(embeddings) > 1:
        print(np.dot(embeddings[0], embeddings[-1]))
    # ---
        # --- 


In [None]:
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings

# Minimal end-to-end check: wrap a string in a Document and embed its text.
embs = OpenAIEmbeddings()
text = "sample Q"
doc = Document(page_content=text)
# embed_query expects the text itself, so pass the Document's page_content
# rather than the Document object.
query_result = embs.embed_query(doc.page_content)

# Number of dimensions in the returned embedding vector.
print(len(query_result))

# Seems that the principle works here, but I don't have a paid version, so my quota does not allow me to *really* try it.