Heavily inspired by the lesson 
https://learn.deeplearning.ai/langchain-chat-with-your-data/lesson/2/document-loading
I want to:
1. Get the contents of a git repo with lesson material (in github-pages).
2. Add the files with content (for now only the markdown files) and create a vector store.
3. Be able to chat with the material, ask questions like: 'Tell me about the concept blablathisnthat ...', or 'Ask a question about ...'. 

In [None]:
# These should already be handled by the requirements.txt
#!pip install python-dotenv
#!pip install pypdf 
#!pip install markdown
#!pip install openai
#!pip install GitPython
#!pip install bs4
#!pip list

In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [2]:
from langchain.document_loaders import PyPDFLoader
#loader = PyPDFLoader("samples/tutorial_Class.pdf")
loader = PyPDFLoader("samples/tutorial_Class.pdf")
pages = loader.load()

In [3]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://github.com/basecamp/handbook/blob/master/37signals-is-you.md")

In [4]:
pages[0]

In [6]:
# Git Clone 
import subprocess

repo_url = "https://github.com/stasemsoft/softwarematerial.git"
local_dir = ".git_softwaremat"

subprocess.check_call(["git", "clone", repo_url, local_dir])


Cloning into '.git_softwaremat'...


0

In [10]:
import pathlib

repo_path = pathlib.Path(local_dir)
markdown_files = list(repo_path.glob("**/*.md"))
markdown_files[0:4]


[PosixPath('.git_softwaremat/README.md'),
 PosixPath('.git_softwaremat/objects.md'),
 PosixPath('.git_softwaremat/docs/legacy/Explanation-Array-Lists.md'),
 PosixPath('.git_softwaremat/docs/legacy/readme.md.md')]

In [11]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [16]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

def SplitMarkdownText(markdown_text):
    md_header_splits = markdown_splitter.split_text(markdown_text)
    return md_header_splits


In [18]:
for markdown_file in markdown_files[0:3]:
    with open(markdown_file, "r") as f:
        print("--------------------")
        content = f.read()
        w = SplitMarkdownText(content)
        print(w)
        print("--+-----+------+----")


--------------------
[Document(page_content='Dit is een verzameling (open) software-materiaal van het startsemester FHICT (Fontys Hogeschool voor ICT). Het is openbaar toegankelijk voor iedereen en vrij van copyright. Het is in de loop der jaren gemaakt en verzameld door docenten, met hier en daar toevoegingen door studenten! FHICT verwijst naar dit materiaal vanuit een canvas-course, daar staat de FHICT-specifieke info, terwijl deze github-pages redelijk FHICT-onafhankelijk zijn.  \n---', metadata={'Header 1': 'Softwarematerial - Overzicht', 'Header 2': 'Inleiding'}), Document(page_content="In het theorie-materiaal onderscheiden we:\n+ `Tutorial`: practical steps, learning oriented.\n+ `How-to guide`: practical steps, problem oriented.\n+ `Discussion`: theoretical, understanding oriented.\n+ `Reference`: theoretical, information oriented. Accurate and complete.  \nDe opdrachten zijn te verdelen in:\n+ `Training` (voorheen exercises): 'vingeroefeningen', horen bij een stukje theorie om