In [None]:
import subprocess
import sys
import os
import requests
import time

def clone_or_update_github_docs(target_dir, repo_url):
    """Bring a local checkout of ``repo_url`` up to date, cloning it if needed.

    When ``target_dir`` already exists, ``git pull`` is run inside it.
    Otherwise a shallow (``--depth=1``), single-branch clone of ``main`` is
    created at ``target_dir``.

    Args:
        target_dir: Path of the local checkout.
        repo_url: URL of the repository to clone.

    Returns:
        The captured stdout text of ``git pull`` after a successful pull,
        ``True`` after a successful clone, and ``False`` when git exits with
        a non-zero status or any other exception is raised.
    """
    already_cloned = os.path.exists(target_dir)

    if already_cloned:
        print(f"Repository already cloned in '{target_dir}'. Pulling latest changes...")
        git_command = ["git", "-C", target_dir, "pull"]
    else:
        print("Cloning GitHub docs repository...")
        git_command = [
            "git", "clone",
            "--depth=1",
            "--single-branch",
            "--branch", "main",
            repo_url,
            target_dir,
        ]

    try:
        completed = subprocess.run(
            git_command, check=True, capture_output=True, text=True
        )
        if already_cloned:
            return completed.stdout
        print(f"Repository cloned successfully to '{target_dir}' directory!")
        return True
    except subprocess.CalledProcessError as e:
        # check=True turns a non-zero git exit status into this exception.
        if already_cloned:
            print(f"Error pulling latest changes: {e}")
        else:
            print(f"Error cloning repository: {e}")
        print(f"Error details: {e.stderr}")
        return False
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False


def copy_md_files(src_dir, dest_dir, request_delay=2, request_timeout=30):
    """Fetch the article body for every Markdown file found under ``src_dir``.

    Walks ``src_dir`` and, for each ``.md`` file other than ``index.md`` and
    ``README.md``, requests the article body from the docs.github.com article
    API and writes the response text to the same relative path under
    ``dest_dir``. Files that already exist under ``dest_dir`` are skipped
    without issuing a request.

    Args:
        src_dir: Directory tree to scan for ``.md`` files.
        dest_dir: Directory that receives the fetched files; created if missing.
        request_delay: Seconds to sleep after every request (default 2).
        request_timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        int: Number of files fetched and written during this call.

    Raises:
        SystemExit: With status 1 as soon as the API answers HTTP 429.
    """
    os.makedirs(dest_dir, exist_ok=True)

    written = 0
    for root, _dirs, files in os.walk(src_dir):
        relative_path = os.path.relpath(root, src_dir)
        dest_subdir = os.path.join(dest_dir, relative_path)

        # The API pathname always uses "/" and must not contain the "."
        # that relpath yields for files sitting directly in src_dir.
        if relative_path == os.curdir:
            url_prefix = ""
        else:
            url_prefix = relative_path.replace(os.sep, "/") + "/"

        for file in files:
            if not file.endswith('.md') or file in ('index.md', 'README.md'):
                continue

            full_path = url_prefix + file
            dest_file = os.path.join(dest_subdir, file)

            if os.path.exists(dest_file):
                print(f"File already exists: {file} in {dest_subdir}, skipping fetch.")
                continue

            os.makedirs(dest_subdir, exist_ok=True)

            # Strip only the trailing extension; str.replace would also eat
            # any ".md" occurring earlier in the path.
            api = "https://docs.github.com/api/article/body?pathname=/en/" + full_path[:-len('.md')]

            result = requests.get(api, timeout=request_timeout)
            time.sleep(request_delay)  # pace requests to stay under the rate limit
            if result.status_code == 429:
                print("Rate limit exceeded. Please try again later.")
                sys.exit(1)
            if result.status_code == 200:
                with open(dest_file, 'w', encoding='utf-8') as f:
                    f.write(result.text)
                written += 1
                print(f"Fetched and wrote: {full_path}")
            else:
                print(f"Skipping {full_path}: unexpected HTTP status {result.status_code}")

    return written

def git_helper():
    """Clone or update the GitHub docs checkout, then fetch its articles.

    Clones/pulls ``https://github.com/github/docs.git`` into ``gh_docs`` and
    passes the checkout's ``content`` directory to ``copy_md_files``, which
    writes the fetched article bodies under ``data``.
    """
    repo_url = "https://github.com/github/docs.git"
    target_dir = "gh_docs"
    content_dir = f'{target_dir}/content'

    result = clone_or_update_github_docs(target_dir, repo_url)

    # Failure is signalled by the literal False; a successful pull returns
    # git's stdout, which may be an empty string, so compare by identity.
    # A failed pull on an existing checkout still has content to walk, so
    # only stop when there is nothing on disk to process.
    if result is False and not os.path.isdir(content_dir):
        print(f"No docs content available in '{content_dir}'; skipping fetch.")
        return

    copy_md_files(content_dir, 'data')
    
# Run the clone/update + fetch pipeline only when this code is executed as
# the main module (not when it is imported).
if __name__ == "__main__":
    git_helper()

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
import os

class document_loader:
    """Loads Markdown files from disk and splits them into chunks."""

    def load_documents(self, DATA_PATH="data"):
        """Load every ``.md`` file found anywhere under ``DATA_PATH``.

        Each file is read through ``TextLoader`` with UTF-8 encoding. A file
        that fails to load is reported on stdout and skipped.

        Args:
            DATA_PATH: Root directory to walk (default ``"data"``).

        Returns:
            list: Everything returned by the loaders, in walk order.
        """
        collected = []

        for folder, _subfolders, filenames in os.walk(DATA_PATH):
            for file in filenames:
                if not file.endswith(".md"):
                    continue
                try:
                    markdown_path = os.path.join(folder, file)
                    collected.extend(TextLoader(markdown_path, encoding='utf-8').load())
                except Exception as e:
                    print(f"Error loading {file}: {str(e)}")

        return collected

    def split_documents(self, documents):
        """Split ``documents`` with a ``RecursiveCharacterTextSplitter``.

        The splitter is configured with ``chunk_size=5000`` and
        ``chunk_overlap=100``, both measured with ``len``, and with literal
        (non-regex) separators.

        Args:
            documents: Documents to split, e.g. the output of ``load_documents``.

        Returns:
            Whatever the splitter's ``split_documents`` returns for the input.
        """
        splitter_options = {
            "chunk_size": 5000,
            "chunk_overlap": 100,
            "length_function": len,
            "is_separator_regex": False,
        }
        chunker = RecursiveCharacterTextSplitter(**splitter_options)
        return chunker.split_documents(documents)
    
# Load every Markdown file under the default "data" directory, then split the
# loaded documents into chunks; `sliced_documents` holds the split result.
doc_loader = document_loader()
documents = doc_loader.load_documents()
sliced_documents = doc_loader.split_documents(documents) 



In [None]:
# Import the class under an alias: binding the instance to the same name as
# the class would make this cell fail when re-run, because the second run
# would call the instance instead of the class.
from src.llama import llama as LlamaClient

OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://ollama:11434")
EMBEDDING_MODEL = "granite-embedding:30m" # Find available models here https://ollama.com/library

# `llama` stays bound to the client instance, as it was after the original cell ran.
llama = LlamaClient(OLLAMA_HOST, embedding_model=EMBEDDING_MODEL)
for doc in sliced_documents:
    # Embed the chunk text itself. Formatting the Document with f"{doc}"
    # embeds its string form (which in LangChain also carries the metadata)
    # instead of just the content.
    embedding = llama.create_embedding(doc.page_content)

    if embedding:
        print("Embedding created successfully.")
        print(embedding[0:3])  # Print first 3 dimensions of the embedding
    else:
        print("Failed to create embedding.")

Model 'granite-embedding:30m' is already available in Ollama.
Embedding created successfully.
[-2.2131993770599365, 1.099496841430664, 1.0434622764587402, -2.072650909423828, -3.4092788696289062]
Embedding created successfully.
[-2.377617120742798, 0.9176928400993347, 2.2164714336395264, -0.3705403804779053, -1.9137279987335205]
Embedding created successfully.
[-2.4799137115478516, -0.21864092350006104, 2.0329155921936035, 0.5165010690689087, -1.6858561038970947]
Embedding created successfully.
[-2.447915554046631, 0.41641250252723694, 1.9665565490722656, -0.6106234788894653, -0.9111484289169312]
Embedding created successfully.
[-1.8158597946166992, 0.16138213872909546, -0.8361316919326782, 1.0978569984436035, -1.3828716278076172]
Embedding created successfully.
[-1.598333716392517, -0.859251856803894, 0.2641162574291229, -0.3486754894256592, -1.7314200401306152]
Embedding created successfully.
[-1.0819883346557617, -0.8011630773544312, 0.058554887771606445, 0.11944654583930969, -0.952

KeyboardInterrupt: 