In [None]:
import subprocess
import sys
import os
import requests
import time

def _run_git(git_args, failure_label):
    """Run ``git`` with ``git_args`` and return the completed process.

    Failures are reported on stdout instead of being raised: a non-zero git
    exit status is printed with ``failure_label`` plus git's stderr, any other
    exception with a generic message. Returns ``None`` in both failure cases.
    """
    try:
        return subprocess.run(
            ["git", *git_args],
            check=True, capture_output=True, text=True
        )
    except subprocess.CalledProcessError as e:
        print(f"{failure_label}: {e}")
        print(f"Error details: {e.stderr}")
    except Exception as e:
        # e.g. the git executable is missing.
        print(f"An unexpected error occurred: {e}")
    return None


def clone_or_update_github_docs(target_dir, repo_url):
    """Clone ``repo_url`` into ``target_dir``, or pull when a checkout already exists.

    Args:
        target_dir: Directory that holds (or will hold) the checkout.
        repo_url: URL passed to ``git clone`` when no checkout exists yet.

    Returns:
        The captured stdout of ``git pull`` (a string, which may be empty)
        after a successful update, ``True`` after a successful shallow clone
        of the ``main`` branch, and ``False`` when git fails or any other
        exception occurs. Errors are printed, never raised.
    """
    # Treat target_dir as a checkout only when it carries its own ".git" entry.
    # Checking the directory alone is not enough: `git -C <dir> pull` in a
    # plain directory can resolve to an enclosing repository and pull that one.
    if os.path.exists(os.path.join(target_dir, ".git")):
        print(f"Repository already cloned in '{target_dir}'. Pulling latest changes...")
        pulled = _run_git(["-C", target_dir, "pull"], "Error pulling latest changes")
        return False if pulled is None else pulled.stdout

    print("Cloning GitHub docs repository...")
    cloned = _run_git(
        [
            "clone",
            "--depth=1",
            "--single-branch",
            "--branch", "main",
            repo_url,
            target_dir,
        ],
        "Error cloning repository",
    )
    if cloned is None:
        return False
    print(f"Repository cloned successfully to '{target_dir}' directory!")
    return True


def copy_md_files(src_dir, dest_dir):
    """Fetch the article body for every Markdown file under ``src_dir`` into ``dest_dir``.

    Walks ``src_dir`` and, for each ``.md`` file other than ``index.md`` and
    ``README.md``, requests
    ``https://docs.github.com/api/article/body?pathname=/en/<relative path without .md>``
    and writes the response text to the mirrored location under ``dest_dir``.
    Files that already exist in ``dest_dir`` are skipped without a request,
    so an interrupted run can be resumed.

    Args:
        src_dir: Root directory to scan for ``.md`` files.
        dest_dir: Root directory that receives the fetched files; created if
            missing.

    Returns:
        The number of articles fetched and written during this call.

    Raises:
        SystemExit: With status 1 when the API answers HTTP 429.
    """
    os.makedirs(dest_dir, exist_ok=True)

    suffix = '.md'
    fetched_count = 0

    for root, dirs, files in os.walk(src_dir):
        relative_dir = os.path.relpath(root, src_dir)
        dest_subdir = os.path.join(dest_dir, relative_dir)

        for file in files:
            if not file.endswith(suffix) or file in ('index.md', 'README.md'):
                continue

            os.makedirs(dest_subdir, exist_ok=True)

            dest_file = os.path.join(dest_subdir, file)
            if os.path.exists(dest_file):
                print(f"File already exists: {file} in {dest_subdir}, skipping fetch.")
                continue

            # normpath drops the "./" prefix relpath yields for files at the
            # top of src_dir; the separator swap keeps the URL valid on
            # platforms whose path separator is not "/".
            relative_file = os.path.normpath(os.path.join(relative_dir, file)).replace(os.sep, "/")
            # Strip only the trailing extension, never a ".md" inside a
            # directory or file name.
            article_path = relative_file[:-len(suffix)]
            api = "https://docs.github.com/api/article/body?pathname=/en/" + article_path

            # Without a timeout a stalled connection would block the crawl indefinitely.
            result = requests.get(api, timeout=30)
            time.sleep(2)  # fixed pause between consecutive API requests
            if result.status_code == 429:
                print("Rate limit exceeded. Please try again later.")
                sys.exit(1)
            if result.status_code == 200:
                fetched_count += 1
                with open(dest_file, 'w', encoding='utf-8') as f:
                    f.write(result.text)
                print(f"Fetched and wrote: {relative_file}")
            else:
                # Report instead of silently dropping the article.
                print(f"Skipping {relative_file}: unexpected HTTP status {result.status_code}")

    return fetched_count

def git_helper():
    """Refresh the local GitHub docs checkout, then fetch its articles into ``data``.

    Clones or pulls https://github.com/github/docs.git in ``gh_docs`` and
    passes the checkout's ``content`` directory to ``copy_md_files``. The
    outcome of the clone/pull step is not inspected.
    """
    docs_checkout = "gh_docs"
    clone_or_update_github_docs(docs_checkout, "https://github.com/github/docs.git")
    copy_md_files(docs_checkout + "/content", "data")
    
# Entry point: run the clone/pull + fetch pipeline when this code executes as
# the main module.
if __name__ == "__main__":
    git_helper()

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
import os

class document_loader:
    """Loads Markdown files from a directory tree and splits them into chunks."""

    def load_documents(self, DATA_PATH="data"):
        """Load every ``.md`` file under ``DATA_PATH`` with ``TextLoader``.

        Directories and files are visited in sorted order so the resulting
        list has the same order on every run; ``os.walk`` alone yields entries
        in whatever order the filesystem reports them.

        Args:
            DATA_PATH: Root directory to scan recursively.

        Returns:
            A list with everything the loaders returned, in traversal order.
            Files that fail to load are reported on stdout and skipped.
        """
        documents = []

        for root, dirs, files in os.walk(DATA_PATH):
            # Sorting in place also fixes the order in which os.walk descends.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".md"):
                    continue
                try:
                    file_path = os.path.join(root, file)
                    loader = TextLoader(file_path, encoding='utf-8')
                    documents.extend(loader.load())
                except Exception as e:
                    # Best effort: one unreadable file must not abort the whole load.
                    print(f"Error loading {file}: {str(e)}")

        return documents

    def split_documents(self, documents, chunk_size=2000, chunk_overlap=50):
        """Split ``documents`` with a ``RecursiveCharacterTextSplitter``.

        Args:
            documents: Documents to split, e.g. the result of ``load_documents``.
            chunk_size: Passed to the splitter as ``chunk_size``; measured with ``len``.
            chunk_overlap: Passed to the splitter as ``chunk_overlap``.

        Returns:
            Whatever the splitter's ``split_documents`` returns for ``documents``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        return text_splitter.split_documents(documents)
    
# Build the chunked corpus from the default "data" directory.
doc_loader = document_loader()
documents = doc_loader.load_documents()
sliced_documents = doc_loader.split_documents(documents)

# Preview one chunk as a sanity check. Index 300 only exists when the corpus
# produced more than 300 chunks, so clamp to the last chunk instead of raising
# IndexError on a smaller (or empty) corpus.
SAMPLE_INDEX = 300
if sliced_documents:
    print(sliced_documents[min(SAMPLE_INDEX, len(sliced_documents) - 1)])
else:
    print("No documents were loaded; nothing to preview.")



* Eligibility refers to the ability of the node to work in the cluster and has two possible states: `eligible` or `ineligible`.

Node Eligibility Service provides a configurable TTL setting for two states, `warn` and `fail`.

* `warn`: The node has been offline for a short period of time. This may indicate something is wrong with the node and that administrators should investigate. The default setting is 15 minutes.
* `fail`: The node has been offline for a long period of time, and reintroduction into the cluster could cause performance issues due to resynchronization. The default setting is 60 minutes.

For each node, Node Eligibility Service determines health and eligibility for participation in the cluster in the following ways.

* If a node has been observed to be healthy, the health state is `healthy` and the eligibility state is `eligible`.
* If a node hasn't been observed to be healthy for longer than the `fail` TTL, the health state is `critical` and its eligibility state is `i