# Clone repo

In [4]:
import subprocess

def clone_github_repo(github_url, local_path):
    """
    Clone a GitHub repository to a local path.
    
    Args:
    github_url (str): The URL of the GitHub repository to clone.
    local_path (str): The local path where the repository should be cloned.
    
    Returns:
    tuple: A tuple containing a boolean indicating success or failure, and a string message.
    """
    try:
        result = subprocess.run(['git', 'clone', github_url, local_path], 
                                check=True, 
                                capture_output=True, 
                                text=True)
        return True, f"Repository cloned successfully to {local_path}"
    except subprocess.CalledProcessError as e:
        return False, f"Failed to clone repository: {e.stderr.strip()}"
    except Exception as e:
        return False, f"An error occurred: {str(e)}"

In [11]:
clone_github_repo("https://github.com/jayrodge/Multimodal-RAG-with-Llama-3.2.git", "C:/learning/autonomous_github/clone")

(True, 'Repository cloned successfully to C:/learning/autonomous_github/clone')

# Indexing

## Full-text indexing

Full-text indexing is efficient for keyword-based searches

In [2]:
import uuid
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from tqdm import tqdm
from langchain.document_loaders import DirectoryLoader, NotebookLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import nltk
import os
# from utils import clean_and_tokenize

In [3]:
def clone_github_repo(github_url, local_path):
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to clone repository: {e}")
        return False

In [4]:
def load_and_index_files(repo_path):
    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala', 'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash', 'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb']

    file_type_counts = {}
    documents_dict = {}

    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(str(repo_path), include_outputs=True, max_output_length=20, remove_newline=True)
            else:
                loader = DirectoryLoader(repo_path, glob=glob_pattern)

            loaded_documents = loader.load() if callable(loader.load) else []
            if loaded_documents:
                file_type_counts[ext] = len(loaded_documents)
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, repo_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id

                    documents_dict[file_id] = doc
        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            continue

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

    split_documents = []
    for file_id, original_doc in documents_dict.items():
        split_docs = text_splitter.split_documents([original_doc])
        for split_doc in split_docs:
            split_doc.metadata['file_id'] = original_doc.metadata['file_id']
            split_doc.metadata['source'] = original_doc.metadata['source']

        split_documents.extend(split_docs)

    index = None
    if split_documents:
        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]
        index = BM25Okapi(tokenized_documents)
    return index, split_documents, file_type_counts, [doc.metadata['source'] for doc in split_documents]

In [5]:
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter

def load_documents(repo_path):
    extensions = [
        'txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 
        'cs', 'go', 'rb', 'php', 'scala', 'html', 'htm', 'xml', 'json', 
        'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash', 'css', 
        'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb'
    ]

    file_type_counts = {}
    documents_dict = {}

    # Use tqdm.notebook for Jupyter compatibility
    for ext in tqdm(extensions, desc="Processing extensions"):
        glob_pattern = f'**/*.{ext}'
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(
                    str(repo_path),
                    include_outputs=True,
                    max_output_length=20,
                    remove_newline=True
                )
            else:
                loader = DirectoryLoader(repo_path, glob=glob_pattern)

            loaded_documents = loader.load() if callable(loader.load) else []
            if loaded_documents:
                file_type_counts[ext] = len(loaded_documents)
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, repo_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id
                    documents_dict[file_id] = doc

        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            continue

    return file_type_counts, documents_dict

In [6]:
def index_documents(documents_dict):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=200
    )

    split_documents = []
    for file_id, original_doc in documents_dict.items():
        split_docs = text_splitter.split_documents([original_doc])
        for split_doc in split_docs:
            split_doc.metadata['file_id'] = original_doc.metadata['file_id']
            split_doc.metadata['source'] = original_doc.metadata['source']
        split_documents.extend(split_docs)

    index = None
    if split_documents:
        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]
        index = BM25Okapi(tokenized_documents)

    document_sources = [doc.metadata['source'] for doc in split_documents]
    
    return index, split_documents, document_sources

In [7]:
# file_type_counts, documents_dict = load_documents(repo_path)
# index, split_documents, document_sources = index_documents(documents_dict)
# return index, split_documents, file_type_counts, document_sources

In [8]:
nltk.download("punkt")

def clean_and_tokenize(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\b(?:http|ftp)s?://\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return nltk.word_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\67830\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def search_documents(query, index, documents, n_results=5):
    query_tokens = clean_and_tokenize(query)
    bm25_scores = index.get_scores(query_tokens)

    # Compute TF-IDF scores
    tfidf_vectorizer = TfidfVectorizer(tokenizer=clean_and_tokenize, lowercase=True, stop_words='english', use_idf=True, smooth_idf=True, sublinear_tf=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform([doc.page_content for doc in documents])
    query_tfidf = tfidf_vectorizer.transform([query])

    # Compute Cosine Similarity scores
    cosine_sim_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Combine BM25 and Cosine Similarity scores
    combined_scores = bm25_scores * 0.5 + cosine_sim_scores * 0.5

    # Get unique top documents
    unique_top_document_indices = list(set(combined_scores.argsort()[::-1]))[:n_results]

    return [documents[i] for i in unique_top_document_indices]

In [9]:
clone_github_repo("https://github.com/jayrodge/Multimodal-RAG-with-Llama-3.2", "multimodal-RAG")

True

In [10]:
file_type_counts, documents_dict = load_documents("multimodal-RAG")

Processing extensions:   0%|          | 0/33 [00:00<?, ?it/s]

Error loading file multimodal-RAG\requirements.txt
Error loading file multimodal-RAG\README.md
Error loading file multimodal-RAG\app.py


Error loading files with pattern '**/*.txt': b"cannot read magic file `/usr/share/misc/magic' (Bad address)"
Error loading files with pattern '**/*.md': b"cannot read magic file `/usr/share/misc/magic' (Bad address)"
Error loading files with pattern '**/*.py': b"cannot read magic file `/usr/share/misc/magic' (Bad address)"


: 

In [1]:
!pip install python-magic --upgrade



## Vector Indexing

Vector indexing is a technique where vectors (often high-dimensional representations of data) are stored in a way that allows for fast similarity searches. Instead of using exact matching like in traditional full-text search, vector indexing allows you to search for approximate or nearest neighbor matches based on a distance metric, such as cosine similarity or Euclidean distance

### Generate Vector Representations

Classify the type of file in github repo

In [20]:
import os

# Function to classify files and exclude hidden/git-related files
def classify_local_files(repo_path):
    categorized_files = {"code": [], "docs": [], "config": []}
    exclude_patterns = ['.git', '.sample', 'HEAD']  # Add any unwanted patterns

    # Walk through all files in the local repo directory
    for root, dirs, files in os.walk(repo_path):
        # Exclude .git folder from the walk
        dirs[:] = [d for d in dirs if d not in ['.git']]
        
        for file_name in files:
            # Skip hidden and unwanted files based on patterns
            if any(pattern in file_name for pattern in exclude_patterns):
                continue
            
            # Get the full path to the file
            file_path = os.path.join(root, file_name)
            
            # Get file extension
            _, ext = os.path.splitext(file_name)
            
            # Classify based on extension
            if ext in ['.py', '.js', '.java', '.cpp']:
                categorized_files["code"].append(file_name)
            elif ext in ['.md', '.txt', '.rst']:
                categorized_files["docs"].append(file_name)
            elif file_name == 'requirements.txt' or ext in ['.yaml', '.json']:
                categorized_files["config"].append(file_name)
    
    return categorized_files

In [21]:
# Example usage
repo_path = "clone"
categorized_files = classify_local_files(repo_path)

print(categorized_files)

{'code': ['app.py', 'document_processors.py', 'utils.py'], 'docs': ['README.md', 'requirements.txt'], 'config': []}


Using Graphcodebert for code type file (.py,.js, .cpp etc)

In [12]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load GraphCodeBERT model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = RobertaModel.from_pretrained("microsoft/graphcodebert-base")

# Function to generate embeddings
def get_graphcode_embedding(code_snippet):
    inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).detach()  # Mean pooling to get the embedding
    return embedding



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from sentence_transformers import SentenceTransformer

text_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to generate embeddings for text files
def generate_text_embedding(text_snippet):
    return text_model.encode(text_snippet)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
def generate_vectors_for_repo(repo_path, categorized_files):
    vectors = {"code": [], "docs": [], "config": []}

    # Process code files
    for code_file in categorized_files["code"]:
        file_path = os.path.join(repo_path, code_file)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            code_content = f.read()
            code_embedding = get_graphcode_embedding(code_content)
            vectors["code"].append((code_file, code_embedding))

    # Process documentation files
    for doc_file in categorized_files["docs"]:
        file_path = os.path.join(repo_path, doc_file)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            doc_content = f.read()
            doc_embedding = generate_text_embedding(doc_content)
            vectors["docs"].append((doc_file, doc_embedding))

    # Process configuration files
    for config_file in categorized_files["config"]:
        file_path = os.path.join(repo_path, config_file)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            config_content = f.read()
            config_embedding = generate_text_embedding(config_content)
            vectors["config"].append((config_file, config_embedding))

    return vectors

In [22]:
# Example usage
repo_path = "clone"

# Assuming you have categorized the files using the previous classify_local_files function
categorized_files = classify_local_files(repo_path)

# Generate vectors
vectors = generate_vectors_for_repo(repo_path, categorized_files)

# Print the vectors for each file type
for category, file_vectors in vectors.items():
    print(f"Category: {category}")
    for file_name, embedding in file_vectors:
        print(f"File: {file_name}, Embedding shape: {embedding.shape}")

Category: code
File: app.py, Embedding shape: torch.Size([1, 768])
File: document_processors.py, Embedding shape: torch.Size([1, 768])
File: utils.py, Embedding shape: torch.Size([1, 768])
Category: docs
File: README.md, Embedding shape: (384,)
File: requirements.txt, Embedding shape: (384,)
Category: config


## Store Embedding in Vector DB

| **Category**  | **Vector Database**  | **Key Features**                                               | **Scalability**                          | **Best For**                                 |
|---------------|----------------------|----------------------------------------------------------------|------------------------------------------|----------------------------------------------|
| **Cloud-Based**  | **Pinecone**         | Managed service, high performance, real-time & batch updates    | Automatically scalable on the cloud      | Large-scale enterprise applications          |
|                 | **Weaviate (Cloud)** | AI-first, hybrid search (vector + keyword), managed cloud       | Automatically scalable                   | AI-powered apps needing hybrid search        |
|                 | **Qdrant Cloud**     | Real-time, high-throughput, low-latency similarity search       | Highly scalable on the cloud             | Real-time recommendation systems             |
|                 | **Vearch Cloud**     | Multi-modal search (text, image, etc.), fully managed           | Cloud-native scalability                 | Multi-modal data search                      |
| **Self-Hosted**  | **FAISS**            | Local, high-performance vector search, indexing algorithms      | Small/medium-scale (custom distributed possible) | Fast, customizable, local vector search       |
|                 | **Chroma DB**           | Distributed, multi-modal search, integrates with big data tools | Highly scalable (Kubernetes integration) | Large-scale, multi-modal, distributed search |
|                 | **Qdrant (Self-hosted)** | Open-source, real-time, RESTful API                             | Moderate-scale, extendable to distributed setups | Moderate-scale, real-time vector search       |
|                 | **Weaviate (Self-hosted)** | Hybrid vector + keyword search, schema-based data storage       | Suitable for medium/large projects       | Complex hybrid search apps                   |
|                 | **Vespa**            | Large-scale enterprise-grade vector + text retrieval            | Highly scalable                          | Large enterprises, recommendation systems    |
|                 | **Elasticsearch (k-NN Plugin)** | Combines keyword + vector search, open-source                   | Highly scalable                          | Text-heavy apps needing hybrid search        |


In [1]:
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
import torch

class CodeVectorIndexer:
    def __init__(self, model_name='microsoft/codebert-base'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.index = None
        self.documents = []

    def encode(self, texts, batch_size=32):
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512)
            with torch.no_grad():
                outputs = self.model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Use [CLS] token embedding
            all_embeddings.append(embeddings)
        return np.vstack(all_embeddings)

    def add_documents(self, documents):
        self.documents.extend(documents)
        embeddings = self.encode(documents)
        if self.index is None:
            self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings.astype('float32'))

    def search(self, query, k=5):
        query_vector = self.encode([query]).astype('float32')
        distances, indices = self.index.search(query_vector, k)
        return [(self.documents[i], distances[0][j]) for j, i in enumerate(indices[0])]

# Example usage
if __name__ == "__main__":
    indexer = CodeVectorIndexer()

    # Simulating content from GitHub repos
    documents = [
        "def train_model(data, labels):\n    model = RandomForestClassifier()\n    model.fit(data, labels)\n    return model",
        "class NeuralNetwork(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.fc1 = nn.Linear(784, 128)\n        self.fc2 = nn.Linear(128, 10)",
        "def preprocess_text(text):\n    tokens = word_tokenize(text.lower())\n    return [token for token in tokens if token not in stop_words]",
        "async def fetch_data(url):\n    async with aiohttp.ClientSession() as session:\n        async with session.get(url) as response:\n            return await response.json()",
        "SELECT repository_name, COUNT(*) as star_count\nFROM github_stars\nGROUP BY repository_name\nORDER BY star_count DESC\nLIMIT 10;"
    ]

    indexer.add_documents(documents)

    # Perform a search
    query = "machine learning model training"
    results = indexer.search(query)
    print(f"Search results for '{query}':")
    for doc, score in results:
        print(f"Score: {score:.4f}\nDocument: {doc}\n")