In [1]:
import requests
import os
from dotenv import load_dotenv

load_dotenv()

token = os.getenv("GITHUB_TOKEN")
username = "cleavestone"

headers = {"Authorization": f"token {token}"}
url = f"https://api.github.com/users/{username}/repos"

response = requests.get(url, headers=headers)
repos = response.json()

for repo in repos:
    print(repo["name"], repo["html_url"])


Air_quality_Forecasting https://github.com/cleavestone/Air_quality_Forecasting
CARLIFORNIA-HOUSE-PREDICTION https://github.com/cleavestone/CARLIFORNIA-HOUSE-PREDICTION
ChestXray-Cancer-Detection https://github.com/cleavestone/ChestXray-Cancer-Detection
Customer-Segmentation https://github.com/cleavestone/Customer-Segmentation
Customer_Churn_MLOPS https://github.com/cleavestone/Customer_Churn_MLOPS
DATA-ANALYSIS-SQL https://github.com/cleavestone/DATA-ANALYSIS-SQL
Data-Analytics https://github.com/cleavestone/Data-Analytics
Data-Insights-Hub https://github.com/cleavestone/Data-Insights-Hub
ETL https://github.com/cleavestone/ETL
fine_tuning_bert_model https://github.com/cleavestone/fine_tuning_bert_model
MCQ_GEN https://github.com/cleavestone/MCQ_GEN
Medical-chatbot https://github.com/cleavestone/Medical-chatbot
mlflow-production-setup https://github.com/cleavestone/mlflow-production-setup
Mlprojects https://github.com/cleavestone/Mlprojects
Movie_recommender https://github.com/cleavesto

### Fetch README for Each Repo

In [2]:
for repo in repos[:3]:
    repo_name = repo["name"]
    readme_url = f"https://api.github.com/repos/{username}/{repo_name}/readme"
    res = requests.get(readme_url, headers=headers)

    if res.status_code == 200:
        readme_data = res.json()
        import base64
        content = base64.b64decode(readme_data["content"]).decode("utf-8")
        print(f"README for {repo_name}:\n", content[:300], "\n---\n")
    else:
        print(f"No README found for {repo_name}")


README for Air_quality_Forecasting:
 # Air_quality_Forecasting
![](https://github.com/cleavestone/Air_quality_Forecasting/blob/main/forecast.png)
## Introduction
This project focuses on implementing a Long Short-Term Memory (LSTM) neural network for multivariate time series forecasting. The goal is to predict the PM2.5 concentration, a 
---

README for CARLIFORNIA-HOUSE-PREDICTION:
 # CARLIFORNIA-HOUSE-PREDICTION

br>

<img src="https://github.com/cleavestone/CARLIFORNIA-HOUSE-PREDICTION/blob/main/static/image3.jpg" alt="Image description" width="1200" height="1000">

<br>

The US Census Bureau has published California Census Data which has 10 types of metrics such as the popul 
---

README for ChestXray-Cancer-Detection:
 # 🫁 Chest X-ray Cancer Classification (Normal vs Adenocarcinoma)

This repository contains an **end-to-end Deep Learning project** for classifying **chest X-rays** into **Normal** and **Adenocarcinoma** categories.  
The project is designed with a **modular, producti

In [3]:
import os
import requests
import base64

os.makedirs("data/github_readmes", exist_ok=True)

for repo in repos:
    repo_name = repo["name"]
    repo_url = repo["html_url"]
    readme_url = f"https://api.github.com/repos/{username}/{repo_name}/readme"
    res = requests.get(readme_url, headers=headers)

    if res.status_code == 200:
        readme_data = res.json()
        content = base64.b64decode(readme_data["content"]).decode("utf-8")

        # Prepend repo URL
        content_with_url = f"# Repository: {repo_name}\nURL: {repo_url}\n\n---\n\n{content}"

        with open(f"data/github_readmes/{repo_name}.md", "w", encoding="utf-8") as f:
            f.write(content_with_url)


### Regex Cleaning

In [1]:
import re

def clean_readme(text: str) -> str:
    # Remove image badges (shields.io, CI/CD)
    text = re.sub(r"!\[.*?\]\(.*?\)", "", text)

    # Remove HTML comments
    text = re.sub(r"<!--.*?-->", "", text, flags=re.S)

    # Remove code blocks (``` ... ```)
    text = re.sub(r"```.*?```", "", text, flags=re.S)

    # Remove long URLs (just noise in many cases)
    text = re.sub(r"http\S+", "", text)

    # Collapse extra whitespace/newlines
    text = re.sub(r"\n\s*\n", "\n\n", text)

    return text.strip()


In [3]:
import os

# Input + Output folders
input_folder = "data/github_readmes"
output_folder = "data/cleaned_readmes"
os.makedirs(output_folder, exist_ok=True)

# Loop over all markdown files
for file_name in os.listdir(input_folder):
    if file_name.endswith(".md"):
        input_path = os.path.join(input_folder, file_name)
        output_path = os.path.join(output_folder, file_name)

        with open(input_path, "r", encoding="utf-8") as f:
            raw_text = f.read()

        cleaned_text = clean_readme(raw_text)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

        print(f"✅ Cleaned: {file_name}")

✅ Cleaned: Air_quality_Forecasting.md
✅ Cleaned: CARLIFORNIA-HOUSE-PREDICTION.md
✅ Cleaned: ChestXray-Cancer-Detection.md
✅ Cleaned: Customer-Segmentation.md
✅ Cleaned: Customer_Churn_MLOPS.md
✅ Cleaned: DATA-ANALYSIS-SQL.md
✅ Cleaned: Data-Analytics.md
✅ Cleaned: Data-Insights-Hub.md
✅ Cleaned: ETL.md
✅ Cleaned: fine_tuning_bert_model.md
✅ Cleaned: MCQ_GEN.md
✅ Cleaned: Medical-chatbot.md
✅ Cleaned: mlflow-production-setup.md
✅ Cleaned: Mlprojects.md
✅ Cleaned: Movie_recommender.md
✅ Cleaned: Movie_Recommender_App.md
✅ Cleaned: Northwind-Sales-Analysis.md
✅ Cleaned: portfolio.md
✅ Cleaned: RAG_from_scratch.md


In [21]:
import os
import json
import requests

def summarize_for_embeddings(repo_name, repo_url, text):
    # Check if API key exists
    api_key = os.getenv('GROQ_API_KEY')
    if not api_key:
        return {
            "repo_name": repo_name,
            "repo_url": repo_url,
            "error": "GROQ_API_KEY not found in environment variables"
        }
    
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    
    prompt = f"""
You are preparing project data for a Retrieval-Augmented Generation (RAG) system.
The input is a README file that may contain tables, images, code, or badges.
Your task is to extract the most important structured information.

Guidelines:
- Summarize into 2–4 sentences: purpose, approach, and why it matters.
- Extract `key_skills` (methods, ML techniques, analytical skills).
- Extract `tech_stack` (frameworks, libraries, tools, languages).
- Suggest 2 realistic `use_cases` based on the project's purpose.
- Assign `complexity_level` as Beginner, Intermediate, or Advanced.
- Extract `tags` to help categorize projects (e.g., "machine learning", "time series", "kmeans clustering").
- If information is missing, set it to "Unknown".
- Ignore irrelevant details like install instructions, badges, license, or author credits.

⚠️ Output must be ONLY valid JSON, no markdown, no commentary.

Required fields:
- repo_name: {repo_name}
- repo_url: {repo_url}
- description
- key_skills (list)
- tech_stack (list)
- use_cases (list)
- complexity_level
- tags (list)

README:
{text[:6000]}
"""

    data = {
        "model": "llama-3.3-70b-versatile",
        "messages": [
            {"role": "system", "content": "You are a precise summarizer that outputs only valid JSON."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.2
    }

    try:
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=30  # Add timeout to prevent hanging
        )
        response.raise_for_status()  # Raise exception for bad status codes
        resp_json = response.json()
    except requests.exceptions.RequestException as e:
        return {
            "repo_name": repo_name,
            "repo_url": repo_url,
            "error": f"Request failed: {str(e)}"
        }
    except json.JSONDecodeError:
        return {
            "repo_name": repo_name,
            "repo_url": repo_url,
            "error": "Invalid JSON response from API"
        }

    if "error" in resp_json:
        return {"repo_name": repo_name, "repo_url": repo_url, "error": resp_json["error"]}

    if "choices" not in resp_json:
        return {"repo_name": repo_name, "repo_url": repo_url, "error": "No choices in response"}

    raw_output = resp_json["choices"][0]["message"]["content"].strip()

    # Remove markdown code blocks if present
    if raw_output.startswith("```"):
        raw_output = raw_output.split("```")[1]
        if raw_output.startswith("json"):
            raw_output = raw_output[4:].strip()

    try:
        summary = json.loads(raw_output)
    except json.JSONDecodeError:
        # fallback: best-effort extraction
        summary = {
            "repo_name": repo_name,
            "repo_url": repo_url,
            "description": raw_output,
            "key_skills": [],
            "tech_stack": [],
            "use_cases": [],
            "complexity_level": "Unknown",
            "tags": []
        }

    # Ensure required fields exist
    summary.setdefault("repo_name", repo_name)
    summary.setdefault("repo_url", repo_url)
    summary.setdefault("description", "Unknown")
    summary.setdefault("key_skills", [])
    summary.setdefault("tech_stack", [])
    summary.setdefault("use_cases", [])
    summary.setdefault("complexity_level", "Unknown")
    summary.setdefault("tags", [])

    return summary

In [None]:
import time
import json
from dotenv import load_dotenv


input_folder="data/github_readmes"
output_folder="data/summarized_readmes"
os.makedirs(output_folder, exist_ok=True)
for i, file_name in enumerate(os.listdir(input_folder)):
    if file_name.endswith(".md"):
        repo_name = file_name.replace(".md", "")
        repo_url = f"https://github.com/cleave/{repo_name}"

        with open(os.path.join(input_folder, file_name), "r", encoding="utf-8") as f:
            text = f.read()

        summary = summarize_for_embeddings(repo_name, repo_url, text)

        with open(os.path.join(output_folder, repo_name + ".json"), "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)

        print(f"✅ Summarized: {repo_name}")

        # Sleep to avoid rate limit
        time.sleep(6)  # adjust depending on TPM limit



✅ Summarized: Air_quality_Forecasting
✅ Summarized: CARLIFORNIA-HOUSE-PREDICTION
✅ Summarized: ChestXray-Cancer-Detection
✅ Summarized: Customer-Segmentation
✅ Summarized: Customer_Churn_MLOPS
✅ Summarized: DATA-ANALYSIS-SQL
✅ Summarized: Data-Analytics
✅ Summarized: Data-Insights-Hub
✅ Summarized: ETL
✅ Summarized: fine_tuning_bert_model
✅ Summarized: MCQ_GEN
✅ Summarized: Medical-chatbot
✅ Summarized: mlflow-production-setup
✅ Summarized: Mlprojects
✅ Summarized: Movie_recommender
✅ Summarized: Movie_Recommender_App
✅ Summarized: Northwind-Sales-Analysis
✅ Summarized: portfolio
✅ Summarized: RAG_from_scratch


### Merge Jsons

In [24]:
import os
import json

def combine_jsons_to_jsonl(input_folder="data/summarized_readmes", output_file="data/summarized_readmes.jsonl"):
    with open(output_file, "w", encoding="utf-8") as outfile:
        for filename in os.listdir(input_folder):
            if filename.endswith(".json"):
                file_path = os.path.join(input_folder, filename)
                with open(file_path, "r", encoding="utf-8") as infile:
                    try:
                        data = json.load(infile)  # load each json file
                        outfile.write(json.dumps(data, ensure_ascii=False) + "\n")
                    except json.JSONDecodeError:
                        print(f"⚠️ Skipping {filename}, invalid JSON")

    print(f"✅ Combined JSONL written to {output_file}")


In [25]:
combine_jsons_to_jsonl()

✅ Combined JSONL written to data/summarized_readmes.jsonl


In [26]:
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [35]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# Load .env
load_dotenv()

api_key = os.getenv("PINECONE_API_KEY")
if api_key is None:
    raise ValueError("Missing Pinecone API key. Did you set it in your .env?")

pc = Pinecone(api_key=api_key)

index_name = "proj-index"

# Create index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Match SentenceTransformer embedding size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    #pc.describe_index(index_name).wait_until_ready()

# Connect to the index
index = pc.Index(index_name)
print(f"Connected to Pinecone index: {index_name}")

Connected to Pinecone index: proj-index


### Load your cleaned JSONL

In [28]:
docs = []
with open("data/summarized_readmes.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        docs.append(json.loads(line))

In [29]:
docs[0]

{'repo_name': 'Air_quality_Forecasting',
 'repo_url': 'https://github.com/cleavestone/Air_quality_Forecasting',
 'description': 'This project implements a Long Short-Term Memory (LSTM) neural network for multivariate time series forecasting to predict PM2.5 concentration using historical environmental data.',
 'key_skills': ['Time series forecasting',
  'LSTM neural networks',
  'Feature engineering',
  'Data preprocessing',
  'Model evaluation'],
 'tech_stack': ['Keras', 'Python', 'Min-Max scaling'],
 'use_cases': ['Air quality prediction for environmental management',
  'Time series forecasting for climate change research'],
 'complexity_level': 'Intermediate',
 'tags': ['machine learning',
  'time series',
  'LSTM',
  'air quality forecasting']}

### Flatten documents (combine description + use_cases etc.)

In [30]:
def prepare_text_for_embedding(doc):
    """
    Converts a project dictionary into a single text string optimized for embedding.
    
    Args:
        doc (dict): Project metadata dictionary
        
    Returns:
        str: Formatted text ready for embedding
    """
    parts = []
    
    # Add project name and description
    parts.append(f"Project: {doc.get('repo_name', 'Unknown')}")
    parts.append(f"Description: {doc.get('description', 'No description available')}")
    
    # Add key skills
    skills = doc.get('key_skills', [])
    if skills:
        parts.append(f"Key Skills: {', '.join(skills)}")
    
    # Add tech stack
    tech = doc.get('tech_stack', [])
    if tech:
        parts.append(f"Technologies: {', '.join(tech)}")
    
    # Add use cases
    use_cases = doc.get('use_cases', [])
    if use_cases:
        parts.append(f"Use Cases: {'; '.join(use_cases)}")
    
    # Add complexity level
    complexity = doc.get('complexity_level', 'Unknown')
    parts.append(f"Complexity: {complexity}")
    
    # Add tags
    tags = doc.get('tags', [])
    if tags:
        parts.append(f"Tags: {', '.join(tags)}")
    
    # Join all parts with newlines
    return '\n'.join(parts)

### Generate Embeddings with sBERT

In [31]:
from sentence_transformers import SentenceTransformer

# Use any sBERT model (384-dim is typical for MiniLM)
model = SentenceTransformer("all-MiniLM-L6-v2")  # dim=384

vectors = []
for i, doc in enumerate(docs):
    text = prepare_text_for_embedding(doc)
    emb = model.encode(text).tolist()  # convert to list for Pinecone

    vectors.append({
        "id": doc.get("repo_name", f"doc-{i}"),
        "values": emb,
        "metadata": {
            "repo_name": doc.get("repo_name", "unknown"),
            "repo_url": doc.get("repo_url", "unknown"),
            "description": doc.get("description", ""),
            "tech_stack": doc.get("tech_stack", []),
            "skills": doc.get("key_skills", []),
            "use_cases": doc.get("use_cases", []),
            "complexity": doc.get("complexity_level", "")
        }
    })



'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: e851d11b-73b6-48cf-a5af-7f3648b7f851)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


In [33]:
vectors[16]

{'id': 'Northwind-Sales-Analysis',
 'values': [0.019042545929551125,
  0.03667434677481651,
  -0.048668745905160904,
  -0.011946066282689571,
  -0.12781919538974762,
  0.01429382711648941,
  -0.006978760007768869,
  -0.03406502678990364,
  -0.04196447879076004,
  -0.018293412402272224,
  -0.06484587490558624,
  0.019540147855877876,
  0.06442423164844513,
  -0.02072153240442276,
  0.0010317630367353559,
  0.03405885025858879,
  0.04692571610212326,
  -0.0499984472990036,
  0.006551666185259819,
  0.01413897518068552,
  -0.031610555946826935,
  0.0057698399759829044,
  -0.056448884308338165,
  -0.01695774681866169,
  0.06309626996517181,
  -0.02708897925913334,
  0.03766566142439842,
  0.01354946568608284,
  -0.01626916229724884,
  -0.06541453301906586,
  -0.09043028205633163,
  0.012773041613399982,
  0.020328011363744736,
  0.05851664021611214,
  -0.04547664150595665,
  -0.04380828142166138,
  0.05624281242489815,
  0.060844291001558304,
  0.033567506819963455,
  -0.011851618997752666

### Upload to Pinecone

In [36]:
# Upload in batches (to avoid limits)
for i in range(0, len(vectors), 100):
    batch = vectors[i : i+100]
    index.upsert(vectors=batch)

print(f"✅ Uploaded {len(vectors)} vectors to Pinecone index '{index_name}'")


✅ Uploaded 19 vectors to Pinecone index 'proj-index'


In [39]:
query = "Any projects on clustering"

q_emb = model.encode(query).tolist()

results = index.query(vector=q_emb, top_k=2, include_metadata=True)

print("🔍 Top Matches:")
for match in results.matches:
    print(f"- {match.metadata['repo_url']} ({match.score:.3f})")
    print("  Description:", match.metadata['description'])


🔍 Top Matches:
- https://github.com/cleavestone/Customer-Segmentation (0.444)
  Description: This project applies RFM Analysis on transactional data from an e-commerce platform to identify distinct customer groups and recommend tailored engagement strategies for each. The project uses clustering techniques to segment customers based on their transaction patterns, enabling data-driven business decisions. The goal is to drive personalized marketing strategies and improve overall customer retention. The project provides actionable insights and visualizations to help businesses understand their customers better.
- https://github.com/cleavestone/Data-Analytics (0.395)
  Description: Unknown


### Query Pinecone (Retrieve)

In [40]:
def retrieve_from_pinecone(query, top_k=3):
    q_emb = model.encode(query).tolist()
    results = index.query(vector=q_emb, top_k=top_k, include_metadata=True)
    return results.matches


### Build Context

In [41]:
def build_context(matches):
    context = ""
    for m in matches:
        context += f"- Repo_name: {m.metadata.get('repo_name','unknown')}\n"
        context += f"- Repo_Url: {m.metadata.get('repo_url', 'unknown')}\n"
        context += f"  Description: {m.metadata.get('description', '')}\n"
        context += f"  Tech: {', '.join(m.metadata.get('tech_stack', []))}\n"
        context += f"  Skills: {', '.join(m.metadata.get('skills', []))}\n"
        context += f"  Use Cases: {', '.join(m.metadata.get('use_cases', []))}\n\n"
    return context


### Generate

In [43]:
import requests

def generate_answer(query, context):
    prompt = f"""
    You are a helpful assistant answering about my GitHub projects.

    User query: {query}

    Relevant project summaries:
    {context}

    Based on the above, provide a clear and concise answer.
    If relevant, include the repo URL(s).
    """

    data = {
        "model": "llama-3.3-70b-versatile",  # or "llama-3.3-70b-versatile" if available
        "messages": [
            {"role": "system", "content": "You are a helpful AI assistant for project retrieval."},
            {"role": "user", "content": prompt}
        ]
    }

    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"},
        json=data
    )

    return response.json()["choices"][0]["message"]["content"]


In [44]:
def rag_pipeline(user_query):
    matches = retrieve_from_pinecone(user_query)
    if not matches:
        return "No relevant projects found."

    context = build_context(matches)
    answer = generate_answer(user_query, context)
    return answer

# Example
query = "Do you have any project related to cancer detection?"
print(rag_pipeline(query))


Yes, I have projects related to cancer detection. Specifically, I have two repositories that focus on classifying chest X-rays to detect lung cancer adenocarcinoma:

1. **ChestXray-Cancer-Detection**: This project utilizes a modular production-ready pipeline with tools like DVC, MLflow, and Dagshub for experiment tracking, reproducibility, and scalability. You can find it at: https://github.com/cleavestone/ChestXray-Cancer-Detection
2. **Chest X-ray Cancer Classification**: This project provides an end-to-end deep learning system that classifies chest X-ray images as normal or adenocarcinoma using VGG16 transfer learning. The repository is located at: https://github.com/cleave/Chest X-ray Cancer Classification

Both projects aim to assist in the early and accurate diagnosis of lung cancer, potentially improving patient outcomes.
