In [1]:
import os
import json
import sys
from typing import List, Dict, Set, Tuple, Optional
from datetime import datetime
import warnings
from dataclasses import dataclass
from pydantic import BaseModel, Field

# Make the project root (two directory levels up) importable from this notebook.
# '__file__' is a plain string literal here, so abspath() resolves it against the
# current working directory and dirname() yields that directory.
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir, os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Suppress warnings for the rest of this kernel session to keep cell output readable.
# No category or module filter is given, so this applies to every warning.
warnings.filterwarnings('ignore')

In [3]:
def get_project_root() -> str:
    """Return the project root derived from the current working directory.

    If the working directory path ends with ``notebooks``, the root is the
    directory two levels above it; otherwise the working directory itself
    is returned unchanged.
    """
    cwd = os.getcwd()
    if not cwd.endswith('notebooks'):
        return cwd
    return os.path.abspath(os.path.join(cwd, '..', '..'))

def get_data_dir(repo_name: str, data_type: str = "input", user_type: Optional[str] = None) -> str:
    """Build the path of a repository's data directory under ``<project root>/data``.

    Args:
        repo_name: Name of the repository
        data_type: Type of data (input/processed)
        user_type: Type of user data (author/reviewer); only used for processed data

    Returns:
        ``data/<data_type>/<repo_name>`` below the project root, or
        ``data/processed/<user_type>_prs/<repo_name>`` when ``data_type`` is
        "processed" and a ``user_type`` is given.
    """
    path_parts = [get_project_root(), 'data', data_type]

    # Processed data for a specific user type lives one level deeper
    if data_type == "processed" and user_type:
        path_parts.append(f"{user_type}_prs")

    path_parts.append(repo_name)
    return os.path.join(*path_parts)

In [4]:
def read_json_file(filepath: str) -> List[Dict]:
    """Load and return the JSON content of ``filepath``.

    A missing path is not an error: a notice is printed and an empty list
    is returned instead.
    """
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    print(f"File not found: {filepath}")
    return []

def read_prs_from_json(repo_name: str) -> List[Dict]:
    """Load the contents of ``pull_requests.json`` from the repository's input data directory."""
    prs_path = os.path.join(get_data_dir(repo_name), "pull_requests.json")
    return read_json_file(prs_path)

In [5]:
def _filename_from_diff_header(header_line: str) -> str:
    """Extract the post-change path from a ``diff --git a/<old> b/<new>`` header line."""
    spec = header_line[len('diff --git'):].strip()

    # The new path follows the last " b/" separator. Splitting there (instead of on
    # whitespace) keeps paths containing spaces intact, and only the prefix is dropped,
    # so a "b/" occurring inside the path (e.g. "lib/") is left alone.
    _, separator, new_path = spec.rpartition(' b/')
    if separator:
        return new_path

    # No " b/" separator (e.g. a diff without path prefixes): use the last token.
    tokens = spec.split()
    if not tokens:
        return ''
    last_token = tokens[-1]
    return last_token[2:] if last_token.startswith('b/') else last_token


def parse_diff_content(diff_content: str) -> List[Dict]:
    """Split a git diff into one entry per changed file.

    Args:
        diff_content: Diff text containing ``diff --git`` header lines. Empty or
            ``None`` input yields an empty result.

    Returns:
        A list of ``{"filename": ..., "diff": ...}`` dicts in order of appearance.
        ``diff`` holds the lines that follow the file's ``diff --git`` header (the
        header line itself is not included). Lines before the first header are dropped.
    """
    if not diff_content:
        return []

    files = []
    current_file = None
    current_diff = []

    for line in diff_content.split('\n'):
        if line.startswith('diff --git'):
            # A new header closes the previous file's section
            if current_file is not None:
                files.append({
                    "filename": current_file,
                    "diff": '\n'.join(current_diff)
                })
            current_file = _filename_from_diff_header(line)
            current_diff = []
        else:
            current_diff.append(line)

    # Add the last file
    if current_file is not None:
        files.append({
            "filename": current_file,
            "diff": '\n'.join(current_diff)
        })

    return files

In [6]:
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack.utils import Secret

In [7]:
from pydantic import BaseModel, Field
from typing import List

# Schema for a classification verdict: a boolean flag plus a free-text justification.
# NOTE(review): this has the same fields as ConfidentialityClassification defined further
# down, and that later class is the one handed to OutputValidator — confirm whether this
# definition is still needed.
class PRConfidentialityClassification(BaseModel):
    is_confidential: bool = Field(description="Whether the PR contains confidential information")
    reasoning: str = Field(description="Explanation for the classification")

In [8]:
import pydantic
from pydantic import ValidationError
from typing import Optional
from colorama import Fore
from haystack import component

@component
class OutputValidator:
    """Haystack component that checks the first LLM reply against a Pydantic model.

    A reply that parses as a JSON object and satisfies the model is emitted on
    ``valid_replies``. Anything else is emitted on ``invalid_replies`` together with an
    ``error_message`` describing the problem, so the prompt can be rebuilt for a retry.
    """

    def __init__(self, pydantic_model: pydantic.BaseModel):
        """
        Args:
            pydantic_model: The Pydantic model class that replies are validated against.
        """
        self.pydantic_model = pydantic_model
        self.iteration_counter = 0

    def _check_schema(self, payload) -> None:
        """Validate ``payload`` against the model; raises ``ValidationError`` on mismatch."""
        # model_validate() is the Pydantic v2 API; parse_obj() is the v1 spelling and is
        # kept as a fallback so the component works with either major version.
        validate = getattr(self.pydantic_model, "model_validate", None)
        if validate is None:
            validate = self.pydantic_model.parse_obj
        validate(payload)

    @component.output_types(valid_replies=List[str], invalid_replies=Optional[List[str]], error_message=Optional[str])
    def run(self, replies: List[str]):
        """Validate ``replies[0]`` and route it to the valid or invalid output.

        Returns:
            ``{"valid_replies": [reply]}`` on success, otherwise
            ``{"invalid_replies": replies, "error_message": <reason>}``.
        """
        self.iteration_counter += 1

        # An empty reply list cannot be indexed; report it as invalid instead of crashing
        if not replies:
            message = "LLM returned no replies"
            print(
                Fore.RED
                + f"OutputValidator at Iteration {self.iteration_counter}: {message}"
                + Fore.RESET
            )
            return {"invalid_replies": replies or [], "error_message": message}

        raw_reply = replies[0]

        try:
            output_dict = json.loads(raw_reply)
            # Valid JSON that is not an object (list, number, string) can never match the schema
            if not isinstance(output_dict, dict):
                raise ValueError(f"Expected a JSON object but got {type(output_dict).__name__}")

            valid_reply = raw_reply
            # My local model sometimes returns a 'properties' wrapper. But the content adheres to the schema.
            if 'properties' in output_dict:
                output_dict = output_dict['properties']
                # Hand the unwrapped object downstream; consumers parse the reply expecting
                # the schema fields at the top level.
                valid_reply = json.dumps(output_dict)

            self._check_schema(output_dict)
            print(
                Fore.GREEN
                + f"OutputValidator at Iteration {self.iteration_counter}: Valid JSON from LLM - No need for looping: {raw_reply}"
                + Fore.RESET
            )
            return {"valid_replies": [valid_reply]}

        # TypeError covers a non-string reply being passed to json.loads
        except (ValueError, TypeError, ValidationError) as e:
            print(
                Fore.RED
                + f"OutputValidator at Iteration {self.iteration_counter}: Invalid JSON from LLM - Let's try again.\n"
                f"Output from LLM:\n {raw_reply} \n"
                f"Error from OutputValidator: {e}"
                + Fore.RESET
            )
            return {"invalid_replies": replies, "error_message": str(e)}

In [9]:
# Jinja template for the classification prompt. It reads the variables `pr`,
# `files_changed`, `reviews` and `review_comments`, plus the optional pair
# `invalid_replies` / `error_message`: when both are set, the final section asks the
# model to correct its previous output.
prompt_template = """
Analyze the following Pull Request and determine if it contains any confidential or sensitive information that should not be publicly visible. Create a complete, valid JSON object with your classification and reasoning.

Pull Request:
Title: {{pr['title']}}
Description: {{pr['body']}}
Repository: {{pr['repository']}}
Labels: {{pr['label_names']}}

Changes:
{% for file in files_changed %}
File: {{file.filename}}
```
{{file.diff}}
```
{% endfor %}

{% if reviews %}
Reviews:
{% for review in reviews %}
- Reviewer: {{review['user_login']}}
  Comment: {{review['body']}}
{% endfor %}
{% endif %}

{% if review_comments %}
Review Comments:
{% for comment in review_comments %}
- Commenter: {{comment['user_login']}}
  Comment: {{comment['body']}}
  {% if comment.get('path') %}  File: {{comment['path']}}{% endif %}
{% endfor %}
{% endif %}

Guidelines for classification:
1. Confidential information includes:
   - Database credentials, API keys, or authentication tokens
   - Internal metrics, KPIs, or business performance data
   - Customer data or usage patterns
   - Unreleased product features or development plans
   - Security vulnerabilities or fixes

2. Non-confidential information includes:
   - General code changes and improvements
   - Public documentation updates
   - Bug fixes without security implications
   - UI/UX improvements
   - Performance optimizations without internal metrics
   - Public API changes
   - Open source dependencies updates
   - General refactoring
   - Test additions or updates
   - Public feature documentation

3. When in doubt, classify as confidential. Better to be cautious with potentially sensitive information.

4. Consider the entire PR context:
   - Check if title contains internal information
   - Check for sensitive data in code changes
   - Review any mentioned customer names or personal information

Keep your response short. Follow this JSON schema for your response:
{
    "is_confidential": boolean,
    "reasoning": string # maximum 2 short sentences
}

Make sure your response is a complete, valid JSON object (dict) and not a list. Ensure all opening brackets, braces, and quotes have matching closing ones.

{% if invalid_replies and error_message %}
  You already created the following output in a previous attempt: {{invalid_replies}}
  However, this doesn't comply with the format requirements from above and triggered this Python exception: {{error_message}}
  Correct the output and try again. Just return the corrected output without any extra explanations.
{% endif %}
"""

# Wrap the template in a PromptBuilder so it can be used as a pipeline component.
prompt_builder = PromptBuilder(template=prompt_template)

In [10]:
# Schema the LLM reply has to satisfy: a boolean verdict plus a free-text justification.
# This is the model class passed to OutputValidator when the pipeline is assembled.
class ConfidentialityClassification(BaseModel):
    is_confidential: bool = Field(description="Whether the PR contains confidential information")
    reasoning: str = Field(description="Explanation for the classification")

# Custom exceptions
class PRClassificationError(Exception):
    """Signals that a pull request could not be classified."""

class InputFileNotFoundError(FileNotFoundError):
    """Raised when required input files are missing.

    Derives from the built-in ``FileNotFoundError`` instead of defining a new class under
    that same name. Rebinding the built-in name at notebook scope would make every later
    ``except FileNotFoundError`` stop matching the error ``open()`` raises for a missing
    file. The plain name ``FileNotFoundError`` keeps referring to the built-in, so it can
    still be raised and caught as before.
    """

In [None]:
# Initialize the OutputValidator with the Pydantic model
output_validator = OutputValidator(pydantic_model=ConfidentialityClassification)

# LLM served by a local Ollama instance.
# NOTE(review): num_predict presumably caps the number of generated tokens; confirm 100 is
# enough for a complete JSON reply with its reasoning text.
generator = OllamaGenerator(model="qwen2.5",
                            url = "http://localhost:11434",
                            generation_kwargs={
                            "num_predict": 100,
                            "temperature": 0.7,
                            })

# The validator feeds back into the prompt builder, so the graph contains a loop;
# max_runs_per_component presumably bounds how often each component may run — TODO confirm.
pipeline = Pipeline(max_runs_per_component=3)

# Add components to the pipeline
pipeline.add_component(instance=prompt_builder, name="prompt_builder")
pipeline.add_component(instance=generator, name="llm")
pipeline.add_component(instance=output_validator, name="output_validator")

# Connect the components
pipeline.connect("prompt_builder", "llm")
pipeline.connect("llm", "output_validator")
# Retry loop: rejected replies and the validation error go back into the prompt template
pipeline.connect("output_validator.invalid_replies", "prompt_builder.invalid_replies")
pipeline.connect("output_validator.error_message", "prompt_builder.error_message")

# Render the pipeline graph
pipeline.show()

In [12]:
class PRClassifier:
    """Classify pull requests as confidential or not by running an LLM pipeline."""

    def __init__(self, pipeline, max_retries: int = 3):
        """
        Args:
            pipeline: Object with a ``run(dict)`` method whose result holds the validated
                reply under ``result["output_validator"]["valid_replies"]``.
            max_retries: How many times the pipeline is run for one PR before giving up.
        """
        self.pipeline = pipeline
        self.max_retries = max_retries

    def _validate_classification(self, data: Dict) -> bool:
        """Validate classification has required fields with correct types"""
        try:
            # Check required fields exist
            if 'is_confidential' not in data or 'reasoning' not in data:
                print("Missing required fields")
                return False

            # Check types
            if not isinstance(data['is_confidential'], bool):
                print(f"Invalid type for is_confidential: {type(data['is_confidential'])}")
                return False

            if not isinstance(data['reasoning'], str):
                print(f"Invalid type for reasoning: {type(data['reasoning'])}")
                return False

            return True

        except Exception as e:
            # e.g. ``data`` is not a container at all, so the membership test fails
            print(f"Validation error: {str(e)}")
            return False

    def classify_pr(self, pr: Dict) -> Dict:
        """Classify a PR using the LLM pipeline with retries.

        Args:
            pr: PR record. ``diff_content`` is required; ``reviews`` and
                ``review_comments`` are optional.

        Returns:
            The same ``pr`` dict, updated in place with a ``confidentiality`` entry
            (``is_confidential``, ``reasoning``, ``classified_at``).

        Raises:
            PRClassificationError: If the diff cannot be parsed or no valid
                classification is obtained within ``max_retries`` attempts.
        """
        # Identifier for messages only; fall back so building an error never raises KeyError
        pr_ref = pr.get('id', pr.get('html_url', '<unknown>'))

        # The diff is identical for every attempt, so parse it once. A failure here is
        # deterministic, which makes retrying pointless.
        try:
            files_changed = parse_diff_content(pr['diff_content'])
        except Exception as e:
            raise PRClassificationError(
                f"Failed to classify PR {pr_ref}: could not parse diff content: {e!r}"
            ) from e

        last_error = None

        for attempt in range(self.max_retries):
            # Reset per attempt so the log below never shows a stale or undefined value
            llm_output = None
            try:
                result = self.pipeline.run({
                    "pr": pr,
                    "files_changed": files_changed,
                    "reviews": pr.get('reviews', []),
                    "review_comments": pr.get('review_comments', [])
                })

                valid_replies = result.get("output_validator", {}).get("valid_replies")
                if valid_replies:
                    llm_output = valid_replies[0]
                    classification = json.loads(llm_output)

                    # Some replies nest the actual fields under a 'properties' wrapper
                    if (
                        isinstance(classification, dict)
                        and 'is_confidential' not in classification
                        and isinstance(classification.get('properties'), dict)
                    ):
                        classification = classification['properties']

                    if self._validate_classification(classification):
                        pr['confidentiality'] = {
                            'is_confidential': classification['is_confidential'],
                            'reasoning': classification['reasoning'],
                            'classified_at': datetime.now().isoformat()
                        }
                        return pr

                    last_error = ValueError("LLM reply did not match the expected classification format")
                else:
                    last_error = ValueError("pipeline returned no valid reply")

                print(f"Attempt {attempt + 1}: Invalid classification format")
                print(f"LLM output: {llm_output}")

            except Exception as e:
                last_error = e
                print(f"Attempt {attempt + 1} failed: {str(e)}")

        raise PRClassificationError(
            f"Failed to classify PR {pr_ref} after {self.max_retries} attempts: {str(last_error)}"
        ) from last_error

def save_user_data(repo_name: str, user: str, user_type: str, user_data: Dict):
    """Write ``user_data`` as JSON to ``<user>.json`` in the processed directory for ``user_type``.

    The target directory is created first if it does not exist yet.
    """
    target_dir = get_data_dir(repo_name, "processed", user_type)
    os.makedirs(target_dir, exist_ok=True)

    with open(os.path.join(target_dir, f"{user}.json"), 'w', encoding='utf-8') as out_file:
        json.dump(user_data, out_file, indent=2, ensure_ascii=False)


In [13]:
def _write_json_atomically(path: str, payload) -> None:
    """Write ``payload`` as JSON to a temporary sibling file, then rename it over ``path``."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)
    # Replace only after the dump has finished, so an interrupted run leaves the previous
    # file intact instead of a truncated one.
    os.replace(tmp_path, path)


def update_confidentiality_status(repo_name: str, classifier, batch_size: int = 10):
    """Update confidentiality status for all PRs in a repo using classify_pr.

    Loads ``<repo_name>_enriched_prs.json`` from the repository's processed data
    directory, classifies every PR that has no ``confidentiality.classified_at`` value
    yet, and writes the full PR list back after each batch that produced new results.

    Args:
        repo_name: Repository whose enriched PR file is processed.
        classifier: Object exposing ``classify_pr(pr) -> pr``.
        batch_size: Number of PRs handled between two saves; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Failures on individual PRs are printed and skipped so one bad PR does not stop the run.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Load enriched PRs
    processed_dir = get_data_dir(repo_name, "processed")
    enriched_file = os.path.join(processed_dir, f"{repo_name}_enriched_prs.json")

    with open(enriched_file, 'r', encoding='utf-8') as f:
        enriched_prs = json.load(f)

    total_prs = len(enriched_prs)
    # Ceiling division: an exact multiple of batch_size must not report an extra batch
    total_batches = (total_prs + batch_size - 1) // batch_size
    print(f"Processing {total_prs} PRs for confidentiality check")

    # Process PRs in batches
    for batch_number, start in enumerate(range(0, total_prs, batch_size), start=1):
        print(f"Processing batch {batch_number}/{total_batches}")
        newly_classified = 0

        for index in range(start, min(start + batch_size, total_prs)):
            pr = enriched_prs[index]
            # Looked up defensively: this value is also used inside the error handler
            pr_url = pr.get('html_url', '<unknown>')

            try:
                # Skip if already classified
                if (pr.get('confidentiality') or {}).get('classified_at'):
                    print(f"Skipping already classified PR #{pr_url.split('/')[-1]}")
                    continue

                # Store the result by position rather than in a URL-keyed lookup, so
                # entries that share an html_url are not collapsed when saving.
                enriched_prs[index] = classifier.classify_pr(pr)
                newly_classified += 1

            except Exception as e:
                print(f"Error processing PR {pr_url}: {e}")

        # Save after each batch, preserving all existing data
        if newly_classified:
            print(f"Saving batch of {newly_classified} newly classified PRs...")
            _write_json_atomically(enriched_file, enriched_prs)

In [None]:
# Run the classification for one repository with the pipeline assembled above.
classifier = PRClassifier(pipeline)
repo_name = "nosara"
# Alternative repository; swap the assignment to process it instead:
#repo_name = "looker"
# Classified PRs are written back to the enriched PR file after each batch of 100.
update_confidentiality_status(repo_name, classifier, batch_size=100)