In [1]:
import os
import json
import sys
import random
import re
from datetime import datetime
from typing import List, Dict
import warnings

src_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if src_path not in sys.path:
    sys.path.append(src_path)

from utils.helper import load_env
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.anthropic import AnthropicGenerator
from haystack.utils import Secret

import requests
from urllib.parse import urlparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Silence every Python warning for the rest of the session so cell output stays readable
warnings.filterwarnings('ignore')
# Project helper from utils.helper; presumably loads variables from a .env file (see its printed output)
load_env()
# Read the Anthropic API key from the environment; os.getenv returns None when the variable is unset
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

Found .env file at: /Users/firatgelbal/code/gelbal/pr-review-style-imitate/.env
load_dotenv() result: True


In [3]:
# Global cache for vectorized PR data, keyed by repository name.
# Each entry is a dict with 'vectorizer', 'tfidf_matrix' and 'pr_indices',
# filled by get_similar_prs_by_diff the first time a repository is queried.
PR_VECTORS = {}

In [4]:
def load_enriched_prs(repo_name: str) -> Dict:
    """Load the enriched PRs of a repository and build a per-file lookup table.

    Reads ``<project_root>/data/processed/<repo_name>/<repo_name>_enriched_prs.json``,
    where the project root is two directories above the current working directory.

    Args:
        repo_name: Repository name, used for both the directory and the file name.

    Returns:
        A dict with two keys:
        ``'prs'`` - the PRs whose ``confidentiality.is_confidential`` equals False
        (PRs without a classification are excluded);
        ``'files_lookup'`` - maps each filename found in a PR's diff to the list
        of PRs whose diff contains it.

    Raises:
        FileNotFoundError: If the enriched PR file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    enriched_file = os.path.join(project_root, 'data', 'processed', repo_name, f"{repo_name}_enriched_prs.json")

    # Explicit UTF-8 so the result does not depend on the platform default encoding
    with open(enriched_file, 'r', encoding='utf-8') as f:
        enriched_prs = json.load(f)

    # Keep only PRs explicitly classified as non-confidential.
    # `or {}` tolerates a null "confidentiality" value; the comparison with False is
    # deliberate so that a missing/None classification does not pass the filter.
    non_confidential_prs = [
        pr for pr in enriched_prs
        if (pr.get('confidentiality') or {}).get('is_confidential') == False  # noqa: E712
    ]

    # Create file-based lookup
    files_lookup = {}
    for pr in non_confidential_prs:
        diff_content = pr.get('diff_content')
        if not diff_content:
            # Missing, null or empty diff: nothing to index for this PR
            continue
        for changed_file in parse_diff_content(diff_content):
            files_lookup.setdefault(changed_file['filename'], []).append(pr)

    return {
        'prs': non_confidential_prs,
        'files_lookup': files_lookup
    }

def parse_diff_content(diff_content: str) -> List[Dict]:
    """Split a git diff into one entry per changed file.

    A new file section starts at every line beginning with ``diff --git``; the
    filename is taken from the last whitespace-separated token of that line with
    its leading ``b/`` marker removed. The header line itself is not part of the
    stored diff text, and any lines before the first header are discarded.

    Args:
        diff_content: Full diff text. Empty or None input yields an empty list.

    Returns:
        A list of ``{"filename": str, "diff": str}`` dicts in order of appearance.
    """
    files = []
    if not diff_content:
        return files

    current_file = None
    current_diff = []

    for line in diff_content.split('\n'):
        if line.startswith('diff --git'):
            if current_file:
                files.append({
                    "filename": current_file,
                    "diff": '\n'.join(current_diff)
                })
            target = line.split()[-1]
            # Remove only the leading "b/" marker. A blanket replace() would also
            # eat every "b/" inside the path (e.g. "lib/" -> "li", "web/" -> "we").
            current_file = target[2:] if target.startswith('b/') else target
            current_diff = []
        else:
            current_diff.append(line)

    # Add the last file
    if current_file:
        files.append({
            "filename": current_file,
            "diff": '\n'.join(current_diff)
        })

    return files

In [5]:
def get_pr_by_number(repo_name: str, pr_number: int) -> Dict:
    """Return the enriched PR whose ``html_url`` ends in ``pr_number``.

    The PR number is not read from a dedicated field; it is taken from the last
    path segment of each PR's ``html_url``.

    Raises:
        ValueError: If no loaded PR carries that number.
    """
    candidates = load_enriched_prs(repo_name)['prs']

    # Lazily scan the PRs and stop at the first one whose URL number matches
    match = next(
        (
            entry for entry in candidates
            if int(entry['html_url'].split('/')[-1]) == pr_number
        ),
        None,
    )

    if match is None:
        raise ValueError(f"PR #{pr_number} not found in enriched data")
    return match

def get_prs_by_files(reviewer_name: str, repo_name: str, file_paths: List[str], max_prs: int = 30) -> List[Dict]:
    """Return up to ``max_prs`` past PRs that touched any of ``file_paths``.

    Results are ordered by their ``updated_at`` value, newest first.
    ``reviewer_name`` is accepted but not used by this function.
    """
    data = load_enriched_prs(repo_name)
    lookup = data['files_lookup']

    # URLs of every PR that modified at least one of the requested files
    matched_urls = {
        hit['html_url']
        for path in file_paths
        for hit in lookup.get(path, ())
    }

    # Walk the full PR list so each PR appears once, then order by recency
    newest_first = sorted(
        (candidate for candidate in data['prs'] if candidate['html_url'] in matched_urls),
        key=lambda candidate: candidate.get('updated_at', ''),
        reverse=True,
    )

    return newest_first[:max_prs]

def get_similar_prs_by_diff(repo_name: str, current_diff: str, max_prs: int = 30) -> List[Dict]:
    """Find the PRs whose diffs are most similar to ``current_diff``.

    Diffs are compared with TF-IDF vectors and cosine similarity. The fitted
    vectorizer, the TF-IDF matrix and the PR list are cached in ``PR_VECTORS``
    per repository, so the enriched PR file is only read on the first call for a
    repository; later changes to that file are not picked up until the cache
    entry is removed.

    Args:
        repo_name: Repository whose enriched PRs are searched.
        current_diff: Diff text to compare against.
        max_prs: Maximum number of PRs to return; values <= 0 yield an empty list.

    Returns:
        Copies of the most similar PRs, best match first, each with an added
        ``'similarity_score'`` key holding the score formatted to two decimals.
    """
    # A slice of [-0:] would select every PR rather than none, so handle this up front
    if max_prs <= 0:
        return []

    # Initialize the cache entry on first use
    if repo_name not in PR_VECTORS:
        # Only touch the disk on a cache miss
        prs = load_enriched_prs(repo_name)['prs']
        if not prs:
            # TfidfVectorizer cannot be fitted on an empty corpus
            return []

        print(f"Initializing PR vectors for {repo_name}...")
        vectorizer = TfidfVectorizer(stop_words='english')
        # `or ''` also covers PRs whose diff_content key is present but null
        pr_diffs = [pr.get('diff_content') or '' for pr in prs]
        tfidf_matrix = vectorizer.fit_transform(pr_diffs)
        PR_VECTORS[repo_name] = {
            'vectorizer': vectorizer,
            'tfidf_matrix': tfidf_matrix,
            'pr_indices': dict(enumerate(prs))
        }

    cached = PR_VECTORS[repo_name]
    pr_indices = cached['pr_indices']

    # Calculate similarity with current PR
    current_vector = cached['vectorizer'].transform([current_diff or ''])
    similarity_scores = cosine_similarity(current_vector, cached['tfidf_matrix'])[0]

    # argsort is ascending: take the last max_prs indices and reverse for best-first order
    similar_indices = similarity_scores.argsort()[-max_prs:][::-1]

    similar_prs = []
    for idx in similar_indices:
        pr = pr_indices[idx].copy()  # Copy so the cached PR is not mutated
        pr['similarity_score'] = f"{similarity_scores[idx]:.2f}"
        similar_prs.append(pr)

    return similar_prs

In [6]:
# Prompt template (Jinja-style syntax) rendered by PromptBuilder.
# Template variables: instructions, example_prs, pr_title, pr_description, files_changed.
# Each example PR is read through .title, .body, .reviews, .review_comments and .reviewer_login;
# only reviews/comments whose user_login equals the PR's reviewer_login are rendered.
# Each entry of files_changed is read through .filename and .diff.
apply_review_template = """
You are an AI code review assistant tasked with reviewing a pull request in the style of a specific reviewer. Your goal is to provide feedback that matches the reviewer's style, focus areas, and characteristics described in the instructions.

Review Instructions:
{{instructions}}

Here are some recent example PRs that are related to this new PR for reference:
{% for pr in example_prs %}
Example PR {{loop.index}}:
Title: {{pr.title}}
Description: {{pr.body}}

Reviews:
{% for review in pr.reviews %}
{% if review.user_login == pr.reviewer_login %}
- {{review.body}}
{% endif %}
{% endfor %}

Comments:
{% for comment in pr.review_comments %}
{% if comment.user_login == pr.reviewer_login %}
- File: {{comment.path}}
  Line: {{comment.line}}
  Comment: {{comment.body}}
{% endif %}
{% endfor %}

{% endfor %}

When reviewing the PR below, pay special attention to:
1. How the reviewer typically structures their feedback
2. Common patterns in technical focus areas
3. The reviewer's tone and communication style
4. How they balance different types of feedback
5. Their typical approach to code organization and patterns
6. How they handle documentation and testing requirements

Pull Request to Review:
Title: {{pr_title}}
Description: {{pr_description}}
Files Changed:
{% for file in files_changed %}
File: {{file.filename}}
Changes:
```
{{file.diff}}
```
{% endfor %}

Please provide a comprehensive review in the reviewer's style, including both overall feedback and specific file-level comments.

Output the review in this format:
OVERALL FEEDBACK:
(General feedback about the PR)

FILE COMMENTS:
[file_path]
- Line X: (comment)
- Line Y: (comment)

[another_file_path]
- Line Z: (comment)
"""

# Module-level builder instance created from the template above
apply_review_prompt_builder = PromptBuilder(template=apply_review_template)


In [7]:
def _parse_review_sections(review_text: str) -> Dict[str, str]:
    """Split the raw LLM reply into its OVERALL FEEDBACK and FILE COMMENTS sections."""
    overall_match = re.search(
        r"OVERALL FEEDBACK:\s*(.*?)(?=\s*FILE COMMENTS:|\Z)", review_text, re.DOTALL
    )
    comments_match = re.search(r"FILE COMMENTS:\s*(.*)\Z", review_text, re.DOTALL)

    file_comments = comments_match.group(1).strip() if comments_match else ""

    if overall_match:
        overall_feedback = overall_match.group(1).strip()
    else:
        print("Error parsing LLM response: OVERALL FEEDBACK section not found")
        if comments_match:
            # Keep the file comments and treat everything before them as overall feedback
            overall_feedback = review_text[:comments_match.start()].strip()
        else:
            # No recognizable structure: return the reply untouched
            overall_feedback = review_text

    return {
        "overall_feedback": overall_feedback,
        "file_comments": file_comments
    }


def apply_reviewer_style(reviewer_name: str, repo_name: str, pr_title: str, pr_description: str,
                        files_changed: List[Dict], num_examples: int = 20) -> Dict[str, str]:
    """Generate a PR review that imitates a specific reviewer's style.

    Reads the reviewer's instruction file, gathers example PRs (PRs touching the
    same files plus PRs with similar diffs), randomly samples ``num_examples`` of
    them, renders ``apply_review_template`` and sends the prompt to Claude.

    Args:
        reviewer_name: Reviewer to imitate; selects the instruction file.
        repo_name: Repository the PR belongs to.
        pr_title: Title of the PR under review.
        pr_description: Description of the PR under review; None is rendered as empty text.
        files_changed: ``{"filename": ..., "diff": ...}`` dicts as produced by
            ``parse_diff_content``.
        num_examples: Maximum number of example PRs to put in the prompt.

    Returns:
        A dict with ``"overall_feedback"`` and ``"file_comments"`` strings. If the
        reply lacks the expected headers, the unmatched text is returned as
        ``"overall_feedback"``.

    Raises:
        FileNotFoundError: If the reviewer instruction file does not exist.
    """
    # Load reviewer instructions
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    instructions_file = os.path.join(project_root, 'data', 'reviewer_instructions',
                                   repo_name, f"{reviewer_name}_review_instructions.txt")

    with open(instructions_file, 'r', encoding='utf-8') as f:
        instructions = f.read()

    # Get relevant PRs by file paths
    file_paths = [f['filename'] for f in files_changed]
    file_history_prs = get_prs_by_files(reviewer_name, repo_name, file_paths, max_prs=100)

    # Get similar PRs by diff content
    current_diff = '\n'.join(f['diff'] for f in files_changed)
    similar_prs = get_similar_prs_by_diff(repo_name, current_diff, max_prs=100)

    # Combine and deduplicate example PRs (first occurrence wins)
    # NOTE(review): the PR under review is not excluded here; if it is part of the
    # enriched data it may be sampled as one of its own examples - confirm intent.
    seen_pr_ids = set()
    candidate_prs = []
    for pr in file_history_prs + similar_prs:
        if pr['html_url'] not in seen_pr_ids:
            seen_pr_ids.add(pr['html_url'])
            candidate_prs.append(pr)

    # Randomly select num_examples PRs from candidates
    example_prs = random.sample(candidate_prs, min(num_examples, len(candidate_prs)))

    # Prepare prompt input
    input_dict = {
        "instructions": instructions,
        "example_prs": example_prs,
        "pr_title": pr_title,
        # A None description would otherwise be rendered as the literal text "None"
        "pr_description": pr_description or "",
        "files_changed": files_changed
    }

    # Set up pipeline.
    # Haystack refuses to add a component instance that already belongs to another
    # pipeline, so build a fresh PromptBuilder per call instead of reusing the
    # module-level instance; otherwise a second call in the same kernel fails.
    pipeline = Pipeline()
    pipeline.add_component("prompt_builder", PromptBuilder(template=apply_review_template))
    pipeline.add_component(
        "llm",
        AnthropicGenerator(
            api_key=Secret.from_token(ANTHROPIC_API_KEY),
            model="claude-3-5-sonnet-20241022",
            generation_kwargs={
                "max_tokens": 8192,  # Max possible in Claude 3.5 sonnet
                "temperature": 0.3
            }
        )
    )
    pipeline.connect("prompt_builder.prompt", "llm.prompt")

    # Run pipeline
    result = pipeline.run(data=input_dict)
    review_text = result["llm"]["replies"][0]

    # Parse the response into sections
    return _parse_review_sections(review_text)

In [8]:
def review_current_changes(reviewer_name: str, repo_name: str, pr_number: int) -> Dict[str, str]:
    """Generate, save and print a review of a PR in the given reviewer's style.

    Args:
        reviewer_name: Reviewer whose style is imitated.
        repo_name: Repository the PR belongs to.
        pr_number: Number of the PR, looked up in the enriched PR data.

    Returns:
        The review dict with ``"overall_feedback"`` and ``"file_comments"``, or
        None when the PR cannot be loaded or the review cannot be generated
        (the error is printed instead of raised).
    """
    try:
        # Get PR data
        pr_data = get_pr_by_number(repo_name, pr_number)

        # Generate review
        review = apply_reviewer_style(
            reviewer_name=reviewer_name,
            repo_name=repo_name,
            pr_title=pr_data["title"],
            pr_description=pr_data["body"],
            files_changed=parse_diff_content(pr_data["diff_content"]),
            num_examples=50
        )
    except Exception as e:
        print(f"Error generating review: {e}")
        return None

    # Saving is best-effort: a disk error must not throw away a review that was
    # already generated, so report the failure and still print/return the review.
    try:
        save_review_result(repo_name, pr_number, reviewer_name, review)
    except Exception as e:
        print(f"Error saving review: {e}")

    print(f"\nGenerated review in {reviewer_name}'s style for PR #{pr_number}:\n")
    print("Overall Feedback:")
    print("-" * 50)
    print(review["overall_feedback"])
    print("\nFile Comments:")
    print("-" * 50)
    print(review["file_comments"])

    return review


def save_review_result(repo_name: str, pr_number: int, reviewer_name: str, review_data: Dict):
    """Save the generated review to disk as JSON.

    The file is written to
    ``<project_root>/data/generated_reviews/<repo_name>/<repo_name>_pr_<pr_number>_<reviewer_name>_review.json``,
    where the project root is two directories above the current working
    directory. The target directory is created if needed and an existing file is
    overwritten.

    Args:
        repo_name: Repository name; used for the directory and the file name.
        pr_number: PR number; stored in the file and used in the file name.
        reviewer_name: Reviewer name; stored in the file and used in the file name.
        review_data: JSON-serializable review content, stored under ``'review'``.
    """
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
    reviews_dir = os.path.join(project_root, 'data', 'generated_reviews', repo_name)
    os.makedirs(reviews_dir, exist_ok=True)

    output_file = os.path.join(reviews_dir, f"{repo_name}_pr_{pr_number}_{reviewer_name}_review.json")

    review_metadata = {
        'pr_number': pr_number,
        'reviewer_name': reviewer_name,
        'timestamp': datetime.now().isoformat(),
        'review': review_data
    }

    # Write UTF-8 and keep non-ASCII characters (the reviews contain emoji) readable
    # in the file instead of \uXXXX escapes; the parsed JSON content is unchanged.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(review_metadata, f, indent=2, ensure_ascii=False)

    print(f"Review saved to {output_file}")

In [9]:
# Alternative run configuration (kept for reference):
# reviewer_name = "gelbal"  or  reviewer_name = "deltaWhiskey"
# repo_name = "looker"
# pr_number = 2823

# Candidate reviewer names for the run below: "Khrol", "c00pmode", "anandnalya"
reviewer_name = "c00pmode"
repo_name = "nosara"
pr_number = 9711

review = review_current_changes(
    reviewer_name=reviewer_name,
    repo_name=repo_name,
    pr_number=pr_number
)


Initializing PR vectors for nosara...
Review saved to /Users/firatgelbal/code/gelbal/pr-review-style-imitate/data/generated_reviews/nosara/nosara_pr_9711_c00pmode_review.json

Generated review in c00pmode's style for PR #9711:

Overall Feedback:
--------------------------------------------------
💡 The addition of logging is a good practice for observability, but we could make a few improvements to make it even more useful:

1. Consider adding more context around the API request lifecycle:
   - Number of action IDs being processed
   - Success/failure status of each request
   - Response timing information

2. The logging level seems appropriate (INFO), but we might want to consider adding DEBUG level logs for more detailed troubleshooting.

3. ❓ What prompted the need for additional logging? Understanding the context might help us implement a more comprehensive solution.

4. 🔍 The PR would benefit from:
   - Documentation of the logging changes in comments
   - A test case to verify th