In [30]:
from dotenv import load_dotenv
import sqlite3
import json
import pandas as pd
from typing import List, Dict, Any
import openai
import os
from tqdm import tqdm
load_dotenv()  # loads the .env file; run before creating the client (presumably supplies the OpenAI API key via the environment — confirm)

# Shared OpenAI client, used as a module-level global by the analysis code in later cells.
client = openai.OpenAI()

print("Libraries imported successfully!")

Libraries imported successfully!


In [31]:
# Connect to the commits database (shared by later cells through the global `conn`)
conn = sqlite3.connect('commits.db')

# Check the total number of commits
total_commits = conn.execute("SELECT COUNT(*) FROM commits").fetchone()[0]
print(f"Total commits in database: {total_commits}")

# Check a sample commit to understand the structure.
# fetchone() returns None when the table is empty, so guard before indexing.
sample_commit = conn.execute("SELECT * FROM commits LIMIT 1").fetchone()
sample_diffs = []

if sample_commit is None:
    print("\nNo commits found in database; nothing to preview.")
else:
    # A NULL diffs column would break both the slice and json.loads below;
    # treat it as an empty JSON list instead.
    sample_diffs_json = sample_commit[3] or "[]"

    print("\nSample commit structure:")
    print(f"Repo: {sample_commit[0]}")
    print(f"SHA: {sample_commit[1]}")
    print(f"Title: {sample_commit[2]}")
    print(f"Diffs preview: {sample_diffs_json[:200]}...")

    # Parse the diffs JSON to see structure
    sample_diffs = json.loads(sample_diffs_json)
    print(f"\nNumber of files changed in sample commit: {len(sample_diffs)}")
    if sample_diffs:
        print(f"First diff keys: {list(sample_diffs[0].keys())}")
        if 'diff' in sample_diffs[0] and sample_diffs[0]['diff']:
            print(f"Sample diff content: {sample_diffs[0]['diff'][:300]}...")

Total commits in database: 30

Sample commit structure:
Repo: https://github.com/hm-group/gcp-projectfactory
SHA: 88ffb0214cde5bbd88697444e89bf3fe84ec3bad
Title: Merge pull request #15955 from hm-group/add-workload-identity-to-trendie
Diffs preview: [{"filename": "projects/trendie-d.yaml", "diff": "@@ -65,3 +65,6 @@ project:\n     - herman.lindstrom@hm.com\n     - andreas.bergstrom@hm.com\n     - manjunath.satishnaik@hm.com\n+  workloadIdentity:\...

Number of files changed in sample commit: 1
First diff keys: ['filename', 'diff']
Sample diff content: @@ -65,3 +65,6 @@ project:
     - herman.lindstrom@hm.com
     - andreas.bergstrom@hm.com
     - manjunath.satishnaik@hm.com
+  workloadIdentity:
+    - repo: trendie-d-a07f-vertex-pipelines
+...


In [32]:
def _file_type_of(filename: str) -> str:
    """Return the extension of the last path component, or "unknown" if it has none."""
    # Look only at the basename so a dot in a directory name
    # (e.g. "app.v2/Makefile") is not mistaken for a file extension.
    base_name = filename.rsplit('/', 1)[-1]
    return base_name.rsplit('.', 1)[-1] if '.' in base_name else "unknown"


def _diff_metadata(filename: str, diff_text: str) -> Dict[str, Any]:
    """Build the file-type and added/removed line counts for one file diff."""
    lines_added = diff_text.count('\n+')
    lines_removed = diff_text.count('\n-')
    return {
        "file_type": _file_type_of(filename),
        "lines_added": lines_added,
        "lines_removed": lines_removed,
        "total_changes": lines_added + lines_removed,
    }


def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) if present."""
    stripped = text.strip()
    if stripped.startswith("```"):
        # Drop the opening fence line (it may carry a language tag) and,
        # when present, the closing fence line.
        body_lines = stripped.splitlines()[1:]
        if body_lines and body_lines[-1].strip().startswith("```"):
            body_lines = body_lines[:-1]
        stripped = "\n".join(body_lines).strip()
    return stripped


def analyze_diff_with_prompt(filename: str, diff_content: str, commit_title: str, repo: str) -> Dict[str, Any]:
    """
    Analyze a diff using OpenAI to determine if it's interesting.

    Sends the (possibly truncated) diff to the chat model through the
    module-level ``client`` and normalizes the JSON answer.

    Args:
        filename: Path of the changed file; used in the prompt and to derive ``file_type``.
        diff_content: Diff text of the file. Empty or whitespace-only text is
            scored 0 without calling the API.
        commit_title: Commit title, included in the prompt.
        repo: Repository identifier, included in the prompt.

    Returns:
        A dict that always contains ``score`` (int, clamped to 0-100),
        ``confidence`` (float, clamped to 0.0-1.0), ``category``, ``reason``,
        ``file_type``, ``lines_added``, ``lines_removed`` and ``total_changes``.
        Line counts are taken from the full diff, not from the truncated text
        sent to the model. If the model answer cannot be parsed, or the API
        call fails, a low-confidence fallback dict is returned instead of raising.
    """

    if not diff_content or not diff_content.strip():
        return {
            "score": 0,
            "confidence": 1.0,
            "reason": "Empty diff",
            "category": "empty",
            # Same keys as every other result so the DataFrame has no NaN holes.
            **_diff_metadata(filename, ""),
        }

    # Computed once from the complete diff so that prompt truncation does not
    # skew the statistics, and shared by all return paths below.
    metadata = _diff_metadata(filename, diff_content)

    # Truncate diff if it's too long to avoid token limits
    max_diff_length = 2000
    truncated_diff = diff_content[:max_diff_length]
    if len(diff_content) > max_diff_length:
        truncated_diff += "\n... (truncated)"

    prompt = f"""
You are an EXTREMELY strict senior software engineer. Rate this code diff on a scale of 0-100 where you are VERY AGGRESSIVE and STINGY with high scores.

Repository: {repo}
Commit Title: {commit_title}
Filename: {filename}

Diff:
```
{truncated_diff}
```

SCORING GUIDELINES (BE RUTHLESS):
- 90-100: Only for EXCEPTIONAL changes like major security fixes, critical performance breakthroughs, or groundbreaking architectural improvements
- 80-89: Significant security vulnerabilities fixed, major performance optimizations, important API changes
- 70-79: Notable bug fixes, meaningful feature additions, important refactoring
- 60-69: Minor bug fixes, small features, routine improvements
- 40-59: Configuration changes, dependency updates, minor refactoring
- 20-39: Test additions, documentation improvements, code style changes
- 0-19: Trivial changes, formatting, comments, variable renames

BE EXTREMELY STRICT. Most commits should score 20-40. Only truly exceptional changes deserve 80+.

Respond with JSON containing:
- "score": integer 0-100 (BE VERY STINGY WITH HIGH SCORES)
- "confidence": float 0.0-1.0 (how confident you are)
- "category": string (one of: "security", "bugfix", "performance", "database", "configuration", "feature", "refactor", "test", "documentation", "infrastructure", "empty", "general")
- "reason": string (brief explanation of the score)

Respond only with valid JSON.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an EXTREMELY strict senior software engineer who is very stingy with high scores. Most changes are routine and should score 20-40. Only exceptional changes deserve 70+."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=200
        )

        # The message content may be None; treat that as an empty (unparseable)
        # answer rather than letting .strip() raise and be reported as an API failure.
        response_text = (response.choices[0].message.content or "").strip()

        # Try to parse the JSON response
        try:
            # Models frequently wrap JSON in a Markdown fence despite instructions.
            result = json.loads(_strip_code_fence(response_text))

            if not isinstance(result, dict):
                raise ValueError("Response JSON is not an object")

            # Validate required fields
            required_fields = ["score", "confidence", "category", "reason"]
            if not all(field in result for field in required_fields):
                raise ValueError("Missing required fields in response")

            # Ensure score is between 0 and 100
            result["score"] = max(0, min(100, int(result["score"])))

            # Ensure confidence is between 0 and 1
            result["confidence"] = max(0.0, min(1.0, float(result["confidence"])))

            # Add metadata
            result.update(metadata)

            return result

        # TypeError covers values such as a null score/confidence, which are a
        # malformed model answer, not an API failure.
        except (json.JSONDecodeError, ValueError, TypeError) as e:
            print(f"Error parsing LLM response for {filename}: {e}")
            print(f"Response was: {response_text}")

            # Fallback to basic analysis
            return {
                "score": min(30, len(truncated_diff) // 50),  # Very conservative fallback scoring
                "confidence": 0.3,
                "reason": f"LLM parsing failed: {str(e)}",
                "category": "general",
                **metadata,
            }

    except Exception as e:
        # Deliberately broad: one failed request must not abort a long batch run.
        print(f"Error calling OpenAI API for {filename}: {e}")

        # Fallback analysis
        return {
            "score": 0,
            "confidence": 0.1,
            "reason": f"API call failed: {str(e)}",
            "category": "general",
            **metadata,
        }

print("LLM-powered diff analysis function defined!")

LLM-powered diff analysis function defined!


In [33]:
def process_all_commits(limit=None):
    """
    Process all commits and analyze each diff.

    Reads ``repo, sha, title, diffs`` rows from the ``commits`` table through
    the module-level ``conn`` and calls ``analyze_diff_with_prompt`` once per
    file diff found in each commit's ``diffs`` JSON.

    Args:
        limit: Maximum number of commits to read. ``None`` (the default) reads
            every commit; ``0`` reads none.

    Returns:
        pandas.DataFrame with one row per analyzed file diff, holding ``repo``,
        ``sha``, ``commit_title``, ``filename``, ``diff_length`` and every key
        returned by ``analyze_diff_with_prompt``. Commits whose ``diffs`` value
        is NULL, is not valid JSON, or is not a JSON list are skipped with a
        printed message, as are list entries that are not JSON objects.
    """
    query = "SELECT repo, sha, title, diffs FROM commits"
    params = ()
    # `is not None` (not truthiness) so limit=0 means "no rows" instead of
    # silently running the full, paid analysis. The value is bound as a
    # parameter rather than formatted into the SQL text.
    if limit is not None:
        query += " LIMIT ?"
        params = (int(limit),)

    rows = conn.execute(query, params).fetchall()
    results = []

    for repo, sha, title, diffs_json in tqdm(rows, desc="Processing commits"):
        try:
            diffs = json.loads(diffs_json)
        except (json.JSONDecodeError, TypeError):
            # TypeError: the diffs column is NULL (None) for this commit.
            print(f"Error parsing JSON for commit {sha}")
            continue

        if not isinstance(diffs, list):
            print(f"Unexpected diffs structure for commit {sha}")
            continue

        # Analyze each file diff in the commit
        for diff_data in diffs:
            if not isinstance(diff_data, dict):
                print(f"Skipping malformed diff entry in commit {sha}")
                continue

            filename = diff_data.get('filename', 'unknown')
            diff_content = diff_data.get('diff', '')

            analysis = analyze_diff_with_prompt(filename, diff_content, title, repo)

            results.append({
                'repo': repo,
                'sha': sha,
                'commit_title': title,
                'filename': filename,
                'diff_length': len(diff_content) if diff_content else 0,
                **analysis  # Unpack all analysis results
            })

    return pd.DataFrame(results)

# Process a small sample first to test (keeps API cost low before a full run)
SAMPLE_COMMIT_LIMIT = 3

print(f"Processing first {SAMPLE_COMMIT_LIMIT} commits as a test...")
sample_df = process_all_commits(limit=SAMPLE_COMMIT_LIMIT)
print(f"Processed {len(sample_df)} file diffs from {SAMPLE_COMMIT_LIMIT} commits")

if sample_df.empty:
    # An empty result has no 'score' column; indexing it would raise KeyError.
    print("No file diffs were analyzed; skipping score summary.")
else:
    sample_scores = sample_df['score']
    print("Score distribution:")
    print(f"  High scores (70+): {(sample_scores >= 70).sum()}")
    print(f"  Medium scores (40-69): {((sample_scores >= 40) & (sample_scores < 70)).sum()}")
    print(f"  Low scores (0-39): {(sample_scores < 40).sum()}")
    print(f"Average score: {sample_scores.mean():.1f}")

sample_df.head()

Processing first 3 commits as a test...


Processing commits: 100%|██████████| 3/3 [00:10<00:00,  3.64s/it]

Processed 3 file diffs from 3 commits
Score distribution:
  High scores (70+): 0
  Medium scores (40-69): 0
  Low scores (0-39): 3
Average score: 35.0





Unnamed: 0,repo,sha,commit_title,filename,diff_length,score,confidence,category,reason,file_type,lines_added,lines_removed,total_changes
0,https://github.com/hm-group/gcp-projectfactory,88ffb0214cde5bbd88697444e89bf3fe84ec3bad,Merge pull request #15955 from hm-group/add-wo...,projects/trendie-d.yaml,192,35,0.9,configuration,The commit represents a minor configuration ch...,yaml,3,0,3
1,https://github.com/hm-group/gcp-projectfactory,b1cc2b81cd112bf50632dc3a71ef61082a1149d6,Merge pull request #16004 from hm-group/projec...,projects/ml-sa-exploration.yaml,384,35,0.9,configuration,This change is a simple configuration addition...,yaml,17,0,17
2,https://github.com/hm-group/gcp-projectfactory,adbe6673d057942e4f3e135140d4a8e074c10cfb,Adds requested project,projects/ml-sa-exploration.yaml,384,35,0.9,configuration,This commit involves the addition of a new pro...,yaml,17,0,17
