# Lab Assignment 3: Code Quality and Change Magnitude Analysis

**Course:** CS202 Software Tools and Techniques for CSE

## Objective
This lab builds upon Lab 2's dataset to analyze code quality metrics, change magnitude, and semantic similarities in bug-fixing commits. We use various tools including Radon for complexity analysis and CodeBERT for semantic similarity.

## Starting Point
We start from the file-level dataset produced in Lab 2, which contains commit information, file changes, and LLM inferences.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from collections import Counter, defaultdict
import os

# Repository analysis
from pydriller import Repository

# Code quality metrics
from radon.complexity import cc_visit
from radon.metrics import mi_visit
from radon.raw import analyze

# Semantic analysis
from transformers import AutoTokenizer, AutoModel
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Repository to analyze. Despite the `_url` suffix, the value is a bare
# directory name rather than a URL (presumably a local clone next to this
# notebook -- confirm). It is passed unchanged to pydriller's Repository(...)
# by the analysis functions below.
repo_url = 'PDFMathTranslate'

## Part A: Load Lab 2 Dataset

Starting with the file-level dataset from the previous lab.

In [None]:
# Prefer the real Lab 2 export. When that CSV is missing, fall back to a
# two-row placeholder frame so the rest of the notebook can still run.
# NOTE: the placeholder rows are hard-coded here; nothing is read from the
# repository even though the fallback message says so.
try:
    lab2_data = pd.read_csv("../Lab2/master_commits_dataset.csv")
except FileNotFoundError:
    print("Lab 2 dataset not found. Creating sample data from repository...")
    placeholder_rows = [
        {
            'hash': 'sample_hash_1',
            'filename': 'file1.py',
            'source_code_before': 'def old_func():\n    pass',
            'source_code_current': 'def new_func():\n    return True',
            'diff': 'sample diff 1',
            'llm_inference': 'fix type 1',
            'rectified_message': 'Fixed function',
        },
        {
            'hash': 'sample_hash_2',
            'filename': 'file2.py',
            'source_code_before': 'class OldClass:\n    pass',
            'source_code_current': 'class NewClass:\n    def method(self):\n        return 1',
            'diff': 'sample diff 2',
            'llm_inference': 'fix type 2',
            'rectified_message': 'Updated class',
        },
    ]
    lab2_data = pd.DataFrame(placeholder_rows)
else:
    # Only reached when the CSV was read successfully.
    print(f"Loaded Lab 2 dataset with {len(lab2_data)} entries")

print(f"Dataset columns: {list(lab2_data.columns)}")
print(f"Dataset shape: {lab2_data.shape}")

## Part B: Baseline Descriptive Statistics

Computing and reporting baseline statistics about the repository and commits.

In [None]:
def calculate_repository_stats(repo_url):
    """Walk every commit of a repository and summarize its file activity.

    Args:
        repo_url: Repository location, handed unchanged to pydriller's
            ``Repository``.

    Returns:
        dict with the keys:
            ``total_commits``: number of commits traversed.
            ``total_unique_files``: number of distinct file paths seen
                (``new_path`` when set, otherwise ``old_path``).
            ``total_file_modifications``: sum of modified-file counts over
                all commits.
            ``avg_files_per_commit`` / ``median_files_per_commit``: mean and
                median of the per-commit modified-file counts (0 when the
                repository yields no commits).
            ``most_common_extensions``: mapping of the ten most frequent file
                extensions (including the leading dot) to their counts.
    """
    files_per_commit = []
    extension_counts = Counter()
    unique_files = set()

    print("Analyzing repository...")

    for commit in Repository(repo_url).traverse_commits():
        # Read the property once per commit and reuse it for both the count
        # and the per-file loop.
        modified_files = commit.modified_files
        files_per_commit.append(len(modified_files))

        for modified_file in modified_files:
            file_path = modified_file.new_path or modified_file.old_path
            if not file_path:
                continue

            unique_files.add(file_path)

            # splitext() returns '' for dotfiles ('.gitignore') and for
            # extensionless files inside dotted directories ('v1.0/Makefile').
            # Skip those so they are not lumped together under an empty key.
            extension = os.path.splitext(file_path)[1]
            if extension:
                extension_counts[extension] += 1

    return {
        'total_commits': len(files_per_commit),
        'total_unique_files': len(unique_files),
        'total_file_modifications': sum(files_per_commit),
        'avg_files_per_commit': np.mean(files_per_commit) if files_per_commit else 0,
        'median_files_per_commit': np.median(files_per_commit) if files_per_commit else 0,
        'most_common_extensions': dict(extension_counts.most_common(10)),
    }

# Gather the repository-wide statistics once, then print a formatted report.
repo_stats = calculate_repository_stats(repo_url)

report_lines = [
    "\n=== Repository Statistics ===",
    f"Total commits: {repo_stats['total_commits']:,}",
    f"Total unique files: {repo_stats['total_unique_files']:,}",
    f"Total file modifications: {repo_stats['total_file_modifications']:,}",
    f"Average files per commit: {repo_stats['avg_files_per_commit']:.2f}",
    f"Median files per commit: {repo_stats['median_files_per_commit']:.2f}",
    "\nMost common file extensions:",
]
print("\n".join(report_lines))

# One indented line per extension, in the order stored in the stats dict.
for ext, count in repo_stats['most_common_extensions'].items():
    print(f"  {ext}: {count:,}")

In [None]:
# Persist the statistics as a one-row CSV (one column per stats key). The
# nested 'most_common_extensions' dict ends up in a single cell.
stats_csv_path = "repository_statistics.csv"
repo_stats_df = pd.DataFrame([repo_stats])
repo_stats_df.to_csv(stats_csv_path, index=False)
print(f"\nRepository statistics saved to '{stats_csv_path}'")

## Part C: Structural Metrics with Radon

Computing code quality metrics using Radon for before and after code versions.

In [None]:
def analyze_code_metrics(source_code):
    """Compute Radon structural metrics for one piece of source code.

    Args:
        source_code: Source text to measure; may be ``None`` or empty.

    Returns:
        dict with keys ``"MI"`` (result of ``mi_visit``), ``"CC"`` (sum of
        ``complexity`` over the blocks returned by ``cc_visit``) and ``"LOC"``
        (``loc`` attribute of the ``analyze`` result). All three values are
        ``None`` when the input is missing or whitespace-only, or when any
        Radon call raises (the error is printed, not propagated).
    """
    metric_names = ("MI", "CC", "LOC")

    # Nothing to measure: missing, empty or whitespace-only source.
    if not source_code:
        return dict.fromkeys(metric_names)
    if source_code.strip() == "":
        return dict.fromkeys(metric_names)

    try:
        # Second positional argument is passed straight to radon's mi_visit.
        maintainability = mi_visit(source_code, True)

        # Add up the complexity of every block Radon reports.
        complexity_sum = 0
        for block in cc_visit(source_code):
            complexity_sum += block.complexity

        line_count = analyze(source_code).loc
    except Exception as e:
        print(f"Error analyzing code metrics: {e}")
        return dict.fromkeys(metric_names)

    return dict(zip(metric_names, (maintainability, complexity_sum, line_count)))

def calculate_metric_changes(before_metrics, after_metrics):
    """Return the after-minus-before delta for each structural metric.

    Args:
        before_metrics: Mapping that may contain ``"MI"``, ``"CC"``, ``"LOC"``.
        after_metrics: Mapping with the same keys for the newer version.

    Returns:
        dict with keys ``"MI_Change"``, ``"CC_Change"`` and ``"LOC_Change"``.
        A delta is ``None`` whenever either side of it is missing or ``None``.
    """
    deltas = {}
    for name in ('MI', 'CC', 'LOC'):
        old_value = before_metrics.get(name)
        new_value = after_metrics.get(name)
        either_missing = old_value is None or new_value is None
        deltas[f"{name}_Change"] = None if either_missing else new_value - old_value
    return deltas

## Part D: Change Magnitude Metrics

Computing semantic similarity as the cosine similarity of CodeBERT embeddings, and token similarity as the BLEU score between the before and after versions of each file.

In [None]:
# Load the CodeBERT tokenizer and model used for the semantic-similarity
# embeddings; both come from the same checkpoint name.
print("Loading CodeBERT model...")
codebert_checkpoint = "microsoft/codebert-base"
codebert_tokenizer = AutoTokenizer.from_pretrained(codebert_checkpoint)
codebert_model = AutoModel.from_pretrained(codebert_checkpoint)

def get_codebert_embedding(code):
    """Embed a code snippet with the module-level CodeBERT model.

    Args:
        code: Source text to embed; may be ``None`` or empty.

    Returns:
        A NumPy array obtained by averaging ``last_hidden_state`` over
        dimension 1 and squeezing, or ``None`` when the input is missing or
        whitespace-only, or when tokenization/inference raises (the error is
        printed, not propagated).
    """
    if not code:
        return None
    if code.strip() == "":
        return None

    try:
        # Input is truncated to at most 512 tokens, so only the beginning of
        # a long file contributes to its embedding.
        encoded = codebert_tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )

        # Inference only: no gradients needed.
        with torch.no_grad():
            hidden_states = codebert_model(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1)
            return pooled.squeeze().numpy()
    except Exception as e:
        print(f"Error getting CodeBERT embedding: {e}")
        return None

def calculate_semantic_similarity(code1, code2):
    """Return the cosine similarity of two snippets' CodeBERT embeddings.

    Args:
        code1: First code snippet.
        code2: Second code snippet.

    Returns:
        The similarity as a Python ``float``, or ``None`` when either snippet
        could not be embedded.
    """
    # Both snippets are always embedded before checking for failures.
    vectors = [get_codebert_embedding(snippet) for snippet in (code1, code2)]
    if any(vector is None for vector in vectors):
        return None

    first, second = (torch.tensor(vector) for vector in vectors)
    cosine = torch.nn.functional.cosine_similarity(first, second, dim=0)
    return float(cosine.item())

def calculate_token_similarity(code1, code2):
    """Score the token overlap of two snippets with sentence-level BLEU.

    ``code1`` is the single reference and ``code2`` the candidate; both are
    tokenized by whitespace splitting.

    Args:
        code1: Reference code snippet.
        code2: Candidate code snippet.

    Returns:
        The BLEU score computed with ``SmoothingFunction().method1``, or
        ``None`` when either snippet is missing or has no tokens, or when
        scoring raises (the error is printed, not propagated).
    """
    if not (code1 and code2):
        return None

    try:
        reference_tokens = code1.split()
        candidate_tokens = code2.split()

        # Whitespace-only input yields no tokens to compare.
        if not (reference_tokens and candidate_tokens):
            return None

        smoothing = SmoothingFunction().method1
        return sentence_bleu(
            [reference_tokens],
            candidate_tokens,
            smoothing_function=smoothing,
        )
    except Exception as e:
        print(f"Error calculating BLEU score: {e}")
        return None

## Part E: Complete Analysis Pipeline

Running the complete analysis on repository data and generating comprehensive metrics.

In [None]:
def comprehensive_code_analysis(repo_url, output_file="comprehensive_analysis.csv", limit=100):
    """Collect quality and similarity metrics for the Python files of a repository.

    Commits are traversed in the order pydriller yields them. For every
    modified file whose path ends in ``.py`` one row is recorded with the
    Radon metrics of the before/after versions, their deltas, and the
    CodeBERT and BLEU similarities between the two versions.

    Args:
        repo_url: Repository location, handed unchanged to pydriller's
            ``Repository``.
        output_file: Path of the CSV file the rows are written to.
        limit: Maximum number of commits to traverse. Commits that touch no
            Python file still count towards it. ``None`` traverses every
            commit.

    Returns:
        pandas.DataFrame with one row per analyzed file. The columns are
        always present, even when no Python file was found, so downstream
        code can rely on the schema.
    """
    # Explicit schema: without it an empty result produces a frame with no
    # columns, and later column lookups (e.g. 'Semantic_Similarity') fail.
    columns = [
        "commit_hash", "file_path", "commit_message",
        "MI_Before", "CC_Before", "LOC_Before",
        "MI_After", "CC_After", "LOC_After",
        "MI_Change", "CC_Change", "LOC_Change",
        "Semantic_Similarity", "Token_Similarity",
    ]

    results = []
    processed = 0

    print(f"Starting comprehensive analysis (limit: {limit} commits)...")

    for commit in Repository(repo_url).traverse_commits():
        if limit is not None and processed >= limit:
            break

        for modified_file in commit.modified_files:
            file_path = modified_file.new_path or modified_file.old_path

            # Radon only understands Python, so skip everything else.
            if not file_path or not file_path.endswith('.py'):
                continue

            # Either side may be None (the helpers below treat missing
            # source as "no metrics").
            before_code = modified_file.source_code_before
            after_code = modified_file.source_code

            before_metrics = analyze_code_metrics(before_code)
            after_metrics = analyze_code_metrics(after_code)
            metric_changes = calculate_metric_changes(before_metrics, after_metrics)

            semantic_sim = calculate_semantic_similarity(before_code or "", after_code or "")
            token_sim = calculate_token_similarity(before_code or "", after_code or "")

            results.append({
                "commit_hash": commit.hash,
                "file_path": file_path,
                "commit_message": commit.msg,

                # Before metrics
                "MI_Before": before_metrics["MI"],
                "CC_Before": before_metrics["CC"],
                "LOC_Before": before_metrics["LOC"],

                # After metrics
                "MI_After": after_metrics["MI"],
                "CC_After": after_metrics["CC"],
                "LOC_After": after_metrics["LOC"],

                # Changes
                "MI_Change": metric_changes["MI_Change"],
                "CC_Change": metric_changes["CC_Change"],
                "LOC_Change": metric_changes["LOC_Change"],

                # Similarity metrics
                "Semantic_Similarity": semantic_sim,
                "Token_Similarity": token_sim,
            })

        processed += 1

        if processed % 10 == 0:
            print(f"Processed {processed} commits...")

    # Create DataFrame (fixed column order) and save
    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_file, index=False)

    print(f"\nAnalysis complete! Saved {len(df)} entries to '{output_file}'")
    return df

# Run the pipeline on the first 50 commits and keep the per-file results;
# the same rows are also written to code_quality_analysis.csv.
analysis_df = comprehensive_code_analysis(
    repo_url,
    output_file="code_quality_analysis.csv",
    limit=50,
)

## Part F: Classification & Agreement Analysis

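Each change is classified as Minor when its similarity score is at or above a threshold (0.7 by default for both metrics) and as Major otherwise; we then measure how often the semantic and token classifications agree.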

In [None]:
def classify_changes(df, semantic_threshold=0.7, token_threshold=0.7):
    """Label each change as Major/Minor per similarity metric and flag agreement.

    A change is ``'Minor'`` when its similarity is at or above the threshold
    and ``'Major'`` otherwise. A missing similarity (``None``/NaN) cannot
    satisfy the comparison and is therefore labelled ``'Major'``.

    Args:
        df: DataFrame with ``'Semantic_Similarity'`` and
            ``'Token_Similarity'`` columns. It is modified in place.
        semantic_threshold: Minimum semantic similarity for ``'Minor'``.
        token_threshold: Minimum token similarity for ``'Minor'``.

    Returns:
        The same DataFrame with three added columns: ``'Semantic_Class'``,
        ``'Token_Class'`` and ``'Classes_Agree'`` (``'YES'``/``'NO'``).
    """
    def label(scores, threshold):
        # Coerce to float so None and NaN are handled uniformly; comparing
        # NaN with >= is False, which maps to 'Major'.
        numeric = pd.to_numeric(scores, errors="coerce")
        return np.where(numeric >= threshold, 'Minor', 'Major')

    # Whole-column comparisons instead of per-row lambdas: faster, and
    # well-defined for a frame with zero rows.
    df['Semantic_Class'] = label(df['Semantic_Similarity'], semantic_threshold)
    df['Token_Class'] = label(df['Token_Similarity'], token_threshold)

    same_label = df['Semantic_Class'] == df['Token_Class']
    df['Classes_Agree'] = np.where(same_label, 'YES', 'NO')

    return df

# Add the Major/Minor labels and the agreement flag to the results.
analysis_df = classify_changes(analysis_df)

# Print the value distribution of each classification column in turn.
print("\n=== Classification Results ===")
for heading, column in (
    ("Semantic Classification Distribution:", 'Semantic_Class'),
    ("\nToken Classification Distribution:", 'Token_Class'),
    ("\nClassification Agreement:", 'Classes_Agree'),
):
    print(heading)
    print(analysis_df[column].value_counts())

# Share of rows (in percent) where both classifications gave the same label.
agreement_rate = analysis_df['Classes_Agree'].eq('YES').mean() * 100
print(f"\nOverall Agreement Rate: {agreement_rate:.2f}%")

## Visualization and Analysis

In [None]:
# 3x3 dashboard: seven histograms, one agreement pie chart and one scatter plot.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
fig.suptitle('Code Quality and Similarity Analysis', fontsize=16)

# Only rows without any missing value (in any column) are plotted.
valid_data = analysis_df.dropna()

# (grid position, column, colour, title, x-axis label) for each histogram.
histogram_panels = [
    ((0, 0), 'MI_Before', 'blue', 'MI Before Distribution', 'Maintainability Index'),
    ((0, 1), 'MI_After', 'green', 'MI After Distribution', 'Maintainability Index'),
    ((0, 2), 'MI_Change', 'red', 'MI Change Distribution', 'MI Change'),
    ((1, 0), 'CC_Change', 'purple', 'CC Change Distribution', 'Complexity Change'),
    ((1, 1), 'LOC_Change', 'orange', 'LOC Change Distribution', 'LOC Change'),
    ((1, 2), 'Semantic_Similarity', 'cyan', 'Semantic Similarity Distribution', 'Semantic Similarity'),
    ((2, 0), 'Token_Similarity', 'magenta', 'Token Similarity Distribution', 'Token Similarity'),
]

if valid_data.empty:
    print("No valid data available for plotting")
else:
    available = valid_data.columns

    for position, column, colour, title, xlabel in histogram_panels:
        if column not in available:
            continue
        panel = axes[position]
        panel.hist(valid_data[column].dropna(), bins=20, alpha=0.7, color=colour)
        panel.set_title(title)
        panel.set_xlabel(xlabel)

    # Pie chart of YES/NO agreement between the two classifications.
    if 'Classes_Agree' in available:
        agreement_counts = valid_data['Classes_Agree'].value_counts()
        pie_panel = axes[2, 1]
        pie_panel.pie(agreement_counts.values, labels=agreement_counts.index, autopct='%1.1f%%')
        pie_panel.set_title('Classification Agreement')

    # Scatter of the two similarity scores against each other.
    if 'Semantic_Similarity' in available and 'Token_Similarity' in available:
        semantic_scores = valid_data['Semantic_Similarity'].dropna()
        token_scores = valid_data['Token_Similarity'].dropna()

        # Keep only the rows present in both series so the points line up.
        shared_rows = semantic_scores.index.intersection(token_scores.index)
        if len(shared_rows) > 0:
            scatter_panel = axes[2, 2]
            scatter_panel.scatter(semantic_scores[shared_rows], token_scores[shared_rows], alpha=0.6)
            scatter_panel.set_xlabel('Semantic Similarity')
            scatter_panel.set_ylabel('Token Similarity')
            scatter_panel.set_title('Semantic vs Token Similarity')
            # Dashed y = x reference line.
            scatter_panel.plot([0, 1], [0, 1], 'r--', alpha=0.5)

plt.tight_layout()
plt.show()

In [None]:
# Write the classified results to disk before summarizing them.
analysis_df.to_csv("final_code_analysis_results.csv", index=False)

print("\n=== Final Analysis Summary ===")
print(f"Total files analyzed: {len(analysis_df)}")
# Rows with no missing value in any column.
print(f"Files with valid metrics: {len(analysis_df.dropna())}")

# Pairwise correlations between all numeric columns.
numeric_cols = analysis_df.select_dtypes(include=[np.number]).columns
correlation_matrix = analysis_df[numeric_cols].corr()

print("\nCorrelations with Semantic Similarity:")
if 'Semantic_Similarity' in correlation_matrix.columns:
    semantic_corr = correlation_matrix['Semantic_Similarity'].sort_values(ascending=False)
    # Skip the self-correlation and any undefined (NaN) entries.
    reportable = semantic_corr.drop(labels='Semantic_Similarity').dropna()
    for col, corr in reportable.items():
        print(f"  {col}: {corr:.3f}")

print("\nDataset saved as 'final_code_analysis_results.csv'")

## Results Summary

This notebook provides a comprehensive analysis of code changes including:

### Key Metrics Computed:
1. **Structural Metrics (Radon)**:
   - Maintainability Index (MI)
   - Cyclomatic Complexity (CC)
   - Lines of Code (LOC)

2. **Change Magnitude Metrics**:
   - Semantic Similarity (CodeBERT)
   - Token Similarity (BLEU)

3. **Classification**:
   - Major vs Minor changes based on similarity thresholds
   - Agreement analysis between different classification methods

### Key Datasets Generated:
- `repository_statistics.csv`: Overall repository statistics
- `code_quality_analysis.csv`: Detailed code quality metrics
- `final_code_analysis_results.csv`: Complete analysis with classifications

### Applications:
- Code quality assessment
- Change impact analysis
- Software evolution studies
- Automated code review systems
- Technical debt assessment