## load and clean data

In [2]:
import pandas as pd
import numpy as np


# Load the dataset
df = pd.read_excel('scn_appeal_cases_data.xlsx')

print("=== RAW DATA OVERVIEW ===")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Check first few rows
print("\n=== FIRST 5 ROWS ===")
print(df.head())

# Check data types
print("\n=== DATA TYPES ===")
print(df.dtypes)

# Check missing values
print("\n=== MISSING VALUES ===")
missing_info = df.isnull().sum()
print(missing_info)

# Check for 'Missing' as a string value
print("\n=== 'Missing' STRING VALUES ===")
for col in df.columns:
    if df[col].dtype == 'object':
        missing_count = (df[col] == 'Missing').sum()
        if missing_count > 0:
            print(f"{col}: {missing_count} 'Missing' values")

# Check for negative values in numeric columns
print("\n=== NEGATIVE VALUES CHECK ===")
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    negative_count = (df[col] < 0).sum()
    if negative_count > 0:
        print(f"{col}: {negative_count} negative values")




=== RAW DATA OVERVIEW ===
Dataset shape: (4696, 14)
Columns: ['appeal_district', 'trial_district', 'offence', 'sentence', 'no_complainant', 'no_male_complainant', 'no_female_complainant', 'no_appealant', 'no_male_appealant', 'no_female_appealant', 'no_public_witness', 'no_eye_witness', 'no_defense_witness', 'scn_decision']

=== FIRST 5 ROWS ===
  appeal_district trial_district              offence          sentence  \
0      North-West     North-West    Rape_sexual_abuse       Prison_term   
1      South-East     South-East        Armed_robbery             Death   
2     South-South    South-South      Damage_property  Prison_term_fine   
3     South-South    South-South  Murder_manslaughter             Death   
4      South-East        Missing              Missing   Payment_damages   

   no_complainant  no_male_complainant  no_female_complainant  no_appealant  \
0               1                    0                      1             1   
1               1                    1      

## Clean the data

In [3]:
print("\n=== STARTING DATA CLEANING ===")
print(f"Original dataset: {len(df)} cases")

# Create a copy for cleaning
df_clean = df.copy()

# 1. Replace 'Missing' strings with NaN
df_clean = df_clean.replace('Missing', np.nan)

# 2. Fix negative values in witness/people count columns
witness_columns = [
    'no_complainant', 'no_male_complainant', 'no_female_complainant',
    'no_appealant', 'no_male_appealant', 'no_female_appealant', 
    'no_public_witness', 'no_eye_witness', 'no_defense_witness'
]

print("\nFixing negative values in witness columns...")
for col in witness_columns:
    negative_count = (df_clean[col] < 0).sum()
    if negative_count > 0:
        print(f"  {col}: Converting {negative_count} negative values to 0")
        df_clean[col] = df_clean[col].apply(lambda x: max(0, x) if pd.notna(x) else 0)

# 3. Check missing values after initial cleaning
print("\n=== MISSING VALUES AFTER CLEANING ===")
missing_after = df_clean.isnull().sum()
for col, missing_count in missing_after.items():
    if missing_count > 0:
        percentage = (missing_count / len(df_clean)) * 100
        print(f"{col}: {missing_count} missing ({percentage:.1f}%)")

# 4. Create model-ready dataset (remove cases with missing critical data)
critical_columns = ['appeal_district', 'offence', 'scn_decision']
print(f"\n=== FILTERING FOR CRITICAL COLUMNS ===")
print(f"Critical columns for modeling: {critical_columns}")

df_model_ready = df_clean.dropna(subset=critical_columns)

print(f"Cases after removing missing critical data: {len(df_model_ready)}")
print(f"Removed: {len(df_clean) - len(df_model_ready)} cases")

# 5. Final data quality check
print("\n=== FINAL DATA QUALITY CHECK ===")
print(f"Final dataset shape: {df_model_ready.shape}")

# Check outcome distribution
print("\nOutcome distribution:")
outcome_counts = df_model_ready['scn_decision'].value_counts()
for outcome, count in outcome_counts.items():
    percentage = (count / len(df_model_ready)) * 100
    print(f"  {outcome}: {count} cases ({percentage:.1f}%)")

# Check regions
print("\nRegions available:")
regions = df_model_ready['appeal_district'].value_counts()
for region, count in regions.items():
    percentage = (count / len(df_model_ready)) * 100
    print(f"  {region}: {count} cases ({percentage:.1f}%)")

# Check offense types
print(f"\nNumber of offense types: {df_model_ready['offence'].nunique()}")
print("Top 10 offense types:")
top_offenses = df_model_ready['offence'].value_counts().head(10)
for offense, count in top_offenses.items():
    print(f"  {offense}: {count} cases")


=== STARTING DATA CLEANING ===
Original dataset: 4696 cases

Fixing negative values in witness columns...
  no_complainant: Converting 2131 negative values to 0
  no_male_complainant: Converting 3371 negative values to 0
  no_female_complainant: Converting 3445 negative values to 0
  no_appealant: Converting 1158 negative values to 0
  no_male_appealant: Converting 2672 negative values to 0
  no_female_appealant: Converting 3083 negative values to 0
  no_public_witness: Converting 1680 negative values to 0
  no_eye_witness: Converting 1711 negative values to 0
  no_defense_witness: Converting 1724 negative values to 0

=== MISSING VALUES AFTER CLEANING ===
appeal_district: 2742 missing (58.4%)
trial_district: 1761 missing (37.5%)
offence: 578 missing (12.3%)
sentence: 2018 missing (43.0%)

=== FILTERING FOR CRITICAL COLUMNS ===
Critical columns for modeling: ['appeal_district', 'offence', 'scn_decision']
Cases after removing missing critical data: 1718
Removed: 2978 cases

=== FINAL D

## save cleaned data

In [4]:
# Save the cleaned dataset
df_model_ready.to_csv('cleaned_supreme_court_data.csv', index=False)
print(f"\n✅ Cleaned data saved to 'cleaned_supreme_court_data.csv'")
print(f"✅ Ready for modeling with {len(df_model_ready)} clean cases")

# Quick summary
print("\n=== CLEANING SUMMARY ===")
print(f"Original cases: {len(df)}")
print(f"After cleaning: {len(df_model_ready)}")
print(f"Success rate: {(df_model_ready['scn_decision'] == 'Granted').mean():.1%}")
print(f"Data quality: {len(df_model_ready)/len(df):.1%} of original data retained")


✅ Cleaned data saved to 'cleaned_supreme_court_data.csv'
✅ Ready for modeling with 1718 clean cases

=== CLEANING SUMMARY ===
Original cases: 4696
After cleaning: 1718
Success rate: 29.5%
Data quality: 36.6% of original data retained


## GENERATE KEY INSIGHTS FOR COMPULAW AI

In [7]:
df_clean = pd.read_csv('cleaned_supreme_court_data.csv')

print("=== COMPULAW AI: LEGAL INTELLIGENCE INSIGHTS ===")
print(f"Analyzing {len(df_clean)} clean Supreme Court cases")

# =====================================================
# INSIGHT 1: SUCCESS RATES BY OFFENSE TYPE
# =====================================================

print("\n🏆 1. BEST PERFORMING OFFENSE TYPES (50+ cases):")
print("=" * 60)

# Calculate success rates by offense
offense_analysis = df_clean.groupby('offence').agg({
    'scn_decision': ['count', lambda x: (x == 'Granted').sum()]
}).round(1)

# Flatten column names
offense_analysis.columns = ['total_cases', 'granted_cases']
offense_analysis['success_rate'] = (offense_analysis['granted_cases'] / offense_analysis['total_cases'] * 100).round(1)

# Filter for meaningful sample sizes and sort by success rate
significant_offenses = offense_analysis[offense_analysis['total_cases'] >= 50].sort_values('success_rate', ascending=False)

print("Offense Type                          | Success Rate | Total Cases")
print("-" * 65)
for offense, data in significant_offenses.head(10).iterrows():
    print(f"{offense:35} | {data['success_rate']}% | {data['total_cases']}")

=== COMPULAW AI: LEGAL INTELLIGENCE INSIGHTS ===
Analyzing 1718 clean Supreme Court cases

🏆 1. BEST PERFORMING OFFENSE TYPES (50+ cases):
Offense Type                          | Success Rate | Total Cases
-----------------------------------------------------------------
Trespassing                         | 38.7% | 93.0
Theft                               | 37.8% | 111.0
Others                              | 32.2% | 152.0
Civil_petition                      | 32.2% | 87.0
Election_petition                   | 30.6% | 62.0
Unlawful_possession                 | 28.3% | 60.0
Murder_manslaughter                 | 28.0% | 225.0
Dispute                             | 27.5% | 495.0
Law_of_tort                         | 27.0% | 63.0
Armed_robbery                       | 24.6% | 122.0


In [8]:
print(f"\n🗺️  2. REGIONAL SUCCESS PATTERNS:")
print("=" * 50)

# Calculate success rates by region
regional_analysis = df_clean.groupby('appeal_district').agg({
    'scn_decision': ['count', lambda x: (x == 'Granted').sum()]
}).round(1)

regional_analysis.columns = ['total_cases', 'granted_cases']
regional_analysis['success_rate'] = (regional_analysis['granted_cases'] / regional_analysis['total_cases'] * 100).round(1)

# Sort by success rate
regional_sorted = regional_analysis.sort_values('success_rate', ascending=False)

print("Region                | Success Rate | Total Cases")
print("-" * 45)
for region, data in regional_sorted.iterrows():
    print(f"{region[:18]:<18} | {data['success_rate']:>8}%   | {data['total_cases']:>9}")


🗺️  2. REGIONAL SUCCESS PATTERNS:
Region                | Success Rate | Total Cases
---------------------------------------------
North-East         |     45.5%   |      11.0
South-South        |     32.8%   |     408.0
North-Central      |     32.0%   |     100.0
South-West         |     30.9%   |     670.0
South-East         |     27.6%   |     232.0
North-West         |     24.2%   |     157.0
FCT                |     19.3%   |     140.0


In [9]:
print(f"\n👁️  3. WITNESS IMPACT ON SUCCESS RATES:")
print("=" * 50)

# Eye witness impact (cases with 20+ occurrences)
eye_witness_analysis = df_clean.groupby('no_eye_witness').agg({
    'scn_decision': ['count', lambda x: (x == 'Granted').sum()]
}).round(1)

eye_witness_analysis.columns = ['total_cases', 'granted_cases']
eye_witness_analysis['success_rate'] = (eye_witness_analysis['granted_cases'] / eye_witness_analysis['total_cases'] * 100).round(1)

# Filter for meaningful sample sizes
significant_witness_counts = eye_witness_analysis[eye_witness_analysis['total_cases'] >= 20].sort_index()

print("Eye Witnesses | Success Rate | Total Cases")
print("-" * 40)
for witnesses, data in significant_witness_counts.iterrows():
    print(f"{witnesses:>11}   | {data['success_rate']:>8}%   | {data['total_cases']:>9}")




👁️  3. WITNESS IMPACT ON SUCCESS RATES:
Eye Witnesses | Success Rate | Total Cases
----------------------------------------
          0   |     29.8%   |    1662.0
          1   |     20.8%   |      24.0


In [10]:
print(f"\n⚖️  4. SUCCESS RATES BY SENTENCE TYPE:")
print("=" * 50)

# Analyze success by sentence type (excluding missing data)
sentence_data = df_clean[df_clean['sentence'].notna()]

sentence_analysis = sentence_data.groupby('sentence').agg({
    'scn_decision': ['count', lambda x: (x == 'Granted').sum()]
}).round(1)

sentence_analysis.columns = ['total_cases', 'granted_cases']
sentence_analysis['success_rate'] = (sentence_analysis['granted_cases'] / sentence_analysis['total_cases'] * 100).round(1)

# Filter for meaningful sample sizes
significant_sentences = sentence_analysis[sentence_analysis['total_cases'] >= 20].sort_values('success_rate', ascending=False)

print("Sentence Type              | Success Rate | Total Cases")
print("-" * 55)
for sentence, data in significant_sentences.head(8).iterrows():
    print(f"{sentence[:25]:<25} | {data['success_rate']:>8}%   | {data['total_cases']:>9}")




⚖️  4. SUCCESS RATES BY SENTENCE TYPE:
Sentence Type              | Success Rate | Total Cases
-------------------------------------------------------
Appeal_granted            |     45.3%   |     137.0
Others                    |     42.4%   |      33.0
Fine                      |     33.6%   |     107.0
Appeal_dismissed          |     30.2%   |      63.0
Prison_term               |     29.1%   |     148.0
Payment_damages           |     24.6%   |     272.0
Death                     |     24.1%   |     266.0


In [11]:
print(f"\n🧠 5. KEY PATTERNS FOR COMPULAW AI:")
print("=" * 50)

# Best offense types for appeals
best_offense = significant_offenses.head(1)
worst_offense = significant_offenses.tail(1)

best_region = regional_sorted.head(1)
worst_region = regional_sorted.tail(1)

print(f"✅ HIGHEST SUCCESS RATE:")
for offense, data in best_offense.iterrows():
    print(f"   Offense: {offense} ({data['success_rate']}% success)")
for region, data in best_region.iterrows():
    print(f"   Region: {region} ({data['success_rate']}% success)")

print(f"\n❌ LOWEST SUCCESS RATE:")
for offense, data in worst_offense.iterrows():
    print(f"   Offense: {offense} ({data['success_rate']}% success)")
for region, data in worst_region.iterrows():
    print(f"   Region: {region} ({data['success_rate']}% success)")

# Witness pattern
eye_witness_pattern = significant_witness_counts['success_rate']
if len(eye_witness_pattern) > 1:
    witness_trend = "increases" if eye_witness_pattern.iloc[-1] > eye_witness_pattern.iloc[0] else "decreases"
    print(f"\n👁️  WITNESS PATTERN: Success rate generally {witness_trend} with more eye witnesses")

# Overall insight
total_cases = len(df_clean)
total_granted = (df_clean['scn_decision'] == 'Granted').sum()
overall_success = (total_granted / total_cases * 100)

print(f"\n📊 OVERALL BASELINE:")
print(f"   Total Cases Analyzed: {total_cases:,}")
print(f"   Overall Success Rate: {overall_success:.1f}%")
print(f"   Cases Granted: {total_granted:,}")
print(f"   Cases Dismissed: {total_cases - total_granted:,}")

print(f"\n✅ INSIGHTS GENERATION COMPLETE!")
print(f"Ready for Step 3: Building the prediction model")


🧠 5. KEY PATTERNS FOR COMPULAW AI:
✅ HIGHEST SUCCESS RATE:
   Offense: Trespassing (38.7% success)
   Region: North-East (45.5% success)

❌ LOWEST SUCCESS RATE:
   Offense: Armed_robbery (24.6% success)
   Region: FCT (19.3% success)

👁️  WITNESS PATTERN: Success rate generally decreases with more eye witnesses

📊 OVERALL BASELINE:
   Total Cases Analyzed: 1,718
   Overall Success Rate: 29.5%
   Cases Granted: 507
   Cases Dismissed: 1,211

✅ INSIGHTS GENERATION COMPLETE!
Ready for Step 3: Building the prediction model


## BUILD THE PREDICTION MODEL

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

print("=== STEP 3: BUILDING COMPULAW AI PREDICTION MODEL ===")

# Load cleaned data
df_clean = pd.read_csv('cleaned_supreme_court_data.csv')
print(f"Building model with {len(df_clean)} cases")

=== STEP 3: BUILDING COMPULAW AI PREDICTION MODEL ===
Building model with 1718 cases


In [13]:
print("\n1. PREPARING DATA FOR ML...")

# Create label encoders for categorical variables
encoders = {}
df_ml = df_clean.copy()

# Encode categorical features
categorical_features = ['offence', 'appeal_district', 'trial_district', 'sentence']

for feature in categorical_features:
    # Handle missing values first
    df_ml[feature] = df_ml[feature].fillna('Unknown')
    
    # Create and fit encoder
    encoders[feature] = LabelEncoder()
    df_ml[f'{feature}_encoded'] = encoders[feature].fit_transform(df_ml[feature])
    
    print(f"   {feature}: {len(encoders[feature].classes_)} unique values")

# Select features for the model
feature_columns = [
    'offence_encoded', 'appeal_district_encoded', 'trial_district_encoded', 'sentence_encoded',
    'no_complainant', 'no_male_complainant', 'no_female_complainant',
    'no_appealant', 'no_male_appealant', 'no_female_appealant', 
    'no_public_witness', 'no_eye_witness', 'no_defense_witness'
]

# Prepare feature matrix and target
X = df_ml[feature_columns]
y = df_ml['scn_decision']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")

# =====================================================
# SPLIT DATA AND TRAIN MODEL
# =====================================================

print("\n2. TRAINING THE MODEL...")

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)} cases")
print(f"Test set: {len(X_test)} cases")

# Train Random Forest model
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2
)

model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

# =====================================================
# EVALUATE MODEL PERFORMANCE
# =====================================================

print("\n3. MODEL PERFORMANCE:")
print("=" * 40)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")

# Show detailed classification report
print("\nDetailed Performance:")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print("                Predicted")
print("                Dismissed  Granted")
print(f"Actual Dismissed    {cm[0,0]:>6}    {cm[0,1]:>6}")
print(f"Actual Granted      {cm[1,0]:>6}    {cm[1,1]:>6}")

# =====================================================
# FEATURE IMPORTANCE ANALYSIS
# =====================================================

print("\n4. MOST IMPORTANT FACTORS:")
print("=" * 40)

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

# Map encoded features back to readable names
feature_mapping = {
    'offence_encoded': 'Offense Type',
    'appeal_district_encoded': 'Appeal Region', 
    'trial_district_encoded': 'Trial Region',
    'sentence_encoded': 'Original Sentence',
    'no_eye_witness': 'Number of Eye Witnesses',
    'no_public_witness': 'Number of Public Witnesses',
    'no_defense_witness': 'Number of Defense Witnesses',
    'no_complainant': 'Number of Complainants',
    'no_appealant': 'Number of Appellants'
}

print("Factor                          | Importance Score")
print("-" * 50)
for _, row in feature_importance.head(8).iterrows():
    readable_name = feature_mapping.get(row['feature'], row['feature'])
    print(f"{readable_name[:30]:<30} | {row['importance']:.3f}")


1. PREPARING DATA FOR ML...
   offence: 21 unique values
   appeal_district: 7 unique values
   trial_district: 8 unique values
   sentence: 11 unique values

Feature matrix shape: (1718, 13)
Target distribution: {'Dismissed': 1211, 'Granted': 507}

2. TRAINING THE MODEL...
Training set: 1374 cases
Test set: 344 cases

3. MODEL PERFORMANCE:
Overall Accuracy: 0.698 (69.8%)

Detailed Performance:
              precision    recall  f1-score   support

   Dismissed       0.71      0.98      0.82       242
     Granted       0.38      0.03      0.05       102

    accuracy                           0.70       344
   macro avg       0.54      0.50      0.44       344
weighted avg       0.61      0.70      0.59       344


Confusion Matrix:
                Predicted
                Dismissed  Granted
Actual Dismissed       237         5
Actual Granted          99         3

4. MOST IMPORTANT FACTORS:
Factor                          | Importance Score
-----------------------------------------

In [14]:
print("\n5. TESTING WITH EXAMPLE CASES:")
print("=" * 40)

def predict_case_outcome(case_details):
    """Predict outcome for a specific case"""
    
    # Create a single row dataframe
    case_df = pd.DataFrame([case_details])
    
    # Encode categorical features
    for feature in categorical_features:
        if feature in case_df.columns:
            case_df[feature] = case_df[feature].fillna('Unknown')
            try:
                case_df[f'{feature}_encoded'] = encoders[feature].transform(case_df[feature])
            except ValueError:
                # Handle unseen categories
                case_df[f'{feature}_encoded'] = 0
    
    # Select features and predict
    case_features = case_df[feature_columns].fillna(0)
    prediction = model.predict(case_features)[0]
    probability = model.predict_proba(case_features)[0]
    
    # Get confidence score
    confidence = max(probability)
    
    return prediction, confidence

# Test case 1: High-success case type
print("\nExample 1 - Trespassing case (should have higher success chance):")
test_case_1 = {
    'offence': 'Trespassing',
    'appeal_district': 'North-East',
    'trial_district': 'North-East', 
    'sentence': 'Fine',
    'no_complainant': 1,
    'no_male_complainant': 1,
    'no_female_complainant': 0,
    'no_appealant': 1,
    'no_male_appealant': 1,
    'no_female_appealant': 0,
    'no_public_witness': 2,
    'no_eye_witness': 1,
    'no_defense_witness': 2
}

prediction_1, confidence_1 = predict_case_outcome(test_case_1)
print(f"  Predicted: {prediction_1}")
print(f"  Confidence: {confidence_1:.1%}")

# Test case 2: Low-success case type  
print("\nExample 2 - Armed robbery case (should have lower success chance):")
test_case_2 = {
    'offence': 'Armed_robbery',
    'appeal_district': 'FCT',
    'trial_district': 'FCT',
    'sentence': 'Death',
    'no_complainant': 1,
    'no_male_complainant': 1,
    'no_female_complainant': 0,
    'no_appealant': 1,
    'no_male_appealant': 1, 
    'no_female_appealant': 0,
    'no_public_witness': 3,
    'no_eye_witness': 2,
    'no_defense_witness': 0
}

prediction_2, confidence_2 = predict_case_outcome(test_case_2)
print(f"  Predicted: {prediction_2}")
print(f"  Confidence: {confidence_2:.1%}")


5. TESTING WITH EXAMPLE CASES:

Example 1 - Trespassing case (should have higher success chance):
  Predicted: Dismissed
  Confidence: 77.2%

Example 2 - Armed robbery case (should have lower success chance):
  Predicted: Dismissed
  Confidence: 68.4%


In [15]:
import pickle

# Save the trained model and encoders
model_data = {
    'model': model,
    'encoders': encoders,
    'feature_columns': feature_columns,
    'feature_mapping': feature_mapping
}

with open('compulaw_ai_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print(f"\n✅ MODEL TRAINING COMPLETE!")
print(f"✅ Model saved to 'compulaw_ai_model.pkl'")
print(f"✅ Model accuracy: {accuracy:.1%}")
print(f"✅ Ready for Step 4: Building the web interface!")


✅ MODEL TRAINING COMPLETE!
✅ Model saved to 'compulaw_ai_model.pkl'
✅ Model accuracy: 69.8%
✅ Ready for Step 4: Building the web interface!
