In [1]:
"""
Test script to evaluate institution lookup service performance.
Tests the first N institutions from the database (N is LookupTester's `limit`:
300 by default, 200 when run via main()) and calculates accuracy metrics.
"""
import pandas as pd
import time
from datetime import datetime
from dotenv import load_dotenv
from services.institution_lookup_service import InstitutionLookupService
from database.queries import QueryService
import json

# Load environment variables via python-dotenv at import time, i.e. before
# any of the services used below are instantiated.
load_dotenv()

class LookupTester:
    """Benchmark the institution lookup service against the institution table.

    The tester loads the first ``limit`` institutions, runs a lookup for each
    one by name, compares the predicted type/country fields with the stored
    values, and reports accuracy, coverage and confidence statistics.

    Attributes:
        limit: Number of institutions requested from the database.
        query_service: ``QueryService`` used to read the institution table.
        lookup_service: ``InstitutionLookupService`` under test.
        results: Per-institution result dicts produced by ``test_institution``.
    """

    # One entry per compared field:
    # (database column, short key used in result/summary keys,
    #  attribute name on the lookup result object)
    _FIELDS = (
        ('institution_type_layer1', 'type1', 'institution_type_layer1'),
        ('institution_type_layer2', 'type2', 'institution_type_layer2'),
        ('institution_type_layer3', 'type3', 'institution_type_layer3'),
        ('country_parent', 'country_parent', 'parent_country'),
        ('country_sub', 'country_sub', 'subsidiary_country'),
    )

    # Every outcome _compare_values can return.
    _MATCH_STATUSES = ('match', 'mismatch', 'no_prediction', 'no_actual', 'both_null')

    # Confidence thresholds for the High / Medium / Low buckets.
    _HIGH_CONFIDENCE = 0.7
    _MEDIUM_CONFIDENCE = 0.4

    def __init__(self, limit=300):
        """Create the services and collect the set of valid countries.

        Args:
            limit: How many institutions ``load_institutions`` should request.
        """
        self.limit = limit
        self.query_service = QueryService()

        # Load all institutions to get valid countries
        print("Loading institution table to extract valid countries...")
        all_institutions = self.query_service.get_table_data('institution', limit=None)

        # Union of the non-null values of both country columns (when present).
        valid_countries = set()
        for column in ('country_sub', 'country_parent'):
            if column in all_institutions.columns:
                valid_countries.update(all_institutions[column].dropna().unique())

        print(f"Found {len(valid_countries)} unique countries in institution table")

        self.lookup_service = InstitutionLookupService(valid_countries=list(valid_countries))
        self.results = []

    def load_institutions(self):
        """Load the first ``self.limit`` institutions from the database.

        Returns:
            The table restricted to the name column plus the type and country
            columns that are compared against the lookup output.

        Raises:
            KeyError: If any of the expected columns is missing from the table.
        """
        print(f"Loading first {self.limit} institutions from database...")
        df = self.query_service.get_table_data('institution', limit=self.limit)

        # Select relevant columns
        columns = [
            'institution_cpi',
            'institution_type_layer1',
            'institution_type_layer2',
            'institution_type_layer3',
            'country_sub',
            'country_parent',
        ]

        df = df[columns]
        print(f"Loaded {len(df)} institutions")
        return df

    def test_institution(self, row):
        """Run the lookup for one institution and compare it with the stored row.

        Args:
            row: Mapping-like record (e.g. a DataFrame row) holding
                ``institution_cpi`` and the database columns listed in
                ``_FIELDS``.

        Returns:
            A dict with the name, lookup time, confidence, actual and predicted
            values, one ``*_match`` status per field, the reasoning and the
            number of sources. If anything raises, a reduced dict carrying an
            ``error`` message is returned instead.
        """
        institution_name = row['institution_cpi']

        print(f"\nTesting: {institution_name}")

        try:
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()
            result = self.lookup_service.lookup_institution(institution_name)
            elapsed_time = time.perf_counter() - start_time

            comparison = {
                'institution_name': institution_name,
                'lookup_time_seconds': round(elapsed_time, 2),
                'confidence_score': result.confidence_score,
            }

            # Three passes keep the key (and therefore CSV column) order:
            # all actual values, then all predictions, then all match statuses.
            for db_column, key, _ in self._FIELDS:
                comparison[f'actual_{key}'] = row[db_column]
            for _, key, result_attr in self._FIELDS:
                comparison[f'predicted_{key}'] = getattr(result, result_attr)
            for db_column, key, result_attr in self._FIELDS:
                comparison[f'{key}_match'] = self._compare_values(
                    row[db_column],
                    getattr(result, result_attr),
                )

            comparison['reasoning'] = result.reasoning
            # Tolerate a missing (None) sources collection instead of failing
            # the whole lookup on len(None).
            comparison['num_sources'] = len(result.sources or [])

            print(f"  Confidence: {result.confidence_score:.2f}")
            for _, key, _ in self._FIELDS:
                label = key.replace('_', ' ').title()
                print(f"  {label} Match: {comparison[f'{key}_match']}")

            return comparison

        except Exception as e:
            # Deliberately broad: one failing lookup must not abort the run.
            # The 'error' key marks the row so statistics can exclude it.
            print(f"  ERROR: {str(e)}")
            return {
                'institution_name': institution_name,
                'error': str(e),
                'lookup_time_seconds': 0,
                'confidence_score': 0.0,
            }

    def _compare_values(self, actual, predicted):
        """Compare an actual value with a predicted one, ignoring case.

        Returns:
            'match', 'mismatch', 'no_prediction', 'no_actual', or 'both_null'.
        """
        actual_clean = self._clean_value(actual)
        predicted_clean = self._clean_value(predicted)

        if actual_clean is None and predicted_clean is None:
            return 'both_null'
        if actual_clean is None:
            return 'no_actual'
        if predicted_clean is None:
            return 'no_prediction'

        # casefold() is the Unicode-aware form of case-insensitive comparison.
        if actual_clean.casefold() == predicted_clean.casefold():
            return 'match'
        return 'mismatch'

    def _clean_value(self, value):
        """Normalize a value to a stripped string, or None when it is empty.

        None, NaN/NA scalars, empty strings and whitespace-only strings all
        map to None. Non-scalar values (lists, arrays) are stringified rather
        than passed to ``pd.isna``, whose element-wise result cannot be used
        as a single truth value.
        """
        if value is None:
            return None
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        text = str(value).strip()
        return text or None

    def run_test(self, sample_size=None, delay_seconds=0.5):
        """Run the full test: load, look up, report statistics, save results.

        Args:
            sample_size: If given and smaller than the number of loaded
                institutions, test a reproducible random sample of that size.
            delay_seconds: Pause between consecutive lookups, to be gentle on
                the APIs behind the lookup service. Use 0 to disable.
        """
        print("=" * 80)
        print("INSTITUTION LOOKUP SERVICE - PERFORMANCE TEST")
        print("=" * 80)
        print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print()

        # Start from a clean slate so a second run does not mix in old rows
        # (which would also push the progress counter past the total).
        self.results = []

        institutions_df = self.load_institutions()

        # Optionally sample
        if sample_size and sample_size < len(institutions_df):
            print(f"\nSampling {sample_size} institutions for testing...")
            institutions_df = institutions_df.sample(n=sample_size, random_state=42)

        total = len(institutions_df)
        print(f"\nTesting {total} institutions...")
        print("=" * 80)

        for position, (_, row) in enumerate(institutions_df.iterrows(), start=1):
            print(f"\nProgress: {position}/{total}")
            self.results.append(self.test_institution(row))

            # Be nice to APIs - small delay between requests (not after the last).
            if delay_seconds and position < total:
                time.sleep(delay_seconds)

        print("\n" + "=" * 80)
        print("TEST COMPLETE")
        print("=" * 80)

        self.calculate_statistics()
        self.save_results()

    def _split_results(self):
        """Return ``(all_results_df, successful_df)`` built from ``self.results``.

        Rows carrying an ``error`` value are failed lookups; their placeholder
        confidence/time of 0 would skew the statistics, so they are left out
        of ``successful_df``. Confidence and time are coerced to numbers there.
        """
        results_df = pd.DataFrame(self.results)
        if results_df.empty:
            return results_df, results_df

        if 'error' in results_df.columns:
            ok_df = results_df[results_df['error'].isna()].copy()
        else:
            ok_df = results_df.copy()

        for column in ('confidence_score', 'lookup_time_seconds'):
            if column in ok_df.columns:
                ok_df[column] = pd.to_numeric(ok_df[column], errors='coerce')
        return results_df, ok_df

    def _match_counts(self, df, match_column):
        """Count each match status in ``match_column`` (all zero if it is absent)."""
        if match_column not in df.columns:
            return dict.fromkeys(self._MATCH_STATUSES, 0)
        counts = df[match_column].value_counts()
        return {status: int(counts.get(status, 0)) for status in self._MATCH_STATUSES}

    @staticmethod
    def _json_number(value):
        """Return ``value`` as a float, or None for None/NaN (NaN is not valid JSON)."""
        if value is None or pd.isna(value):
            return None
        return float(value)

    def calculate_statistics(self):
        """Print confidence, timing and per-field accuracy statistics.

        Failed lookups are counted separately and excluded from every average
        and percentage. Nothing is computed when no lookup succeeded.
        """
        results_df, ok_df = self._split_results()
        total = len(results_df)
        successful = len(ok_df)

        print("\n" + "=" * 80)
        print("PERFORMANCE STATISTICS")
        print("=" * 80)

        print(f"\nTotal Institutions Tested: {total}")
        print(f"Failed Lookups (excluded from statistics below): {total - successful}")

        if successful == 0:
            print("No successful lookups to analyse.")
            print("\n" + "=" * 80)
            return

        confidence = ok_df['confidence_score']
        print(f"Average Confidence Score: {confidence.mean():.2%}")
        print(f"Median Confidence Score: {confidence.median():.2%}")
        print(f"Average Lookup Time: {ok_df['lookup_time_seconds'].mean():.2f}s")

        high_mask = confidence >= self._HIGH_CONFIDENCE
        medium_mask = (confidence >= self._MEDIUM_CONFIDENCE) & ~high_mask
        low_mask = confidence < self._MEDIUM_CONFIDENCE

        print("\nConfidence Score Distribution:")
        buckets = (
            (f"High (>= {self._HIGH_CONFIDENCE})", high_mask),
            (f"Medium ({self._MEDIUM_CONFIDENCE}-{self._HIGH_CONFIDENCE})", medium_mask),
            (f"Low (< {self._MEDIUM_CONFIDENCE})", low_mask),
        )
        for label, mask in buckets:
            count = int(mask.sum())
            print(f"  {label}: {count} ({count / successful * 100:.1f}%)")

        print("\n" + "-" * 80)
        print("FIELD-BY-FIELD ACCURACY")
        print("-" * 80)

        status_labels = (
            ('Correct Matches', 'match'),
            ('Mismatches', 'mismatch'),
            ('No Prediction', 'no_prediction'),
            ('No Actual Value', 'no_actual'),
            ('Both Null', 'both_null'),
        )

        for db_column, key, _ in self._FIELDS:
            print(f"\n{db_column.upper()}:")
            counts = self._match_counts(ok_df, f'{key}_match')

            for label, status in status_labels:
                count = counts[status]
                print(f"  {label}: {count} ({count / successful * 100:.1f}%)")

            # Accuracy: matches among the cases where both sides had a value.
            decided = counts['match'] + counts['mismatch']
            if decided:
                accuracy = counts['match'] / decided * 100
                print(f"  --> Accuracy (when both have values): {accuracy:.1f}%")

            # Coverage: share of rows with an actual value that got a prediction.
            with_actual = decided + counts['no_prediction']
            if with_actual:
                coverage = decided / with_actual * 100
                print(f"  --> Coverage (predicted when actual exists): {coverage:.1f}%")

        high_conf_df = ok_df[high_mask]
        if len(high_conf_df) > 0:
            print("\n" + "-" * 80)
            print(f"HIGH CONFIDENCE SUBSET (>= {self._HIGH_CONFIDENCE} confidence, n={len(high_conf_df)})")
            print("-" * 80)

            for db_column, key, _ in self._FIELDS:
                counts = self._match_counts(high_conf_df, f'{key}_match')
                decided = counts['match'] + counts['mismatch']
                if decided:
                    accuracy = counts['match'] / decided * 100
                    print(f"  {db_column}: {accuracy:.1f}% accuracy")

        print("\n" + "=" * 80)

    def _build_summary(self):
        """Build the JSON-serializable summary dict written by ``save_results``.

        Averages are taken over successful lookups only; any statistic that
        cannot be computed (no data, or no decided cases) is None.
        """
        results_df, ok_df = self._split_results()

        def stat(column, how):
            # Missing column (no results at all) -> None rather than KeyError.
            if column not in ok_df.columns:
                return None
            return self._json_number(getattr(ok_df[column], how)())

        summary = {
            'test_date': datetime.now().isoformat(),
            'total_tested': len(results_df),
            'total_errors': len(results_df) - len(ok_df),
            'avg_confidence': stat('confidence_score', 'mean'),
            'median_confidence': stat('confidence_score', 'median'),
            'avg_lookup_time': stat('lookup_time_seconds', 'mean'),
        }

        for _, key, _ in self._FIELDS:
            counts = self._match_counts(ok_df, f'{key}_match')
            decided = counts['match'] + counts['mismatch']
            summary[f'{key}_accuracy'] = counts['match'] / decided if decided else None

        return summary

    def save_results(self):
        """Write detailed results to a CSV file and the summary to a JSON file.

        Both files are created in the current working directory and share a
        timestamp suffix.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Detailed results keep every row, including failed lookups.
        results_df = pd.DataFrame(self.results)
        filename = f'lookup_test_results_{timestamp}.csv'
        results_df.to_csv(filename, index=False)
        print(f"\nDetailed results saved to: {filename}")

        summary_filename = f'lookup_test_summary_{timestamp}.json'
        with open(summary_filename, 'w', encoding='utf-8') as f:
            json.dump(self._build_summary(), f, indent=2)
        print(f"Summary statistics saved to: {summary_filename}")


def main(limit=200, sample_size=None):
    """Run the institution lookup performance test end to end.

    Args:
        limit: Number of institutions the tester loads from the database.
        sample_size: Optional number of loaded institutions to sample for a
            quicker trial run (e.g. 10); None tests every loaded institution.
    """
    print("Starting Institution Lookup Performance Test")
    print()

    # Create tester
    tester = LookupTester(limit=limit)

    # Tests all loaded institutions unless a smaller sample_size was requested.
    tester.run_test(sample_size=sample_size)

    print("\nTest complete!")

# Run the performance test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting Institution Lookup Performance Test

Loading institution table to extract valid countries...
Found 265 unique countries in institution table
Loaded 265 valid countries from institution table
INSTITUTION LOOKUP SERVICE - PERFORMANCE TEST
Started: 2025-10-14 15:41:26

Loading first 200 institutions from database...
Loaded 200 institutions

Testing 200 institutions...

Progress: 1/200

Testing: 100% RE IPP GmbH & Co KG
  Confidence: 0.90
  Type1 Match: match
  Type2 Match: match
  Type3 Match: match
  Country Parent Match: match
  Country Sub Match: match

Progress: 2/200

Testing: 123Venture SA
  Confidence: 0.90
  Type1 Match: match
  Type2 Match: mismatch
  Type3 Match: match
  Country Parent Match: match
  Country Sub Match: match

Progress: 3/200

Testing: 127 Energy LLC
  Confidence: 0.90
  Type1 Match: match
  Type2 Match: match
  Type3 Match: no_actual
  Country Parent Match: match
  Country Sub Match: match

Progress: 4/200

Testing: 174 Power Global Corp
  Confidence: 0