In [1]:
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict

class MultiModelPolarizationAnalysisPipeline:
    def __init__(self, base_path="../feature_importance/congress_feature_importance_bigram_100_min_df"):
        self.base_path = base_path
        self.models = ['svm', 'lr', 'bayes']
        self.model_names = {
            'svm': 'Support Vector Machine',
            'lr': 'Logistic Regression', 
            'bayes': 'Complement Naive Bayes'
        }
        
        # Store data for each model
        self.raw_data = {}
        self.aggregated_data = {}
        self.polarization_analysis = {}
    
    def load_all_pickles(self):
        """Load pickle files for all three models"""
        print("Loading pickle files for all models...")
        
        for model in self.models:
            pickle_path = f"{self.base_path}_{model}.pkl"
            try:
                with open(pickle_path, 'rb') as f:
                    self.raw_data[model] = pickle.load(f)
                print(f"✓ Loaded {model.upper()} data")
            except FileNotFoundError:
                print(f"✗ Could not find {pickle_path}")
                continue
        
        # Get congress numbers and seeds from first available model
        if self.raw_data:
            first_model_data = list(self.raw_data.values())[0]
            congress_nums = set()
            seeds = set()
            for key in first_model_data.keys():
                congress_num, seed = key.split('_')
                congress_nums.add(int(congress_num))
                seeds.add(int(seed))
            
            print(f"Data covers {len(congress_nums)} congress sessions: {sorted(congress_nums)}")
            print(f"Using {len(seeds)} seeds: {sorted(seeds)}")
        
        return self.raw_data
    
    def aggregate_all_models(self):
        """Aggregate terms for all models"""
        print("Aggregating coefficients across seeds for all models...")
        
        for model in self.models:
            if model not in self.raw_data:
                continue
                
            congress_term_coefficients = defaultdict(lambda: defaultdict(list))
            
            for key, term_coeffs in self.raw_data[model].items():
                congress_num, seed = key.split('_')
                congress_num = int(congress_num)
                
                for term, coefficient in term_coeffs.items():
                    congress_term_coefficients[congress_num][term].append(coefficient)
            
            # Calculate aggregated statistics
            self.aggregated_data[model] = {}
            for congress_num, terms_dict in congress_term_coefficients.items():
                self.aggregated_data[model][congress_num] = {}
                
                for term, coeff_list in terms_dict.items():
                    mean_coeff = np.mean(coeff_list)
                    self.aggregated_data[model][congress_num][term] = {
                        'mean_coefficient': mean_coeff,
                        'abs_coefficient': abs(mean_coeff),
                        'std_coefficient': np.std(coeff_list),
                        'n_seeds': len(coeff_list)
                    }
        
        return self.aggregated_data
    
    def analyze_partisan_distinctions_all_models(self, top_n=20):
        """Analyze partisan distinctions for all models"""
        print(f"\nAnalyzing top {top_n} partisan distinguishing terms for all models...")
        
        for model in self.models:
            if model not in self.aggregated_data:
                continue
                
            self.polarization_analysis[model] = {}
            
            for congress_num, terms_data in self.aggregated_data[model].items():
                # Sort ALL terms by absolute coefficient
                sorted_terms = sorted(terms_data.items(), 
                                    key=lambda x: x[1]['abs_coefficient'], 
                                    reverse=True)
                
                republican_terms = []
                democrat_terms = []
                
                for term, data in sorted_terms:
                    if data['mean_coefficient'] > 0 and len(republican_terms) < top_n:
                        republican_terms.append({
                            'term': term,
                            'coefficient': data['mean_coefficient'],
                            'abs_coefficient': data['abs_coefficient'],
                            'std': data['std_coefficient']
                        })
                    elif data['mean_coefficient'] < 0 and len(democrat_terms) < top_n:
                        democrat_terms.append({
                            'term': term,
                            'coefficient': data['mean_coefficient'],
                            'abs_coefficient': data['abs_coefficient'],
                            'std': data['std_coefficient']
                        })
                    
                    if len(republican_terms) >= top_n and len(democrat_terms) >= top_n:
                        break
                
                self.polarization_analysis[model][congress_num] = {
                    'republican_terms': republican_terms,
                    'democrat_terms': democrat_terms
                }
        
        return self.polarization_analysis
    
    def print_multi_model_comparison(self, congress_num, top_n=10):
        """Print comparison across all models for a specific congress"""
        print("\n" + "="*100)
        print(f"MULTI-MODEL PARTISAN LANGUAGE COMPARISON - CONGRESS {congress_num}")
        print("="*100)
        
        # Check if congress exists in all models
        available_models = []
        for model in self.models:
            if model in self.polarization_analysis and congress_num in self.polarization_analysis[model]:
                available_models.append(model)
        
        if not available_models:
            print(f"No data available for Congress {congress_num}")
            return
        
        print(f"Available models: {', '.join([self.model_names[m] for m in available_models])}")
        
        # Republican terms comparison
        print(f"\n🔴 TOP {top_n} REPUBLICAN-DISTINGUISHING TERMS:")
        print("-" * 80)
        
        for model in available_models:
            data = self.polarization_analysis[model][congress_num]
            print(f"\n{self.model_names[model].upper()}:")
            for i, term_data in enumerate(data['republican_terms'][:top_n], 1):
                print(f"  {i:2d}. {term_data['term']:<20} (coeff: {term_data['coefficient']:+.3f})")
        
        # Democrat terms comparison
        print(f"\n🔵 TOP {top_n} DEMOCRAT-DISTINGUISHING TERMS:")
        print("-" * 80)
        
        for model in available_models:
            data = self.polarization_analysis[model][congress_num]
            print(f"\n{self.model_names[model].upper()}:")
            for i, term_data in enumerate(data['democrat_terms'][:top_n], 1):
                print(f"  {i:2d}. {term_data['term']:<20} (coeff: {term_data['coefficient']:+.3f})")
    
    def analyze_model_agreement(self, congress_num, top_n=10):
        """Analyze agreement between models for specific congress"""
        print(f"\n📊 MODEL AGREEMENT ANALYSIS - CONGRESS {congress_num}")
        print("="*70)
        
        available_models = []
        for model in self.models:
            if model in self.polarization_analysis and congress_num in self.polarization_analysis[model]:
                available_models.append(model)
        
        if len(available_models) < 2:
            print("Need at least 2 models for agreement analysis")
            return
        
        # Get top terms for each model
        republican_terms_by_model = {}
        democrat_terms_by_model = {}
        
        for model in available_models:
            data = self.polarization_analysis[model][congress_num]
            republican_terms_by_model[model] = set([t['term'] for t in data['republican_terms'][:top_n]])
            democrat_terms_by_model[model] = set([t['term'] for t in data['democrat_terms'][:top_n]])
        
        # Find common terms
        print("\nREPUBLICAN TERMS AGREEMENT:")
        all_rep_terms = set.intersection(*republican_terms_by_model.values())
        print(f"Terms agreed upon by ALL models ({len(all_rep_terms)}): {', '.join(sorted(all_rep_terms)) if all_rep_terms else 'None'}")
        
        # Pairwise agreements
        for i, model1 in enumerate(available_models):
            for model2 in available_models[i+1:]:
                overlap = republican_terms_by_model[model1] & republican_terms_by_model[model2]
                overlap_pct = len(overlap) / top_n * 100
                print(f"{model1.upper()} vs {model2.upper()}: {len(overlap)}/{top_n} terms ({overlap_pct:.1f}% agreement)")
        
        print("\nDEMOCRAT TERMS AGREEMENT:")
        all_dem_terms = set.intersection(*democrat_terms_by_model.values())
        print(f"Terms agreed upon by ALL models ({len(all_dem_terms)}): {', '.join(sorted(all_dem_terms)) if all_dem_terms else 'None'}")
        
        for i, model1 in enumerate(available_models):
            for model2 in available_models[i+1:]:
                overlap = democrat_terms_by_model[model1] & democrat_terms_by_model[model2]
                overlap_pct = len(overlap) / top_n * 100
                print(f"{model1.upper()} vs {model2.upper()}: {len(overlap)}/{top_n} terms ({overlap_pct:.1f}% agreement)")
    
    def create_consensus_ranking(self, congress_num, top_n=10):
        """Create consensus ranking based on all models"""
        print(f"\n🏆 CONSENSUS RANKING - CONGRESS {congress_num}")
        print("="*60)
        
        available_models = []
        for model in self.models:
            if model in self.polarization_analysis and congress_num in self.polarization_analysis[model]:
                available_models.append(model)
        
        if len(available_models) < 2:
            print("Need at least 2 models for consensus ranking")
            return
        
        # Collect all terms with their rankings across models
        term_rankings = defaultdict(lambda: {'republican_ranks': [], 'democrat_ranks': [], 'republican_coeffs': [], 'democrat_coeffs': []})
        
        for model in available_models:
            data = self.polarization_analysis[model][congress_num]
            
            # Republican terms
            for rank, term_data in enumerate(data['republican_terms'][:top_n], 1):
                term = term_data['term']
                term_rankings[term]['republican_ranks'].append(rank)
                term_rankings[term]['republican_coeffs'].append(term_data['coefficient'])
            
            # Democrat terms  
            for rank, term_data in enumerate(data['democrat_terms'][:top_n], 1):
                term = term_data['term']
                term_rankings[term]['democrat_ranks'].append(rank)
                term_rankings[term]['democrat_coeffs'].append(term_data['coefficient'])
        
        # Calculate consensus scores for Republican terms
        republican_consensus = []
        for term, rankings in term_rankings.items():
            if rankings['republican_ranks']:
                avg_rank = np.mean(rankings['republican_ranks'])
                avg_coeff = np.mean(rankings['republican_coeffs'])
                n_models = len(rankings['republican_ranks'])
                # Consensus score: lower average rank is better, weight by number of models
                consensus_score = (top_n + 1 - avg_rank) * n_models
                republican_consensus.append({
                    'term': term,
                    'avg_rank': avg_rank,
                    'avg_coeff': avg_coeff,
                    'n_models': n_models,
                    'consensus_score': consensus_score
                })
        
        # Calculate consensus scores for Democrat terms
        democrat_consensus = []
        for term, rankings in term_rankings.items():
            if rankings['democrat_ranks']:
                avg_rank = np.mean(rankings['democrat_ranks'])
                avg_coeff = np.mean(rankings['democrat_coeffs'])
                n_models = len(rankings['democrat_ranks'])
                consensus_score = (top_n + 1 - avg_rank) * n_models
                democrat_consensus.append({
                    'term': term,
                    'avg_rank': avg_rank,
                    'avg_coeff': avg_coeff,
                    'n_models': n_models,
                    'consensus_score': consensus_score
                })
        
        # Sort by consensus score
        republican_consensus.sort(key=lambda x: x['consensus_score'], reverse=True)
        democrat_consensus.sort(key=lambda x: x['consensus_score'], reverse=True)
        
        print(f"\nTOP CONSENSUS REPUBLICAN TERMS:")
        for i, term_data in enumerate(republican_consensus[:top_n], 1):
            print(f"  {i:2d}. {term_data['term']:<20} (avg_coeff: {term_data['avg_coeff']:+.3f}, "
                  f"avg_rank: {term_data['avg_rank']:.1f}, models: {term_data['n_models']}/{len(available_models)})")
        
        print(f"\nTOP CONSENSUS DEMOCRAT TERMS:")
        for i, term_data in enumerate(democrat_consensus[:top_n], 1):
            print(f"  {i:2d}. {term_data['term']:<20} (avg_coeff: {term_data['avg_coeff']:+.3f}, "
                  f"avg_rank: {term_data['avg_rank']:.1f}, models: {term_data['n_models']}/{len(available_models)})")
        
        return republican_consensus, democrat_consensus
    
    def run_multi_model_analysis(self, congress_nums=None, top_n=10):
        """Run complete multi-model analysis"""
        print("RUNNING MULTI-MODEL CONGRESSIONAL POLARIZATION ANALYSIS")
        print("="*80)
        
        # Load and aggregate data
        self.load_all_pickles()
        self.aggregate_all_models()
        self.analyze_partisan_distinctions_all_models(top_n=20)
        
        # Get available congress numbers
        if congress_nums is None:
            available_congress = set()
            for model_data in self.polarization_analysis.values():
                available_congress.update(model_data.keys())
            congress_nums = sorted(list(available_congress))
        
        # Run analysis for each congress
        for congress_num in congress_nums:
            self.print_multi_model_comparison(congress_num, top_n)
            self.analyze_model_agreement(congress_num, top_n)
            self.create_consensus_ranking(congress_num, top_n)
            print("\n" + "="*100 + "\n")
        
        return self.polarization_analysis

# Usage example
def main():
    # Initialize pipeline
    pipeline = MultiModelPolarizationAnalysisPipeline()
    
    # Run analysis for specific congress (e.g., 76) or all available
    analysis = pipeline.run_multi_model_analysis(congress_nums=[76], top_n=10)
    
    # You can also run for multiple congresses:
    # analysis = pipeline.run_multi_model_analysis(congress_nums=[76, 77, 78], top_n=10)
    
    # Or for all available congresses:
    # analysis = pipeline.run_multi_model_analysis(top_n=10)
    
    return pipeline, analysis

if __name__ == "__main__":
    pipeline, analysis = main()

RUNNING MULTI-MODEL CONGRESSIONAL POLARIZATION ANALYSIS
Loading pickle files for all models...
✓ Loaded SVM data


ModuleNotFoundError: No module named 'cupy'