# 3. Advanced GRI Analysis

This notebook demonstrates advanced analysis techniques for understanding representativeness gaps and comparing surveys using the Global Representativeness Index.

## Overview

Beyond basic GRI scores, this analysis helps you:
1. **Identify specific over/under-represented groups**
2. **Understand the magnitude of representativeness gaps**
3. **Compare surveys over time or across methodologies**
4. **Generate actionable insights for improving sample balance**

In [None]:
import json
import os
import sys
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add the gri module to the path
sys.path.append('..')
from gri.calculator import calculate_gri, calculate_diversity_score
from gri.utils import load_data

# Plot appearance: matplotlib's default style with the 'RdYlBu_r' seaborn palette
plt.style.use('default')
sns.set_palette('RdYlBu_r')

# Pandas display options, applied one by one from a single table
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_rows', 20),
):
    pd.set_option(option_name, option_value)

## 1. Load Data and Previous Results

In [None]:
# Load the three benchmark tables (one per demographic dimension) and the survey sample
benchmark_age_gender = load_data('../data/processed/benchmark_country_gender_age.csv')
benchmark_religion = load_data('../data/processed/benchmark_country_religion.csv')
benchmark_environment = load_data('../data/processed/benchmark_country_environment.csv')
survey_data = load_data('../data/processed/sample_survey_data.csv')

# Load previous GRI results. `json` is imported with the other dependencies in
# the notebook's import cell. The encoding is pinned so that reading the file
# does not depend on the platform's default locale encoding.
with open('../data/processed/gri_results.json', 'r', encoding='utf-8') as f:
    gri_results = json.load(f)

print("Data loaded successfully")
print(f"Survey participants: {len(survey_data)}")
print(f"Previous Average GRI: {gri_results['average_gri']:.4f}")

## 2. Detailed Gap Analysis Functions

In [None]:
def analyze_representativeness_gaps(survey_df: pd.DataFrame, benchmark_df: pd.DataFrame,
                                   strata_cols: List[str], top_n: int = 10) -> pd.DataFrame:
    """
    Analyze representativeness gaps by comparing sample vs benchmark proportions.

    Args:
        survey_df: One row per participant; must contain every column in
            ``strata_cols``.
        benchmark_df: One row per stratum; must contain ``strata_cols`` and a
            ``population_proportion`` column.
        strata_cols: Columns that jointly identify a stratum.
        top_n: Currently unused. Kept for interface compatibility; the full
            table is always returned so callers can count all strata.

    Returns:
        The outer join of benchmark and sample strata, sorted by
        ``absolute_gap`` (largest first), with these added columns:
        ``sample_count``, ``sample_proportion``, ``proportion_gap``
        (sample minus benchmark), ``absolute_gap``, ``representation_ratio``
        (sample / benchmark, NaN where the benchmark proportion is 0) and
        ``representation_status``.

        ``representation_status`` is one of 'Not in benchmark', 'Missing',
        'Severely Under-represented', 'Under-represented', 'Over-represented',
        'Moderately Over-represented' or 'Well-represented'.
    """
    # Sample proportions per stratum
    total_participants = len(survey_df)
    sample_counts = survey_df.groupby(strata_cols).size().reset_index(name='sample_count')
    sample_counts['sample_proportion'] = sample_counts['sample_count'] / total_participants

    # Outer merge keeps strata that appear on only one side
    merged = pd.merge(benchmark_df, sample_counts, on=strata_cols, how='outer')
    merged['sample_proportion'] = merged['sample_proportion'].fillna(0)
    merged['sample_count'] = merged['sample_count'].fillna(0)
    # Strata present in the sample but absent from the benchmark have an
    # expected share of 0. Without this fill their gaps are NaN, they sort to
    # the bottom, and they are silently labelled 'Well-represented'.
    merged['population_proportion'] = merged['population_proportion'].fillna(0)

    # Gaps
    merged['proportion_gap'] = merged['sample_proportion'] - merged['population_proportion']
    merged['absolute_gap'] = merged['proportion_gap'].abs()
    merged['representation_ratio'] = merged['sample_proportion'] / merged['population_proportion']
    # x/0 gives +/-inf; a ratio is undefined when nothing was expected
    merged['representation_ratio'] = merged['representation_ratio'].replace([np.inf, -np.inf], np.nan)

    # Interpretation. Order matters: np.select takes the first matching
    # condition, so the stricter threshold of each pair comes first. NaN
    # ratios compare False everywhere, so zero-benchmark strata need their
    # own explicit condition.
    conditions = [
        (merged['population_proportion'] == 0) & (merged['sample_proportion'] > 0),
        (merged['sample_proportion'] == 0) & (merged['population_proportion'] > 0),
        (merged['representation_ratio'] < 0.5),
        (merged['representation_ratio'] < 0.8),
        (merged['representation_ratio'] > 2.0),
        (merged['representation_ratio'] > 1.25),
    ]
    choices = [
        'Not in benchmark',
        'Missing',
        'Severely Under-represented',
        'Under-represented',
        'Over-represented',
        'Moderately Over-represented',
    ]
    merged['representation_status'] = np.select(conditions, choices, default='Well-represented')

    # Largest gaps first
    merged = merged.sort_values('absolute_gap', ascending=False)

    return merged


def create_stratum_label(row: pd.Series, strata_cols: List[str]) -> str:
    """Build a short display label for one stratum.

    Each value in ``strata_cols`` is converted to text and the pieces are
    joined with ' | '. Country names longer than 15 characters are cut to 12
    plus '...'. Religion values containing 'not identify' become
    'No religion'; other religion values longer than 20 characters are cut to
    17 plus '...'. Values from any other column pass through unchanged.
    """
    def _shorten(column: str, text: str) -> str:
        if column == 'country' and len(text) > 15:
            return text[:12] + "..."
        if column == 'religion':
            if 'not identify' in text:
                return "No religion"
            if len(text) > 20:
                return text[:17] + "..."
        return text

    return ' | '.join(_shorten(column, str(row[column])) for column in strata_cols)


print("Gap analysis functions defined")

## 3. Country × Gender × Age Gap Analysis

In [None]:
# Country × Gender × Age: build the gap table, then report its extremes
age_gender_strata = ['country', 'gender', 'age_group']
age_gender_gaps = analyze_representativeness_gaps(survey_data, benchmark_age_gender, age_gender_strata)

n_age_gender_sampled = (age_gender_gaps['sample_count'] > 0).sum()
print("=== COUNTRY × GENDER × AGE REPRESENTATIVENESS GAPS ===")
print(f"Total strata analyzed: {len(age_gender_gaps)}")
print(f"Strata with sample representation: {n_age_gender_sampled}")

# The gap table arrives sorted by absolute gap (largest first), so head(5)
# after filtering yields the five largest gaps within each category.
is_over = age_gender_gaps['representation_status'].isin(
    ['Over-represented', 'Moderately Over-represented']
)
over_represented = age_gender_gaps.loc[is_over].head(5)

print("\nTOP 5 OVER-REPRESENTED GROUPS:")
for _, row in over_represented.iterrows():
    label = create_stratum_label(row, age_gender_strata)
    sample_p, expected_p = row['sample_proportion'], row['population_proportion']
    print(f"  {label}:")
    print(f"    Sample: {sample_p:.4f}, Expected: {expected_p:.4f}")
    print(f"    Ratio: {row['representation_ratio']:.2f}x, Gap: +{row['proportion_gap']:.4f}")
    print()

is_under = age_gender_gaps['representation_status'].isin(
    ['Under-represented', 'Severely Under-represented', 'Missing']
)
under_represented = age_gender_gaps.loc[is_under].head(5)

print("TOP 5 UNDER-REPRESENTED GROUPS:")
for _, row in under_represented.iterrows():
    label = create_stratum_label(row, age_gender_strata)
    sample_p, expected_p = row['sample_proportion'], row['population_proportion']
    print(f"  {label}:")
    print(f"    Sample: {sample_p:.4f}, Expected: {expected_p:.4f}")
    # A ratio is only meaningful when the stratum was sampled at all
    if sample_p > 0:
        detail = f"    Ratio: {row['representation_ratio']:.2f}x, Gap: {row['proportion_gap']:.4f}"
    else:
        detail = "    Status: Missing from sample"
    print(detail)
    print()

## 4. Country × Religion Gap Analysis

In [None]:
# Country × Religion: build the gap table, then report its extremes
religion_strata = ['country', 'religion']
religion_gaps = analyze_representativeness_gaps(survey_data, benchmark_religion, religion_strata)

n_religion_sampled = (religion_gaps['sample_count'] > 0).sum()
print("=== COUNTRY × RELIGION REPRESENTATIVENESS GAPS ===")
print(f"Total strata analyzed: {len(religion_gaps)}")
print(f"Strata with sample representation: {n_religion_sampled}")

# The gap table arrives sorted by absolute gap (largest first), so head(3)
# after filtering yields the three largest gaps within each category.
religion_is_over = religion_gaps['representation_status'].isin(
    ['Over-represented', 'Moderately Over-represented']
)
religion_over = religion_gaps.loc[religion_is_over].head(3)

print("\nTOP 3 OVER-REPRESENTED RELIGIOUS GROUPS:")
for _, row in religion_over.iterrows():
    label = create_stratum_label(row, religion_strata)
    sample_p, expected_p = row['sample_proportion'], row['population_proportion']
    print(f"  {label}:")
    print(f"    Sample: {sample_p:.4f}, Expected: {expected_p:.4f}")
    print(f"    Ratio: {row['representation_ratio']:.2f}x")
    print()

religion_is_under = religion_gaps['representation_status'].isin(
    ['Under-represented', 'Severely Under-represented', 'Missing']
)
religion_under = religion_gaps.loc[religion_is_under].head(3)

print("TOP 3 UNDER-REPRESENTED RELIGIOUS GROUPS:")
for _, row in religion_under.iterrows():
    label = create_stratum_label(row, religion_strata)
    sample_p, expected_p = row['sample_proportion'], row['population_proportion']
    print(f"  {label}:")
    print(f"    Sample: {sample_p:.4f}, Expected: {expected_p:.4f}")
    # A ratio is only meaningful when the stratum was sampled at all
    if sample_p > 0:
        detail = f"    Ratio: {row['representation_ratio']:.2f}x"
    else:
        detail = "    Status: Missing from sample"
    print(detail)
    print()

## 5. Visualization of Representativeness Gaps

In [None]:
# Four-panel figure summarising the largest representativeness gaps
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax_age_gaps, ax_religion_gaps = axes[0, 0], axes[0, 1]
ax_ratio_hist, ax_scatter = axes[1, 0], axes[1, 1]

# Panel 1: the ten largest Age/Gender gaps (red = sample above expected, blue = otherwise)
top_age_gaps = age_gender_gaps.head(10)
gap_labels = [
    create_stratum_label(row, ['country', 'gender', 'age_group'])[:30]
    for _, row in top_age_gaps.iterrows()
]
age_positions = range(len(top_age_gaps))
age_colors = ['red' if gap > 0 else 'blue' for gap in top_age_gaps['proportion_gap']]
bars1 = ax_age_gaps.barh(age_positions, top_age_gaps['proportion_gap'], color=age_colors)
ax_age_gaps.set_yticks(age_positions)
ax_age_gaps.set_yticklabels(gap_labels)
ax_age_gaps.set_xlabel('Proportion Gap (Sample - Expected)')
ax_age_gaps.set_title('Top 10 Age/Gender Representativeness Gaps')
ax_age_gaps.axvline(x=0, color='black', linestyle='-', alpha=0.5)
ax_age_gaps.grid(axis='x', alpha=0.3)

# Panel 2: the eight largest Religion gaps, same colour convention
top_religion_gaps = religion_gaps.head(8)
religion_labels = [
    create_stratum_label(row, ['country', 'religion'])[:35]
    for _, row in top_religion_gaps.iterrows()
]
religion_positions = range(len(top_religion_gaps))
religion_colors = ['red' if gap > 0 else 'blue' for gap in top_religion_gaps['proportion_gap']]
bars2 = ax_religion_gaps.barh(religion_positions, top_religion_gaps['proportion_gap'],
                              color=religion_colors)
ax_religion_gaps.set_yticks(religion_positions)
ax_religion_gaps.set_yticklabels(religion_labels)
ax_religion_gaps.set_xlabel('Proportion Gap (Sample - Expected)')
ax_religion_gaps.set_title('Top 8 Religion Representativeness Gaps')
ax_religion_gaps.axvline(x=0, color='black', linestyle='-', alpha=0.5)
ax_religion_gaps.grid(axis='x', alpha=0.3)

# Panel 3: histogram of Age/Gender representation ratios.
# Only strata that were sampled and have a defined ratio are included.
has_ratio = age_gender_gaps['representation_ratio'].notna()
was_sampled = age_gender_gaps['sample_proportion'] > 0
valid_ratios = age_gender_gaps.loc[has_ratio & was_sampled, 'representation_ratio']
ax_ratio_hist.hist(valid_ratios, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax_ratio_hist.axvline(x=1.0, color='red', linestyle='--', label='Perfect representation')
ax_ratio_hist.set_xlabel('Representation Ratio (Sample/Expected)')
ax_ratio_hist.set_ylabel('Number of Strata')
ax_ratio_hist.set_title('Distribution of Age/Gender Representation Ratios')
ax_ratio_hist.legend()
ax_ratio_hist.set_xlim(0, 5)  # ratios above 5 fall outside the visible range

# Panel 4: sample vs expected share for religion strata with benchmark share > 0.001
religion_significant = religion_gaps[religion_gaps['population_proportion'] > 0.001]
expected_share = religion_significant['population_proportion']
observed_share = religion_significant['sample_proportion']
scatter = ax_scatter.scatter(expected_share, observed_share, alpha=0.6, s=60)
# Diagonal y = x: points on it are represented exactly as expected
max_val = max(expected_share.max(), observed_share.max())
ax_scatter.plot([0, max_val], [0, max_val], 'r--', alpha=0.8, label='Perfect representation')
ax_scatter.set_xlabel('Expected Proportion (Benchmark)')
ax_scatter.set_ylabel('Sample Proportion')
ax_scatter.set_title('Sample vs Expected: Religion (Major Strata)')
ax_scatter.legend()
ax_scatter.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Create a Second Survey for Comparison

In [None]:
# Build a synthetic second survey, deliberately skewed, to compare against the first
print("Creating comparison survey with different demographic balance...")

np.random.seed(123)  # fixed seed so the synthetic sample is reproducible
n_participants_2 = 750

# Category pools: four countries, the three youngest age bands, urban only
skewed_countries = ['United States', 'Germany', 'Japan', 'Brazil']
skewed_ages = ['18-25', '26-35', '36-45']
skewed_genders = ['Male', 'Female']
skewed_religions = ['Christianity', 'I do not identify with any religious group or faith', 'Other religious group']
skewed_environments = ['Urban']

# Sampling weights (each list sums to 1); gender and religion are drawn uniformly
country_weights = [0.4, 0.3, 0.2, 0.1]  # US-heavy
age_weights = [0.5, 0.3, 0.2]  # young-heavy
env_weights = [1.0]  # all urban

# Draw the columns one at a time. The order of these calls fixes the random
# stream, so it must stay country -> age_group -> gender -> religion -> environment.
survey_columns_2 = {}
survey_columns_2['country'] = np.random.choice(skewed_countries, size=n_participants_2, p=country_weights)
survey_columns_2['age_group'] = np.random.choice(skewed_ages, size=n_participants_2, p=age_weights)
survey_columns_2['gender'] = np.random.choice(skewed_genders, size=n_participants_2)
survey_columns_2['religion'] = np.random.choice(skewed_religions, size=n_participants_2)
survey_columns_2['environment'] = np.random.choice(skewed_environments, size=n_participants_2, p=env_weights)
survey_data_2 = pd.DataFrame(survey_columns_2)

print(f"Comparison survey created with {len(survey_data_2)} participants")
for heading, column in (
    ("\nCountry distribution (Survey 2):", 'country'),
    ("\nAge distribution (Survey 2):", 'age_group'),
):
    print(heading)
    print(survey_data_2[column].value_counts())

## 7. Survey Comparison Analysis

In [None]:
# Calculate GRI for both surveys across all dimensions
def calculate_full_gri_scorecard(survey_df: pd.DataFrame) -> dict:
    """Score one survey against the three notebook-level benchmark tables.

    For each dimension (Country × Gender × Age, Country × Religion,
    Country × Environment) this calls ``calculate_gri`` and
    ``calculate_diversity_score`` with the matching benchmark.

    Returns:
        dict with keys ``participants``, ``gri_age_gender``, ``gri_religion``,
        ``gri_environment``, ``average_gri``, ``div_age_gender``,
        ``div_religion``, ``div_environment`` and ``average_diversity``.
        The two averages are unweighted means over the three dimensions.
    """
    dimension_specs = (
        ('age_gender', benchmark_age_gender, ('country', 'gender', 'age_group')),
        ('religion', benchmark_religion, ('country', 'religion')),
        ('environment', benchmark_environment, ('country', 'environment')),
    )

    gri, diversity = {}, {}
    for key, benchmark, strata in dimension_specs:
        # A fresh list per call, as each scorer receives its own strata list
        gri[key] = calculate_gri(survey_df, benchmark, list(strata))
        diversity[key] = calculate_diversity_score(survey_df, benchmark, list(strata))

    return {
        'participants': len(survey_df),
        'gri_age_gender': gri['age_gender'],
        'gri_religion': gri['religion'],
        'gri_environment': gri['environment'],
        'average_gri': np.mean([gri['age_gender'], gri['religion'], gri['environment']]),
        'div_age_gender': diversity['age_gender'],
        'div_religion': diversity['religion'],
        'div_environment': diversity['environment'],
        'average_diversity': np.mean(
            [diversity['age_gender'], diversity['religion'], diversity['environment']]
        ),
    }

# Score both surveys, then print them side by side with their difference
scorecard_1 = calculate_full_gri_scorecard(survey_data)
scorecard_2 = calculate_full_gri_scorecard(survey_data_2)

print("=== SURVEY COMPARISON ===")
for heading, card in (
    ("Survey 1 (Balanced Sample)", scorecard_1),
    ("Survey 2 (Skewed Sample)", scorecard_2),
):
    print(f"\n{heading}:")
    print(f"  Participants: {card['participants']}")
    print(f"  Average GRI: {card['average_gri']:.4f}")
    print(f"  Average Diversity: {card['average_diversity']:.4f}")

# Signed differences: positive means Survey 1 scored higher
gri_delta = scorecard_1['average_gri'] - scorecard_2['average_gri']
diversity_delta = scorecard_1['average_diversity'] - scorecard_2['average_diversity']
print("\nDifference (Survey 1 - Survey 2):")
print(f"  GRI Difference: {gri_delta:+.4f}")
print(f"  Diversity Difference: {diversity_delta:+.4f}")

## 8. Detailed Comparison Visualization

In [None]:
# Create detailed comparison visualization (2 × 2 panels)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax_gri, ax_div = axes[0, 0], axes[0, 1]
ax_country, ax_age_dist = axes[1, 0], axes[1, 1]


def _annotate_bar_heights(ax, *bar_groups):
    """Write each bar's height (3 decimals) just above the bar on ``ax``."""
    for bars in bar_groups:
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                    f'{height:.3f}', ha='center', va='bottom', fontsize=9)


# Comparison of GRI scores by dimension
dimensions = ['Age/Gender', 'Religion', 'Environment']
gri_scores_1 = [scorecard_1['gri_age_gender'], scorecard_1['gri_religion'], scorecard_1['gri_environment']]
gri_scores_2 = [scorecard_2['gri_age_gender'], scorecard_2['gri_religion'], scorecard_2['gri_environment']]

x = np.arange(len(dimensions))
width = 0.35  # bar width; each pair is offset by half a width around the tick

bars1 = ax_gri.bar(x - width/2, gri_scores_1, width, label='Survey 1 (Balanced)', alpha=0.8)
bars2 = ax_gri.bar(x + width/2, gri_scores_2, width, label='Survey 2 (Skewed)', alpha=0.8)

ax_gri.set_xlabel('Dimension')
ax_gri.set_ylabel('GRI Score')
ax_gri.set_title('GRI Scores Comparison by Dimension')
ax_gri.set_xticks(x)
ax_gri.set_xticklabels(dimensions)
ax_gri.legend()
ax_gri.set_ylim(0, 1)
ax_gri.grid(axis='y', alpha=0.3)
_annotate_bar_heights(ax_gri, bars1, bars2)

# Comparison of Diversity scores
div_scores_1 = [scorecard_1['div_age_gender'], scorecard_1['div_religion'], scorecard_1['div_environment']]
div_scores_2 = [scorecard_2['div_age_gender'], scorecard_2['div_religion'], scorecard_2['div_environment']]

bars3 = ax_div.bar(x - width/2, div_scores_1, width, label='Survey 1 (Balanced)', alpha=0.8)
bars4 = ax_div.bar(x + width/2, div_scores_2, width, label='Survey 2 (Skewed)', alpha=0.8)

ax_div.set_xlabel('Dimension')
ax_div.set_ylabel('Diversity Score')
ax_div.set_title('Diversity Scores Comparison by Dimension')
ax_div.set_xticks(x)
ax_div.set_xticklabels(dimensions)
ax_div.legend()
ax_div.set_ylim(0, 1)
ax_div.grid(axis='y', alpha=0.3)
_annotate_bar_heights(ax_div, bars3, bars4)

# Country distribution comparison
country_dist_1 = survey_data['country'].value_counts(normalize=True)
country_dist_2 = survey_data_2['country'].value_counts(normalize=True)

# Sorted so the bar order is the same on every run: iterating a set of
# strings follows hash order, which changes between kernel sessions.
all_countries = sorted(set(country_dist_1.index) | set(country_dist_2.index))
props_1 = [country_dist_1.get(c, 0) for c in all_countries]
props_2 = [country_dist_2.get(c, 0) for c in all_countries]

x_countries = np.arange(len(all_countries))
ax_country.bar(x_countries - width/2, props_1, width, label='Survey 1', alpha=0.8)
ax_country.bar(x_countries + width/2, props_2, width, label='Survey 2', alpha=0.8)
ax_country.set_xlabel('Country')
ax_country.set_ylabel('Proportion')
ax_country.set_title('Country Distribution Comparison')
ax_country.set_xticks(x_countries)
ax_country.set_xticklabels([c[:10] for c in all_countries], rotation=45, ha='right')
ax_country.legend()
ax_country.grid(axis='y', alpha=0.3)

# Age distribution comparison
age_dist_1 = survey_data['age_group'].value_counts(normalize=True)
age_dist_2 = survey_data_2['age_group'].value_counts(normalize=True)

all_ages = sorted(set(age_dist_1.index) | set(age_dist_2.index))
age_props_1 = [age_dist_1.get(a, 0) for a in all_ages]
age_props_2 = [age_dist_2.get(a, 0) for a in all_ages]

x_ages = np.arange(len(all_ages))
ax_age_dist.bar(x_ages - width/2, age_props_1, width, label='Survey 1', alpha=0.8)
ax_age_dist.bar(x_ages + width/2, age_props_2, width, label='Survey 2', alpha=0.8)
ax_age_dist.set_xlabel('Age Group')
ax_age_dist.set_ylabel('Proportion')
ax_age_dist.set_title('Age Distribution Comparison')
ax_age_dist.set_xticks(x_ages)
ax_age_dist.set_xticklabels(all_ages)
ax_age_dist.legend()
ax_age_dist.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Actionable Recommendations

In [None]:
# Generate actionable recommendations based on gap analysis
def generate_recommendations(gaps_df: pd.DataFrame, dimension_name: str, top_n: int = 3) -> List[str]:
    """Turn the first ``top_n`` under-represented strata into recruitment advice.

    Rows whose ``representation_status`` is 'Under-represented',
    'Severely Under-represented' or 'Missing' are taken in the order they
    appear in ``gaps_df``. Each yields one sentence naming the stratum from
    whichever of the country / gender / age_group / religion / environment
    columns exist (religion is cut to 30 characters). Strata with a positive
    ``sample_count`` also get the shortfall expressed per 1000 participants.
    """
    shortfall_statuses = ['Under-represented', 'Severely Under-represented', 'Missing']
    shortfalls = gaps_df[gaps_df['representation_status'].isin(shortfall_statuses)].head(top_n)
    available = gaps_df.columns

    recommendations = []
    for _, row in shortfalls.iterrows():
        if row['sample_count'] == 0:
            action = "Add any representation from "
        else:
            action = "Increase "
            # Benchmark share minus sample share, scaled to a sample of 1000
            target_increase = (row['population_proportion'] - row['sample_proportion']) * 1000

        pieces = [f"🎯 Target recruitment in {dimension_name}: ", action]

        # Stratum description: country first with no separator, the rest space-prefixed
        if 'country' in available:
            pieces.append(f"{row['country']}")
        for column in ('gender', 'age_group', 'religion', 'environment'):
            if column in available:
                value = row[column][:30] if column == 'religion' else row[column]
                pieces.append(f" {value}")

        if row['sample_count'] > 0:
            pieces.append(f" by ~{target_increase:.0f} per 1000 participants")

        recommendations.append(''.join(pieces))

    return recommendations

print("=== ACTIONABLE RECOMMENDATIONS FOR IMPROVING REPRESENTATIVENESS ===")

# Age/Gender recommendations
age_recommendations = generate_recommendations(age_gender_gaps, "Country × Age × Gender")
print("\n📊 DEMOGRAPHIC BALANCE IMPROVEMENTS:")
for i, rec in enumerate(age_recommendations, 1):
    print(f"{i}. {rec}")

# Religion recommendations
religion_recommendations = generate_recommendations(religion_gaps, "Country × Religion")
print("\n🕊️ RELIGIOUS DIVERSITY IMPROVEMENTS:")
for i, rec in enumerate(religion_recommendations, 1):
    print(f"{i}. {rec}")

# General strategic recommendations.
# They are collected first and numbered when printed, so the list stays
# consecutive whichever branches fire. Hard-coded numbers would jump from 2
# to 4 whenever a two-item GRI branch is followed by the diversity items.
strategic_recommendations = []
if scorecard_1['average_gri'] < 0.6:
    strategic_recommendations += [
        "🎯 Focus on geographic diversification - expand recruitment beyond current primary regions",
        "📢 Use targeted advertising in under-represented demographics",
        "🤝 Partner with local organizations in under-represented communities",
    ]
elif scorecard_1['average_gri'] < 0.8:
    strategic_recommendations += [
        "🔧 Fine-tune recruitment to address specific demographic gaps identified above",
        "📊 Monitor recruitment metrics in real-time to course-correct during data collection",
    ]
else:
    strategic_recommendations += [
        "✅ Maintain current recruitment strategy - representativeness is already strong",
        "🔍 Focus on quality control and data validation",
    ]

if scorecard_1['average_diversity'] < 0.7:
    strategic_recommendations += [
        "🌍 Expand to new geographic regions to improve diversity coverage",
        "📱 Use multiple recruitment channels (online, offline, mobile) to reach diverse populations",
    ]

print("\n🚀 STRATEGIC RECOMMENDATIONS:")
for i, rec in enumerate(strategic_recommendations, 1):
    print(f"{i}. {rec}")

## 10. Export Advanced Analysis Results

In [None]:
# Write every table produced above into one output folder
export_dir = '../data/processed/advanced_analysis'
os.makedirs(export_dir, exist_ok=True)

# Gap analysis tables
age_gender_gaps.to_csv(f'{export_dir}/age_gender_gaps.csv', index=False)
religion_gaps.to_csv(f'{export_dir}/religion_gaps.csv', index=False)

# Survey comparison: one row per metric, one column per survey.
# scorecard_keys is listed in the same order as the 'Metric' labels.
scorecard_keys = [
    'participants', 'gri_age_gender', 'gri_religion', 'gri_environment', 'average_gri',
    'div_age_gender', 'div_religion', 'div_environment', 'average_diversity',
]
comparison_df = pd.DataFrame({
    'Metric': ['Participants', 'GRI Age/Gender', 'GRI Religion', 'GRI Environment', 'Average GRI',
               'Diversity Age/Gender', 'Diversity Religion', 'Diversity Environment', 'Average Diversity'],
    'Survey_1_Balanced': [scorecard_1[key] for key in scorecard_keys],
    'Survey_2_Skewed': [scorecard_2[key] for key in scorecard_keys],
})
comparison_df.to_csv(f'{export_dir}/survey_comparison.csv', index=False)

# Recommendations: age/gender items are tagged High priority, religion items Medium
n_age_recs = len(age_recommendations)
n_religion_recs = len(religion_recommendations)
all_recommendations = age_recommendations + religion_recommendations
recommendations_df = pd.DataFrame({
    'Recommendation': all_recommendations,
    'Priority': ['High'] * n_age_recs + ['Medium'] * n_religion_recs,
    'Category': ['Demographic Balance'] * n_age_recs + ['Religious Diversity'] * n_religion_recs,
})
recommendations_df.to_csv(f'{export_dir}/recommendations.csv', index=False)

print("Advanced analysis results exported:")
for exported_name in (
    'age_gender_gaps.csv',
    'religion_gaps.csv',
    'survey_comparison.csv',
    'recommendations.csv',
):
    print(f"  - data/processed/advanced_analysis/{exported_name}")

## Summary

This advanced analysis has provided:

### 🔍 **Detailed Gap Analysis**
- Identified specific over/under-represented demographic groups
- Quantified the magnitude of representativeness gaps
- Analyzed representation ratios and missing strata

### 📊 **Survey Comparison**
- Compared the loaded sample survey against a synthetically generated, deliberately skewed survey
- Demonstrated how sampling strategies affect GRI scores
- Showed the impact of demographic skew on representativeness

### 🎯 **Actionable Insights**
- Generated specific recruitment recommendations
- Ranked under-represented groups by the size of their absolute proportion gap (largest first)
- Provided strategic guidance for future data collection

### 📈 **Key Findings**
- Survey 1 (Balanced): Average GRI is printed in the output of Section 7 (`scorecard_1['average_gri']`)
- Survey 2 (Skewed): Average GRI is printed in the output of Section 7 (`scorecard_2['average_gri']`)
- **Impact of skewed sampling**: the "GRI Difference" line in Section 7 (Survey 1 minus Survey 2, in GRI points)

### 📁 **Exported Files**
All detailed analysis results have been saved to `data/processed/advanced_analysis/` for further use in reporting and decision-making.

This analysis demonstrates the power of GRI for not just measuring representativeness, but for providing actionable insights to improve survey quality and global representativeness.