# Gender Perspective Analysis

This notebook analyzes gender perspective data from Carmen de Areco's transparency portal, focusing on gender-inclusive budgeting and policy implementation.

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import os
from pathlib import Path
import glob

# Set up data paths.
# Candidate directories are searched in the order listed; directories that do
# not exist are skipped. Both "../" and "../../" variants are listed,
# presumably so the notebook can be run from more than one working directory.
data_paths = [
    Path("../data/processed"),
    Path("../data/consolidated"),
    Path("../data/cleaned"),
    Path("../frontend/public/data/csv"),
    Path("../../data/processed"),
    Path("../../data/consolidated"),
    Path("../../frontend/public/data/charts")
]

# Filename patterns (English and Spanish) used to recognise candidate files.
gender_patterns = ("*gender*csv", "*genero*csv", "*perspectiva*csv", "*personal*csv")

# Find all gender perspective related CSV files.
# A file whose name matches several patterns is recorded only once, and each
# glob result is sorted so the resulting order does not depend on the order in
# which the filesystem happens to list directory entries.
all_gender_files = []
seen_files = set()
for path in data_paths:
    if not path.exists():
        continue
    for pattern in gender_patterns:
        for match in sorted(path.rglob(pattern)):
            if match not in seen_files:
                seen_files.add(match)
                all_gender_files.append(match)

print(f"Found {len(all_gender_files)} gender perspective files")

In [None]:
# Load gender perspective data into `df`, then standardize column names and
# dtypes so the later cells can rely on the English column labels.
df = None

# Take the first discovered file whose path mentions "consolidated", "gender"
# or "genero" and that pandas can actually parse. All three tokens are compared
# case-insensitively.
preferred_tokens = ('consolidated', 'gender', 'genero')
for file_path in all_gender_files:
    path_text = str(file_path).lower()
    if not any(token in path_text for token in preferred_tokens):
        continue
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Best-effort loader: report the failure and try the next candidate.
        print(f"Error loading {file_path}: {e}")
        continue
    print(f"Loaded data from: {file_path}")
    break

# If no preferred file could be loaded, fall back to the first available file
if df is None and all_gender_files:
    try:
        df = pd.read_csv(all_gender_files[0])
        print(f"Loaded data from: {all_gender_files[0]}")
    except Exception as e:
        print(f"Error loading {all_gender_files[0]}: {e}")

# If no files found, create sample data
if df is None:
    print("No gender perspective data found, creating sample data for demonstration")

    # Sample data for gender perspective analysis
    data = {
        'year': [2019, 2020, 2021, 2022, 2023],
        'gender_inclusive_budget': [100000, 120000, 150000, 180000, 200000],
        'total_budget': [1000000, 1100000, 1200000, 1300000, 1400000],
        'female_beneficiaries': [500, 550, 600, 650, 700],
        'male_beneficiaries': [450, 500, 550, 600, 650],
        'female_staff': [45, 47, 49, 51, 53],
        'male_staff': [55, 53, 51, 49, 47]
    }
    df = pd.DataFrame(data)

# Standardize column names (Spanish source headers -> English labels)
column_mapping = {
    'Año': 'year',
    'anio': 'year',
    'Ejercicio': 'year',
    'Presupuesto con Perspectiva de Género': 'gender_inclusive_budget',
    'Presupuesto Total': 'total_budget',
    'Beneficiarias Mujeres': 'female_beneficiaries',
    'Beneficiarios Hombres': 'male_beneficiaries',
    'Personal Mujeres': 'female_staff',
    'Personal Hombres': 'male_staff',
    'Mujeres': 'female',
    'Hombres': 'male',
    'ENE': 'january',
    'FEB': 'february',
    'MAR': 'march',
    'ABR': 'april',
    'MAY': 'may',
    'JUN': 'june',
    'JUL': 'july',
    'AGO': 'august',
    'SEP': 'september',
    'OCT': 'october',
    'NOV': 'november',
    'DIC': 'december'
}

# Headers with stray leading/trailing whitespace would otherwise miss the
# exact-match mapping above.
df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]

df = df.rename(columns=column_mapping)

# Several source headers map to the same label (e.g. 'Año', 'anio' and
# 'Ejercicio' all become 'year'). If a file carries more than one of them,
# df['year'] would be a DataFrame and the numeric conversion below would fail,
# so keep only the first occurrence of each label.
duplicate_mask = df.columns.duplicated()
if duplicate_mask.any():
    dropped_labels = list(dict.fromkeys(df.columns[duplicate_mask]))
    print(f"Dropping duplicate columns after renaming: {dropped_labels}")
    df = df.loc[:, ~duplicate_mask].copy()

# Convert columns to appropriate types; unparseable values become NaN
numeric_cols = ['gender_inclusive_budget', 'total_budget', 'female_beneficiaries',
                'male_beneficiaries', 'female_staff', 'male_staff']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

if 'year' in df.columns:
    # Nullable integer dtype so missing years do not force the column to float
    df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')

print(f"Dataset shape: {df.shape}")
print(df.head())
print(f"\nColumns: {list(df.columns)}")

In [None]:
# Derive percentage and total columns from the raw budget/beneficiary columns.
# Each derived column is only added when the columns it depends on exist.

# Calculate gender inclusive budget percentage (0 where the total budget is 0)
if {'gender_inclusive_budget', 'total_budget'}.issubset(df.columns):
    df['gender_inclusive_percentage'] = np.where(
        df['total_budget'] != 0,
        (df['gender_inclusive_budget'] / df['total_budget']) * 100,
        0,
    )

# Calculate total beneficiaries
if {'female_beneficiaries', 'male_beneficiaries'}.issubset(df.columns):
    df['total_beneficiaries'] = df['female_beneficiaries'] + df['male_beneficiaries']

# Calculate gender distribution
if {'total_beneficiaries', 'female_beneficiaries'}.issubset(df.columns):
    has_beneficiaries = df['total_beneficiaries'] != 0
    df['female_percentage'] = np.where(
        has_beneficiaries,
        (df['female_beneficiaries'] / df['total_beneficiaries']) * 100,
        0,
    )
    # Rows without any beneficiaries get 0% for both genders; a plain
    # "100 - female_percentage" would report them as 100% male.
    df['male_percentage'] = np.where(has_beneficiaries, 100 - df['female_percentage'], 0)

# Display the updated dataframe
print(df.head())

In [None]:
# Grouped bar chart: gender inclusive budget next to the total budget, per year.
# The chart is only drawn when all three columns it reads are present.
budget_chart_columns = ('year', 'gender_inclusive_budget', 'total_budget')
if not all(name in df.columns for name in budget_chart_columns):
    print("Required columns for gender inclusive budget chart not found")
else:
    budget_series = ['gender_inclusive_budget', 'total_budget']
    budget_colors = {'gender_inclusive_budget': 'purple', 'total_budget': 'blue'}
    budget_labels = {'value': 'Amount (ARS)', 'variable': 'Type'}

    fig_inclusive_budget = px.bar(
        df,
        x='year',
        y=budget_series,
        title="Gender Inclusive Budget vs Total Budget Over Time",
        barmode='group',
        labels=budget_labels,
        color_discrete_map=budget_colors,
    )

    # Print each bar's value above it, formatted with thousands separators
    fig_inclusive_budget.update_traces(textposition='outside', texttemplate='%{y:,.0f}')

    fig_inclusive_budget.update_layout(
        xaxis_title="Year",
        yaxis_title="Amount (ARS)",
        legend_title="Type",
        uniformtext_minsize=8,
        uniformtext_mode='hide',
    )

    fig_inclusive_budget.show()

In [None]:
# Line chart of the gender inclusive budget share (percent of total) per year.
if 'year' in df.columns and 'gender_inclusive_percentage' in df.columns:
    fig_trend = px.line(df,
                        x='year',
                        y='gender_inclusive_percentage',
                        title="Gender Inclusive Budget Percentage Over Time",
                        labels={'gender_inclusive_percentage': 'Percentage of Total Budget (%)'},
                        markers=True)

    trend_layout = dict(
        yaxis_title="Percentage of Total Budget (%)",
        xaxis_title="Year",
    )

    # Series.max() skips NaN and returns NaN for an empty/all-NaN column,
    # whereas the builtin max() raises on an empty column and gives
    # order-dependent results when NaN is present.
    peak_percentage = df['gender_inclusive_percentage'].max()
    if pd.notna(peak_percentage) and peak_percentage > 0:
        # Leave 10% headroom above the highest point
        trend_layout['yaxis_range'] = [0, peak_percentage * 1.1]
    # Otherwise let plotly choose the range; [0, 0] would be degenerate.

    fig_trend.update_layout(**trend_layout)

    fig_trend.show()
else:
    print("Required columns for gender inclusive percentage trend chart not found")

In [None]:
# Stacked bar chart: female and male beneficiaries per year.
beneficiary_columns = ['female_beneficiaries', 'male_beneficiaries']
if 'year' not in df.columns or any(name not in df.columns for name in beneficiary_columns):
    print("Required columns for beneficiaries chart not found")
else:
    # Reshape to long form: one row per (year, gender) pair
    beneficiary_df = df[['year'] + beneficiary_columns].melt(
        id_vars=['year'],
        value_vars=beneficiary_columns,
        var_name='gender',
        value_name='count',
    )
    # Turn the source column names into display labels
    gender_display = {'female_beneficiaries': 'Female', 'male_beneficiaries': 'Male'}
    beneficiary_df['gender'] = beneficiary_df['gender'].map(gender_display)

    gender_colors = {'Female': 'pink', 'Male': 'lightblue'}
    fig_beneficiaries = px.bar(
        beneficiary_df,
        x='year',
        y='count',
        color='gender',
        title="Beneficiaries by Gender and Year",
        barmode='stack',
        labels={'count': 'Number of Beneficiaries', 'year': 'Year'},
        color_discrete_map=gender_colors,
    )

    fig_beneficiaries.update_layout(
        xaxis_title="Year",
        yaxis_title="Number of Beneficiaries",
        legend_title="Gender",
    )

    fig_beneficiaries.show()

In [None]:
# Single-row heatmap of the gender inclusive budget percentage, one cell per year.
heatmap_ready = 'year' in df.columns and 'gender_inclusive_percentage' in df.columns
if not heatmap_ready:
    print("Required columns for heatmap not found")
else:
    heatmap_df = df.set_index('year')[['gender_inclusive_percentage']]
    # One row (the metric) by one column per year
    percentage_grid = heatmap_df.to_numpy().T

    fig_heatmap = px.imshow(
        percentage_grid,
        x=heatmap_df.index,
        y=['Gender Inclusive Budget %'],
        color_continuous_scale='viridis',
        title="Heatmap: Gender Inclusive Budget Percentage by Year",
        labels={'x': 'Year', 'y': 'Metric', 'color': 'Percentage (%)'},
    )

    fig_heatmap.update_layout(xaxis_title="Year", yaxis_title="Metric")

    fig_heatmap.show()

In [None]:
# Two independent pie charts:
#   1. how the gender inclusive budget is spread across years;
#   2. the overall female/male split of beneficiaries.
# Each chart is guarded by its own column check, so the beneficiaries pie is
# still drawn when the budget columns are missing (and vice versa). The
# "not found" message is printed only when neither chart could be drawn.
pie_chart_drawn = False

# Pie chart showing distribution of gender inclusive budget by year
if 'year' in df.columns and 'gender_inclusive_budget' in df.columns:
    fig_pie = px.pie(df,
                     values='gender_inclusive_budget',
                     names='year',
                     title="Distribution of Gender Inclusive Budget by Year",
                     hover_data=['gender_inclusive_budget'],
                     labels={'gender_inclusive_budget':'Amount (ARS)'})

    fig_pie.update_traces(textposition='inside', textinfo='percent+label')

    fig_pie.show()
    pie_chart_drawn = True

# Pie chart showing gender distribution of total beneficiaries
if 'total_beneficiaries' in df.columns and 'female_beneficiaries' in df.columns:
    total_female = df['female_beneficiaries'].sum()
    if 'male_beneficiaries' in df.columns:
        total_male = df['male_beneficiaries'].sum()
    else:
        # No explicit male column: derive it from the grand total
        total_male = df['total_beneficiaries'].sum() - total_female

    gender_dist_df = pd.DataFrame({
        'gender': ['Female', 'Male'],
        'count': [total_female, total_male]
    })

    fig_gender_pie = px.pie(gender_dist_df,
                            values='count',
                            names='gender',
                            title="Overall Gender Distribution of Beneficiaries",
                            hover_data=['count'],
                            labels={'count':'Number of Beneficiaries'},
                            color_discrete_sequence=['pink', 'lightblue'])

    fig_gender_pie.update_traces(textposition='inside', textinfo='percent+label')

    fig_gender_pie.show()
    pie_chart_drawn = True

if not pie_chart_drawn:
    print("Required columns for pie charts not found")

In [None]:
# Dual-axis chart: total beneficiaries as bars (left axis) with the female
# share drawn as a line on a secondary 0-100% axis (right).
dual_axis_columns = ('year', 'total_beneficiaries', 'female_percentage')
if not all(name in df.columns for name in dual_axis_columns):
    print("Required columns for dual axis chart not found")
else:
    # Bars: total beneficiaries per year
    beneficiaries_bar = go.Bar(
        x=df['year'],
        y=df['total_beneficiaries'],
        name='Total Beneficiaries',
        marker_color='blue',
    )

    # Line: female percentage, bound to the secondary y axis
    female_share_line = go.Scatter(
        x=df['year'],
        y=df['female_percentage'],
        name='Female %',
        yaxis='y2',
        mode='lines+markers',
        line=dict(color='red', width=3),
    )

    fig_dual = go.Figure()
    for trace in (beneficiaries_bar, female_share_line):
        fig_dual.add_trace(trace)

    secondary_axis = dict(
        title="Female Percentage (%)",
        overlaying='y',
        side='right',
        range=[0, 100],
    )
    fig_dual.update_layout(
        title="Total Beneficiaries and Female Percentage Over Time",
        xaxis_title="Year",
        yaxis_title="Total Beneficiaries",
        yaxis2=secondary_axis,
        legend_title="Metric",
    )

    fig_dual.show()

In [None]:
# Summary statistics: overall totals, gender split, year-over-year growth of
# the gender inclusive budget, and a per-column completeness report.
# Every section is skipped when the columns it needs are not present.
print("Summary Statistics:")
if 'gender_inclusive_budget' in df.columns:
    print(f"- Total Gender Inclusive Budget: {df['gender_inclusive_budget'].sum():,.2f} ARS")
if 'total_budget' in df.columns and 'gender_inclusive_budget' in df.columns:
    total_budget_sum = df['total_budget'].sum()
    if total_budget_sum != 0:
        overall_percentage = (df['gender_inclusive_budget'].sum() / total_budget_sum) * 100
        print(f"- Percentage of Total Budget: {overall_percentage:.2f}%")
    else:
        # A zero total would make the ratio a division by zero.
        overall_percentage = float('nan')
        print("- Percentage of Total Budget: n/a (total budget is zero)")
if 'female_beneficiaries' in df.columns:
    print(f"- Total Female Beneficiaries: {df['female_beneficiaries'].sum():,}")
if 'male_beneficiaries' in df.columns:
    print(f"- Total Male Beneficiaries: {df['male_beneficiaries'].sum():,}")

# Calculate gender distribution if possible
if 'female_beneficiaries' in df.columns and 'total_beneficiaries' in df.columns:
    total_beneficiaries = df['total_beneficiaries'].sum()
    if total_beneficiaries != 0:
        female_share = (df['female_beneficiaries'].sum() / total_beneficiaries) * 100
        male_share = 100 - female_share
    else:
        # No beneficiaries at all: report 0% for both rather than 0% / 100%.
        female_share = 0
        male_share = 0

    print(f"\nGender Distribution of Beneficiaries:")
    print(f"  Female: {female_share:.2f}%")
    print(f"  Male: {male_share:.2f}%")

# Year-over-year analysis if possible
if 'year' in df.columns and 'gender_inclusive_budget' in df.columns:
    print(f"\nYear-over-Year Analysis:")
    df_sorted = df.sort_values('year')
    years = df_sorted['year'].tolist()
    budgets = df_sorted['gender_inclusive_budget'].tolist()
    # Walk consecutive (previous, current) rows of the year-sorted data
    for prev_year, curr_year, prev_budget, curr_budget in zip(years, years[1:], budgets, budgets[1:]):
        # Growth is undefined from a zero base or when either value is missing
        if pd.isna(prev_budget) or pd.isna(curr_budget) or prev_budget == 0:
            continue
        growth_rate = ((curr_budget - prev_budget) / prev_budget) * 100
        print(f"  {prev_year} to {curr_year}: {growth_rate:+.2f}%")

# Data completeness report
total_records = len(df)
print(f"\nData Completeness Report:")
print(f"- Total records: {total_records}")
for col in df.columns:
    non_null_count = df[col].notna().sum()
    completeness = (non_null_count / total_records) * 100 if total_records > 0 else 0
    print(f"- {col}: {completeness:.1f}% complete ({non_null_count}/{total_records} records)")