<a href="https://colab.research.google.com/github/cray0101/DevvitApps/blob/main/H1B_tournament.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =============================================================================
# H1B VISA ALLOCATION SIMULATOR: "WORLD CUP" MODEL
#
# This notebook implements and visualizes a two-phase, tournament-style system.
# It includes both a system-wide simulation and an individual applicant estimator.
# =============================================================================

# --- SETUP: Import Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, Layout, VBox, HBox
from IPython.display import display, clear_output

# Set plot style
sns.set_style("whitegrid")

# --- DATA PREPARATION ---

def load_initial_country_data():
    """Loads petition data from FY2019 for top countries."""
    data = {
        'India': 301616, 'China': 51317, 'Canada': 4165, 'South Korea': 3653,
        'Philippines': 3362, 'Mexico': 3109, 'Taiwan': 2707, 'Pakistan': 2147,
        'Brazil': 1888, 'United Kingdom': 1845, 'Nigeria': 1686, 'Nepal': 1675,
        'Japan': 1553, 'France': 1228, 'Iran': 1133, 'Venezuela': 1073,
        'Russia': 945, 'Colombia': 920, 'Germany': 910, 'Australia': 880
    }
    total_petitions = 424223
    data['Other'] = total_petitions - sum(data.values())
    return pd.DataFrame(list(data.items()), columns=['Country', 'Applicants'])

def generate_applicant_profiles(country_df, scholarship_rate=0.05):
    """Generates a detailed applicant DataFrame with simulated wage levels and scholarships."""
    print("Generating detailed applicant profiles... (This may take a moment)")
    wage_dist = {1: 0.35, 2: 0.30, 3: 0.20, 4: 0.15}
    all_applicants = [
        pd.DataFrame({
            'Country': row['Country'],
            'WageLevel': np.random.choice(list(wage_dist.keys()), size=row['Applicants'], p=list(wage_dist.values())),
            'HasScholarship': np.random.choice([False, True], size=row['Applicants'], p=[1 - scholarship_rate, scholarship_rate])
        }) for _, row in country_df.iterrows()
    ]
    print("Applicant profiles generated.")
    return pd.concat(all_applicants, ignore_index=True)

# --- CORE SIMULATION LOGIC ---

def run_world_cup_simulation(alpha, num_qualifiers, wage_weights, scholarship_multiplier, applicants_df, total_visas=85000):
    """Runs the full two-phase, tournament-style simulation."""
    sim_df = applicants_df.copy()
    sim_df['Tickets'] = sim_df['WageLevel'].map(wage_weights)
    sim_df.loc[sim_df['HasScholarship'], 'Tickets'] *= scholarship_multiplier

    # --- Phase 1: Group Stage (National Qualifiers) ---
    country_merit = sim_df.groupby('Country').agg(Applicants=('Country', 'size'), MeritMass=('Tickets', 'sum')).reset_index()
    country_merit['WeightedShare'] = country_merit['MeritMass']**alpha
    total_weighted_share = country_merit['WeightedShare'].sum()
    country_merit['QualifierQuota'] = (country_merit['WeightedShare'] / total_weighted_share) * num_qualifiers
    country_merit['QualifierQuota'] = country_merit['QualifierQuota'].round().astype(int)
    diff = num_qualifiers - country_merit['QualifierQuota'].sum()
    if diff != 0:
        country_merit.loc[country_merit['QualifierQuota'].idxmax(), 'QualifierQuota'] += diff

    qualifiers_df = sim_df.groupby('Country').apply(
        lambda x: x.sample(n=min(len(x), country_merit.set_index('Country').loc[x.name, 'QualifierQuota']), weights='Tickets', random_state=42)
    ).reset_index(drop=True)

    # --- Phase 2: Knockout Stage (Global Finals) ---
    final_winners_df = qualifiers_df.sample(n=min(total_visas, len(qualifiers_df)), weights='Tickets', random_state=42)

    # --- Prepare stats for plotting ---
    qualifier_counts = qualifiers_df['Country'].value_counts().reset_index().rename(columns={'count': 'Qualifiers'})
    winner_counts = final_winners_df['Country'].value_counts().reset_index().rename(columns={'count': 'Winners'})
    plot_stats = pd.merge(qualifier_counts, winner_counts, on='Country', how='outer').fillna(0)

    # Also return qualifiers_df for the estimator
    return final_winners_df, plot_stats, qualifiers_df, country_merit

# --- VISUALIZATION FUNCTIONS ---

def plot_system_results(winners_df, plot_stats_df):
    """Plots the outcomes of the system-wide simulation."""
    fig, axes = plt.subplots(4, 1, figsize=(12, 28))
    stats_sorted_qualifiers = plot_stats_df.sort_values('Qualifiers', ascending=False)
    sns.barplot(data=stats_sorted_qualifiers, x='Qualifiers', y='Country', ax=axes[0], color='#5d6d7e')
    axes[0].set_title('1. Group Stage Outcome: Qualifiers per Country', fontsize=16, loc='left')
    stats_sorted_winners = plot_stats_df.sort_values('Winners', ascending=False)
    sns.barplot(data=stats_sorted_winners, x='Winners', y='Country', ax=axes[1], color='#16a085')
    axes[1].set_title('2. Knockout Stage Outcome: Final Visas per Country', fontsize=16, loc='left')
    plot_stats_melted = plot_stats_df.melt(id_vars='Country', value_vars=['Qualifiers', 'Winners'], var_name='Stage', value_name='Count')
    top_countries = plot_stats_df.sort_values('Qualifiers', ascending=False).head(15)['Country']
    sns.barplot(data=plot_stats_melted[plot_stats_melted['Country'].isin(top_countries)], x='Count', y='Country', hue='Stage', ax=axes[2], palette='magma')
    axes[2].set_title('3. Performance: Qualifiers vs. Winners (Top 15 Countries)', fontsize=16, loc='left')
    sns.countplot(data=winners_df, x='WageLevel', order=[1, 2, 3, 4], palette='plasma', ax=axes[3])
    axes[3].set_title('4. Profile of Final Winners by Wage Level', fontsize=16, loc='left')
    plt.tight_layout(pad=3.0)
    plt.show()

def plot_applicant_chances(country, wage_level, has_scholarship, estimator_data):
    """Calculates and displays an individual's chances for the World Cup model."""
    wage_weights = estimator_data['wage_weights']
    scholarship_multiplier = estimator_data['scholarship_multiplier']
    country_stats = estimator_data['country_stats']
    global_qualifier_stats = estimator_data['global_qualifier_stats']

    my_tickets = wage_weights.get(wage_level, 0)
    if has_scholarship:
        my_tickets *= scholarship_multiplier

    country_info = country_stats.loc[country]
    qualifier_quota = country_info['QualifierQuota']
    total_tickets_in_country = country_info['MeritMass']

    # Calculate Phase 1 Probability
    prob_qualify = (my_tickets / total_tickets_in_country) * qualifier_quota if total_tickets_in_country > 0 else 0

    # Calculate Phase 2 Probability
    total_tickets_of_qualifiers = global_qualifier_stats['TotalTickets']
    total_visas = 85000
    prob_win_final_if_qualify = (my_tickets / total_tickets_of_qualifiers) * total_visas if total_tickets_of_qualifiers > 0 else 0

    # Overall Probability
    overall_chance = prob_qualify * prob_win_final_if_qualify

    print(f"--- Your Applicant Profile ---")
    print(f"Country: {country} | Wage Level: {wage_level} | Scholarship: {'Yes' if has_scholarship else 'No'}")
    print(f"Your Merit Score (Tickets): {my_tickets}")
    print("\n--- Estimated Probabilities ---")
    print(f"1. Chance to Qualify from National 'Group Stage': {prob_qualify:.2%}")
    print(f"2. Chance to Win in Final 'Knockout Stage' (if you qualify): {prob_win_final_if_qualify:.2%}")
    print("---------------------------------")
    print(f"Overall Estimated Chance of Success: {overall_chance:.4f}%") # More precision for small numbers

    # Visualization
    fig, ax = plt.subplots(figsize=(8, 1.5))
    ax.set_xlim(0, max(0.01, overall_chance * 5))
    sns.barplot(x=[overall_chance], y=["Overall Chance"], ax=ax, color='#2ecc71')
    ax.xaxis.set_major_formatter(plt.FuncFormatter('{:.4f}%'.format))
    ax.set_title("Overall Probability Visualization")
    plt.show()


# =============================================================================
# --- MAIN EXECUTION AND INTERACTIVE DASHBOARD ---
# =============================================================================

# Load and prepare data ONCE
country_data = load_initial_country_data()
full_applicants_df = generate_applicant_profiles(country_data)

# --- PART 1: SYSTEM-WIDE SIMULATION DASHBOARD ---

system_ui = VBox([
    widgets.HTML("<h2>Part 1: H1B 'World Cup' System-Wide Simulation</h2>"),
    widgets.FloatSlider(value=0.75, min=0.01, max=1.0, step=0.01, description='Alpha (Diversity/Merit Balance):', style={'description_width': 'initial'}),
    widgets.IntSlider(value=170000, min=85001, max=250000, step=1000, description='No. of Qualifiers:', style={'description_width': 'initial'}),
    widgets.FloatSlider(value=2.0, min=1.0, max=10.0, step=0.5, description='Scholarship Multiplier:', style={'description_width': 'initial'}),
    widgets.HTML("<h4>Prevailing Wage Ticket Weights:</h4>"),
    HBox([widgets.IntText(value=1, description='L1'), widgets.IntText(value=2, description='L2'),
          widgets.IntText(value=4, description='L3'), widgets.IntText(value=8, description='L4')])
])

def run_and_plot_system_dashboard(alpha, num_qualifiers, scholarship_multiplier, l1, l2, l3, l4):
    wage_weights = {1: l1, 2: l2, 3: l3, 4: l4}
    winners, plot_stats, _, _ = run_world_cup_simulation(alpha, num_qualifiers, wage_weights, scholarship_multiplier, full_applicants_df)
    plot_system_results(winners, plot_stats)

system_out = widgets.interactive_output(run_and_plot_system_dashboard, {
    'alpha': system_ui.children[1], 'num_qualifiers': system_ui.children[2],
    'scholarship_multiplier': system_ui.children[3], 'l1': system_ui.children[5].children[0],
    'l2': system_ui.children[5].children[1], 'l3': system_ui.children[5].children[2], 'l4': system_ui.children[5].children[3]
})

# --- PART 2: INDIVIDUAL APPLICANT CHANCE ESTIMATOR ---

# Pre-compute data with default values for the estimator
def setup_estimator_data():
    print("\nPre-computing data for the Applicant Chance Estimator...")
    default_weights = {1: 1, 2: 2, 3: 4, 4: 8}
    _, _, qualifiers, country_stats = run_world_cup_simulation(0.75, 170000, default_weights, 2.0, full_applicants_df)
    estimator_data = {
        'wage_weights': default_weights,
        'scholarship_multiplier': 2.0,
        'country_stats': country_stats.set_index('Country'),
        'global_qualifier_stats': {'TotalTickets': qualifiers['Tickets'].sum()}
    }
    print("Estimator ready.")
    return estimator_data

estimator_data_precomputed = setup_estimator_data()

estimator_ui = VBox([
    widgets.HTML("<h2>Part 2: Individual Applicant Chance Estimator</h2><p>Select a profile to estimate an applicant's chance of success. This calculation is based on a pre-run simulation with default parameters (alpha=0.75, 170k qualifiers, etc.).</p>"),
    widgets.Dropdown(options=country_data['Country'].unique(), value='Canada', description='Country:', style={'description_width': 'initial'}),
    widgets.SelectionSlider(options=[('Level 1', 1), ('Level 2', 2), ('Level 3', 3), ('Level 4', 4)], value=2, description='Wage Level:', style={'description_width': 'initial'}),
    widgets.Checkbox(value=False, description='Sponsored a US Citizen STEM Scholarship?')
])

estimator_out = widgets.interactive_output(
    lambda country, wage_level, has_scholarship: plot_applicant_chances(country, wage_level, has_scholarship, estimator_data_precomputed),
    {'country': estimator_ui.children[1], 'wage_level': estimator_ui.children[2], 'has_scholarship': estimator_ui.children[3]}
)

# --- Display Everything ---
print("\n✅ Simulator is ready! Adjust the controls below to explore the 'World Cup' model.")
display(system_ui, system_out)
display(estimator_ui, estimator_out)



Generating detailed applicant profiles... (This may take a moment)
Applicant profiles generated.

Pre-computing data for the Applicant Chance Estimator...


  qualifiers_df = sim_df.groupby('Country').apply(


Estimator ready.

✅ Simulator is ready! Adjust the controls below to explore the 'World Cup' model.


VBox(children=(HTML(value="<h2>Part 1: H1B 'World Cup' System-Wide Simulation</h2>"), FloatSlider(value=0.75, …

Output()

VBox(children=(HTML(value="<h2>Part 2: Individual Applicant Chance Estimator</h2><p>Select a profile to estima…

Output()