In [13]:
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Export directory: figures produced below are also saved there as standalone HTML.
ASSETS_DIR = "../assets"
os.makedirs(ASSETS_DIR, exist_ok=True)

# Track 1 checkpoint (behavioral data), reloaded from the processed CSV.
df_behavior = pd.read_csv("../data/processed/speed_dating_behavior.csv")

# ------------------------------------------
# 1. Global Match Rate
# ------------------------------------------

# NOTE(review): df_cleaned is not created anywhere in this cell — presumably it
# comes from an earlier cell; confirm this still holds after Restart & Run All.
# Readable labels for the coded columns (0 = Female, 1 = Male per the dataset key).
df_cleaned['gender_label'] = df_cleaned['gender'].map({0: 'Female', 1: 'Male'})
df_cleaned['match_label'] = df_cleaned['match'].map({0: 'No Match', 1: 'Match'})

# One row per match status together with its number of dates.
match_counts = (
    df_cleaned['match_label']
    .value_counts()
    .rename_axis('Match Status')
    .reset_index(name='Count')
)

# Donut chart of the overall match / no-match split.
fig_global = px.pie(
    match_counts,
    values='Count',
    names='Match Status',
    color='Match Status',
    color_discrete_map={'No Match': '#EF553B', 'Match': '#00CC96'},
    hole=0.4,
    title='Global Match Rate in Speed Dating Experiment',
)
fig_global.update_traces(textinfo='percent+label', textposition='inside')
fig_global.show()

global_match_path = os.path.join(ASSETS_DIR, "B01-pie_chart-Global_Match_Rate.html")
fig_global.write_html(global_match_path)

# ------------------------------------------
# 2. Match Rate by Gender
# ------------------------------------------

# Number of dates per (gender, match status) pair ...
gender_match_stats = (
    df_cleaned
    .groupby(['gender_label', 'match_label'])
    .size()
    .reset_index(name='Count')
)
# ... and per gender overall, used as the denominator of the percentage.
gender_totals = (
    df_cleaned
    .groupby('gender_label')
    .size()
    .reset_index(name='Total')
)
gender_match_stats = gender_match_stats.merge(gender_totals, on='gender_label')
gender_match_stats['Percentage'] = (
    gender_match_stats['Count'].div(gender_match_stats['Total']).mul(100)
)

# Stacked bars: each gender's dates split into Match / No Match shares.
fig_gender = px.bar(
    gender_match_stats,
    x='gender_label',
    y='Percentage',
    color='match_label',
    barmode='stack',
    color_discrete_map={'No Match': '#EF553B', 'Match': '#00CC96'},
    labels={'gender_label': 'Gender', 'Percentage': 'Percentage of Total Dates (%)'},
    text=gender_match_stats['Percentage'].map('{:.1f}%'.format),
    title='Match Distribution by Gender',
)
fig_gender.update_layout(yaxis_title="Percentage (%)", xaxis_title="Gender")
fig_gender.show()

gender_match_path = os.path.join(ASSETS_DIR, "B02-bar_chart-Match_Rate_by_Gender.html")
fig_gender.write_html(gender_match_path)

# ==========================================
# 3. Individual Decision rates by Gender
# ==========================================

# Build the labelled frame from df_behavior alone. The previous version wrote
# df_behavior['dec'] into df_cleaned, which only works if both frames share the
# exact same row index; any filtering or re-indexing between them would silently
# attach decisions to the wrong rows (or produce NaN labels).
# NOTE(review): results are unchanged only if df_cleaned and df_behavior hold the
# same dates — confirm against the preprocessing step.
decisions = df_behavior.assign(
    gender_label=df_behavior['gender'].map({0: 'Female', 1: 'Male'}),  # 0 = Female, 1 = Male
    dec_label=df_behavior['dec'].map({0: 'No', 1: 'Yes'}),             # 0 = No, 1 = Yes
)

# Count decisions per (gender, decision) pair and per gender overall
dec_stats = decisions.groupby(['gender_label', 'dec_label']).size().reset_index(name='Count')
gender_totals_dec = decisions.groupby('gender_label').size().reset_index(name='Total')

# Merge to calculate the share of each decision within a gender
dec_stats = pd.merge(dec_stats, gender_totals_dec, on='gender_label')
dec_stats['Percentage'] = (dec_stats['Count'] / dec_stats['Total']) * 100

# Visualize the decision distribution by gender
fig_dec = px.bar(
    dec_stats,
    x='gender_label',
    y='Percentage',
    color='dec_label',
    title='Individual Decision Rates by Gender ("Swipe Right")',
    labels={
        'gender_label': 'Gender',
        'Percentage': 'Percentage of Decisions (%)',
        'dec_label': 'Decision'
    },
    color_discrete_map={'No': '#EF553B', 'Yes': '#00CC96'},
    barmode='stack',
    text=dec_stats['Percentage'].apply(lambda x: f'{x:.1f}%')
)

fig_dec.update_layout(xaxis_title="Gender", yaxis_title="Percentage (%)")
fig_dec.show()
# Exported as B03: the interpretation text refers to this figure as B03, and the
# B04 prefix is already used by the partner-attribute chart.
# NOTE(review): update any consumer that still loads the old "B04-..." file name.
fig_dec.write_html(os.path.join(ASSETS_DIR, "B03-bar_chart-Individual_Decision_rates_by_Gender.html"))


### Visualisation Interpretations: Target Variable & Individual Decisions
* **Global Match Rate (B01 & B02):** The global match rate is highly imbalanced, with mutual matches occurring in less than 20% of the speed dates. This confirms that finding mutual interest is a rare event, regardless of gender.
* **Behavioral Differences / "Swipe Right" Asymmetry (B03):** While mutual matches are naturally equal for both genders, individual behavior is highly asymmetrical. Males express interest ("Yes") at a significantly higher rate than females. In a Tinder context, men act as the primary drivers of potential match volume, while female selectivity acts as the bottleneck for the global match rate.

In [14]:
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Export directory for the HTML versions of the figures.
ASSETS_DIR = "../assets"
os.makedirs(ASSETS_DIR, exist_ok=True)

# Track 1 checkpoint (behavioral data), reloaded from the processed CSV.
df_behavior = pd.read_csv("../data/processed/speed_dating_behavior.csv")

# ------------------------------------------
# 4. Impact of Partner Attributes (Correlation)
# ------------------------------------------

# In-the-moment rating columns that are correlated with the decision.
attributes = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']

# Add a readable gender column unless the CSV already carries one.
if 'gender_label' not in df_behavior.columns:
    df_behavior['gender_label'] = df_behavior['gender'].map({0: 'Female', 1: 'Male'})

# Pearson correlation between each rating and the binary decision ('dec'),
# computed separately on the rows of each gender.
rows_by_gender = {
    label: df_behavior[df_behavior['gender_label'] == label]
    for label in ['Female', 'Male']
}
correlations = [
    {
        'Gender': gender,
        'Attribute': attr.upper(),
        'Correlation with Decision': subset[attr].corr(subset['dec']),
    }
    for gender, subset in rows_by_gender.items()
    for attr in attributes
]
corr_df = pd.DataFrame(correlations)

# Order the x-axis by the mean correlation across both genders (strongest first).
mean_corr_by_attribute = corr_df.groupby('Attribute')['Correlation with Decision'].mean()
order = mean_corr_by_attribute.sort_values(ascending=False).index
corr_df = corr_df.assign(
    Attribute=pd.Categorical(corr_df['Attribute'], categories=order, ordered=True)
).sort_values('Attribute')

# Grouped bars: one pair of bars (Female / Male) per attribute.
fig_attr = px.bar(
    corr_df,
    x='Attribute',
    y='Correlation with Decision',
    color='Gender',
    barmode='group',
    color_discrete_map={'Female': '#EF553B', 'Male': '#00CC96'},
    labels={
        'Attribute': 'Rated Attribute',
        'Correlation with Decision': 'Correlation coefficient (r)'
    },
    text=corr_df['Correlation with Decision'].map('{:.2f}'.format),
    title='Impact of Partner Attributes on the Decision to "Swipe Right"',
)

fig_attr.update_traces(textposition='outside')
# The fixed y-range starts at 0, so a negative correlation would be cut off.
fig_attr.update_layout(
    yaxis_range=[0, 0.6],
    xaxis_title="Partner Attribute",
    yaxis_title="Correlation with Positive Decision",
)
fig_attr.show()

attr_chart_path = os.path.join(ASSETS_DIR, "B04-bar_chart-Impact_of_Partner_Attributes.html")
fig_attr.write_html(attr_chart_path)

# ==========================================
# 5. Least Desirable Attributes (Effect Size)
# ==========================================
# Rating columns compared between 'Yes' and 'No' decisions. Defined here because
# the name was previously only created in a later cell (section 8), so this cell
# raised NameError on a fresh kernel / Restart & Run All.
ratings_cols = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']

# Evaluate attributes based on the gender of the partner being rated
# If the evaluating participant is Female (0), the partner is Male (1).
df_behavior['partner_gender'] = df_behavior['gender'].map({0: 'Male', 1: 'Female'})

effect_sizes = []
for p_gender in ['Male', 'Female']:
    subset = df_behavior[df_behavior['partner_gender'] == p_gender]

    # Mean rating per attribute when the decision is 'No' (0) vs 'Yes' (1)
    means_dec0 = subset[subset['dec'] == 0][ratings_cols].mean()
    means_dec1 = subset[subset['dec'] == 1][ratings_cols].mean()

    # Effect size = how much higher the mean rating is on accepted dates
    diff = (means_dec1 - means_dec0).reset_index()
    diff.columns = ['Attribute', 'Effect Size (Mean Difference)']
    diff['Partner Gender'] = p_gender
    effect_sizes.append(diff)

df_effects = pd.concat(effect_sizes)
df_effects['Attribute'] = df_effects['Attribute'].str.upper()

# Sort for better visualization: by partner gender, then largest effect first
df_effects = df_effects.sort_values(by=['Partner Gender', 'Effect Size (Mean Difference)'], ascending=[True, False])

fig_effect = px.bar(
    df_effects,
    x='Attribute',
    y='Effect Size (Mean Difference)',
    color='Partner Gender',
    barmode='group',
    title="Attribute Effect Size: What drives a 'Yes' (dec=1 vs dec=0)",
    color_discrete_map={'Female': '#EF553B', 'Male': '#00CC96'}
)
fig_effect.update_layout(yaxis_title="Difference in Mean Rating")
fig_effect.show()
fig_effect.write_html(os.path.join(ASSETS_DIR, "B05-bar_effect-Least_desirable_attributes.html"))


### Visualisation Interpretations: Attribute Desirability & Effect Size
* **Most Desirable Attributes (B04):** For both genders, Attractiveness (ATTR) and Fun (FUN) have the strongest positive correlation with a "Yes" decision. The correlation between physical attractiveness and the final decision is noticeably higher for men evaluating women, confirming visual appeal drives male decisions more heavily.
* **Least Desirable Attributes (B05):** Ambition (AMB) and Sincerity (SINC) show the lowest correlation and the smallest Effect Size (mean difference between a "Yes" and "No"). Lacking ambition or sincerity makes a partner only marginally less desirable in a rapid speed-dating context compared to lacking attractiveness or fun, which trigger massive score drops when a "No" is given.

In [15]:
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Export directory for the HTML versions of the figures.
ASSETS_DIR = "../assets"
os.makedirs(ASSETS_DIR, exist_ok=True)

# Track 1 checkpoint (behavioral data), reloaded from the processed CSV.
df_behavior = pd.read_csv("../data/processed/speed_dating_behavior.csv")

# ------------------------------------------
# 6. Shared Interests vs. Shared Racial Background
# ------------------------------------------
# Pairwise correlations between the decision and the interest / race / religion columns.
corr_cols_1 = ['dec', 'samerace', 'shar', 'imprace', 'imprelig']
corr_matrix_1 = df_behavior.loc[:, corr_cols_1].corr()

# Diverging colour scale pinned to [-1, 1] so that zero sits at the midpoint.
fig_corr1 = px.imshow(
    corr_matrix_1,
    zmin=-1,
    zmax=1,
    color_continuous_scale="RdBu_r",
    text_auto=".2f",
    title="Correlation: Shared Interests vs. Shared Race & Decision",
)
fig_corr1.show()

heatmap_1_path = os.path.join(ASSETS_DIR, "B06-heatmap-shared_interests_vs_race.html")
fig_corr1.write_html(heatmap_1_path)

# ------------------------------------------
# 7. Distribution Analysis: Violin & Box Plot Shared Interest vs Decision
# ------------------------------------------
# Readable decision labels for the x-axis and the legend.
df_behavior['dec_label'] = df_behavior['dec'].map({0: 'No', 1: 'Yes'})

# One violin per decision; box=True draws a box plot inside each violin and
# points=False leaves out the individual observations.
fig_violin = px.violin(
    df_behavior,
    y='shar',
    x='dec_label',
    color='dec_label',
    color_discrete_map={'No': '#EF553B', 'Yes': '#00CC96'},
    box=True,
    points=False,
    labels={'dec_label': 'Decision', 'shar': 'Shared Interests Score (1-10)'},
    title="Distribution of Shared Interests (SHAR) Rating vs. Final Decision",
)
fig_violin.update_layout(
    yaxis_title="Shared Interests Rating (1-10)",
    xaxis_title="Decision (Swipe Right)",
)
fig_violin.show()

violin_path = os.path.join(ASSETS_DIR, "B07-violin-shared_interest_vs_decision.html")
fig_violin.write_html(violin_path)


# ------------------------------------------
# 8. Scorecard Ratings (Multicollinearity Check)
# ------------------------------------------
# Correlations among the six scorecard ratings themselves.
ratings_cols = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']
corr_matrix_2 = df_behavior.loc[:, ratings_cols].corr()

# No zmin/zmax is set here, so the colour range follows the data.
fig_corr2 = px.imshow(
    corr_matrix_2,
    color_continuous_scale="Viridis",
    text_auto=".2f",
    title="Correlation Matrix: Scorecard Ratings (Multicollinearity)",
)
fig_corr2.show()

heatmap_2_path = os.path.join(ASSETS_DIR, "B08-heatmap-scorecard_ratings.html")
fig_corr2.write_html(heatmap_2_path)



### Visualisation Interpretations: Interests, Background, & The Halo Effect
* **Shared Interests vs. Shared Race (B06):** In-the-moment Shared Interests (`shar`) possess a vastly stronger correlation with a positive decision than the binary `samerace` indicator. Behavioral engagement during the date overrides background demographics.
* **Shared Interests Distribution (B07):** The embedded box plot shows that the median Shared Interests rating is higher for a "Yes". However, the violin density shows many dates resulted in moderately high shared interest scores (5-6/10) but still ended in rejection. Shared interests alone cannot save a date if primary drivers are missing.
* **The "Halo Effect" / Multicollinearity (B08):** The scorecard correlation matrix reveals strong positive correlations between primary traits (Attractiveness) and secondary traits (Fun). Excelling in visual appeal artificially boosts a partner's perceived score in other personality traits.

In [16]:
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# 1. Setup Export Directory
ASSETS_DIR = "../assets"
os.makedirs(ASSETS_DIR, exist_ok=True)

# Load the Track 1 checkpoint (Behavioral Data)
df_behavior = pd.read_csv("../data/processed/speed_dating_behavior.csv")

# ==========================================
# 9. Self-Perception vs. Market Value
# ==========================================

rated_attrs = ['attr', 'sinc', 'intel', 'fun', 'amb']
partner_score_cols = [f'{col}_o' for col in rated_attrs]

# 1. Calculate Average Received Ratings per Individual (Market Value)
# The attr/sinc/... columns hold the scores the row's participant (iid) GIVES to
# the partner (section 5 treats `gender` as the evaluator and maps the partner to
# the opposite gender). Averaging them by iid therefore measures how generously a
# person rates others, not how they are rated. Received ratings must be
# aggregated over the person being rated instead.
# NOTE(review): assumes `pid` is the partner's iid and `<attr>_o` is the partner's
# rating of the participant, as in the Speed Dating data key — confirm that the
# processed CSV keeps at least one of them.
if 'pid' in df_behavior.columns:
    # groupby drops rows with a missing pid, so only identified partners are averaged.
    received_ratings = df_behavior.groupby('pid')[rated_attrs].mean().reset_index()
    # pid may be parsed as float when it contains missing values; align it with iid for the merge.
    received_ratings['pid'] = received_ratings['pid'].astype(df_behavior['iid'].dtype)
elif set(partner_score_cols).issubset(df_behavior.columns):
    received_ratings = df_behavior.groupby('iid')[partner_score_cols].mean().reset_index()
else:
    import warnings
    warnings.warn(
        "Neither 'pid' nor the '<attr>_o' columns are present in df_behavior; "
        "falling back to ratings GIVEN per iid, which is not a market-value measure."
    )
    received_ratings = df_behavior.groupby('iid')[rated_attrs].mean().reset_index()
received_ratings.columns = ['iid', 'ATTR_received', 'SINC_received', 'INTEL_received', 'FUN_received', 'AMB_received']

# 2. Extract Self-Ratings per Individual (Time 1 surveys ending in '3_1')
self_ratings_cols = ['iid', 'attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1']
# Self-ratings are static per individual across all their dates; de-duplicating on
# iid guarantees exactly one row per participant even if some rows differ (e.g. NaN).
self_ratings = df_behavior[self_ratings_cols].drop_duplicates(subset='iid')
self_ratings.columns = ['iid', 'ATTR_self', 'SINC_self', 'INTEL_self', 'FUN_self', 'AMB_self']

# 3. Merge and Calculate Global Averages for Comparison
perception_df = pd.merge(self_ratings, received_ratings, on='iid')
mean_comparison = perception_df.drop(columns='iid').mean().reset_index()
mean_comparison.columns = ['Metric', 'Average Score (1-10)']

# Parse the metric names to separate the Attribute from the Perspective (Self vs Received)
mean_comparison['Attribute'] = mean_comparison['Metric'].apply(lambda x: x.split('_')[0])
mean_comparison['Perspective'] = mean_comparison['Metric'].apply(lambda x: 'Self-Rating' if 'self' in x else 'Received Rating')

# 4. Visualize Self-Perception vs. Reality
fig_perception = px.bar(
    mean_comparison,
    x='Attribute',
    y='Average Score (1-10)',
    color='Perspective',
    barmode='group',
    title='Self-Perception vs. Reality: Do people know their market value?',
    color_discrete_map={'Self-Rating': '#636EFA', 'Received Rating': '#FECB52'},
    text=mean_comparison['Average Score (1-10)'].apply(lambda x: f'{x:.1f}')
)
fig_perception.update_traces(textposition='outside')
fig_perception.update_layout(yaxis_range=[0, 10])
fig_perception.show()
fig_perception.write_html(os.path.join(ASSETS_DIR, "B09-bar_chart-Self-Perception_vs_Market_Value.html"))

# ==========================================
# 10. The Order Effect (First vs. Last)
# ==========================================

# 1. Calculate the 'Yes' decision rate per date order
order_stats = df_behavior.groupby('order')['dec'].mean().reset_index()
order_stats['dec_percentage'] = order_stats['dec'] * 100

# Some participants had up to 22 dates, but sample size drops heavily after ~15.
# Keep only the first 15 positions so each plotted rate rests on enough dates.
order_stats_filtered = order_stats[order_stats['order'] <= 15]

# 2. Visualize the Order Effect
fig_order = px.line(
    order_stats_filtered,
    x='order',
    y='dec_percentage',
    markers=True,
    title='The Order Effect: Does the sequence of the date matter?',
    labels={'order': 'Date Chronological Order (1st, 2nd, 3rd...)', 'dec_percentage': 'Probability of a "Yes" Decision (%)'}
)

# Add an actual trend line. The previous trace re-plotted the same x/y points as
# the main line, so it only drew a dotted copy on top of it. Here an ordinary
# least-squares line is fitted to the per-order rates (unweighted: every order
# position counts equally, regardless of how many dates it contains).
trend_x = order_stats_filtered['order']
trend_y = order_stats_filtered['dec_percentage']
order_variance = trend_x.var()
# The variance is NaN with fewer than two points and 0 with a single distinct
# order; in both cases no slope can be estimated, so the trend is skipped.
if order_variance > 0:
    slope = trend_x.cov(trend_y) / order_variance
    intercept = trend_y.mean() - slope * trend_x.mean()
    fig_order.add_trace(
        go.Scatter(
            x=trend_x,
            y=intercept + slope * trend_x,
            mode='lines',
            line=dict(dash='dot', color='red'),
            name='Trend'
        )
    )

fig_order.update_layout(xaxis=dict(tickmode='linear', tick0=1, dtick=1), yaxis_range=[30, 55])
fig_order.show()
fig_order.write_html(os.path.join(ASSETS_DIR, "B10-line_chart-The_Order_Effect_First_vs_Last.html"))


### Visualisation Interpretations: Self-Perception & Order Dynamics
* **The Overconfidence Effect (B09):** Across every single attribute, the average self-rating is significantly higher than the average rating received from partners. This superiority illusion is most extreme in "Intelligence" and "Sincerity".
* **The Order Effect (B10):** A clear downward trend exists in the positive decision rate as the night progresses. Participants approach a 48-50% "Yes" rate on their first dates, dropping sharply toward 35-40% by the 15th date. This mirrors "swipe fatigue" on apps. On average, it is better to be one of someone's first dates than one of their last.

In [9]:
# ------------------------------------------
# TRACK 2: STATED PREFERENCES ANALYSIS
# ------------------------------------------

# NOTE(review): this cell uses pd, px, os and ASSETS_DIR without importing or
# defining them — presumably they come from an earlier cell; confirm on a fresh kernel.
# Track 2 checkpoint (preferences data; per the original note, waves 6-9 are filtered out).
df_pref = pd.read_csv("../data/processed/speed_dating_preferences.csv")

# Add a readable gender column unless the CSV already carries one.
if 'gender_label' not in df_pref.columns:
    df_pref['gender_label'] = df_pref['gender'].map({0: 'Female', 1: 'Male'})

# ------------------------------------------
# 1. Stated Preferences (What People Claim They Want)
# ------------------------------------------
# Survey column -> display label for the six attributes.
pref_cols = {
    'attr1_1': 'ATTR',
    'sinc1_1': 'SINC',
    'intel1_1': 'INTEL',
    'fun1_1': 'FUN',
    'amb1_1': 'AMB',
    'shar1_1': 'SHAR',
}

# Mean points allocated to each attribute, per gender, with display labels as columns.
mean_prefs = (
    df_pref
    .groupby('gender_label')[list(pref_cols)]
    .mean()
    .reset_index()
    .rename(columns=pref_cols)
)

# Long format: one row per (gender, attribute).
melted_prefs = mean_prefs.melt(
    id_vars=['gender_label'],
    value_vars=list(pref_cols.values()),
    var_name='Attribute',
    value_name='Average Points Allocated',
)

# Fix the attribute order on the x-axis, then sort rows to match it.
melted_prefs['Attribute'] = pd.Categorical(
    melted_prefs['Attribute'],
    categories=['ATTR', 'SINC', 'INTEL', 'FUN', 'AMB', 'SHAR'],
    ordered=True,
)
melted_prefs = melted_prefs.sort_values(['Attribute', 'gender_label'])

fig_pref1 = px.bar(
    melted_prefs,
    x='Attribute',
    y='Average Points Allocated',
    color='gender_label',
    barmode='group',
    color_discrete_map={'Female': '#EF553B', 'Male': '#00CC96'},
    text=melted_prefs['Average Points Allocated'].map('{:.1f}'.format),
    title='Stated Preferences: What People Claim They Want (100-Point Scale)',
)
fig_pref1.update_traces(textposition='outside')
fig_pref1.update_layout(
    yaxis_range=[0, 40],
    xaxis_title="Partner Attribute",
    yaxis_title="Mean Points (out of 100)",
)
fig_pref1.show()

stated_prefs_path = os.path.join(ASSETS_DIR, "P01-bar_chart-Stated_Preferences.html")
fig_pref1.write_html(stated_prefs_path)

# ==========================================
# 2. Perception Mismatch (What I Want vs What You THINK I Want)
# ==========================================
# Series 4_1: What you think the opposite sex looks for
guess_cols = {
    'attr4_1': 'ATTR', 'sinc4_1': 'SINC', 'intel4_1': 'INTEL',
    'fun4_1': 'FUN', 'amb4_1': 'AMB', 'shar4_1': 'SHAR'
}

# Calculate what men THINK women want, and what women THINK men want
mean_guesses = (
    df_pref.groupby('gender_label')[list(guess_cols.keys())]
    .mean()
    .reset_index()
    .rename(columns=guess_cols)
)

melted_guesses = pd.melt(
    mean_guesses, id_vars=['gender_label'], value_vars=list(guess_cols.values()),
    var_name='Attribute', value_name='Average Points Allocated'
)


def _plot_perception_mismatch(stated, guessed, stated_gender, guesser_gender,
                              stated_label, guess_label, stated_color, guess_color,
                              title, filename):
    """Compare one gender's stated preferences with the other gender's guess.

    The women and men charts used to be two copy-pasted pipelines; this helper
    holds the shared steps so both charts stay in sync.

    Parameters:
        stated: long-format stated preferences with 'gender_label', 'Attribute'
            and 'Average Points Allocated' columns.
        guessed: long-format guesses (series 4_1) with the same columns.
        stated_gender: gender_label whose stated preferences are shown.
        guesser_gender: gender_label whose guesses are shown next to them.
        stated_label, guess_label: legend entries for the two bar groups.
        stated_color, guess_color: bar colours for the two groups.
        title: chart title.
        filename: name of the HTML file written into ASSETS_DIR.

    Returns:
        (mismatch, fig): the combined frame that was plotted and the figure.
        The figure is also displayed and written to disk.
    """
    actual = stated[stated['gender_label'] == stated_gender].copy()
    actual['Perspective'] = stated_label

    guess = guessed[guessed['gender_label'] == guesser_gender].copy()
    guess['Perspective'] = guess_label

    mismatch = pd.concat([actual, guess])
    # Re-apply the attribute order: concat of a categorical and a plain column loses it.
    mismatch['Attribute'] = pd.Categorical(
        mismatch['Attribute'],
        categories=['ATTR', 'SINC', 'INTEL', 'FUN', 'AMB', 'SHAR'],
        ordered=True,
    )
    mismatch = mismatch.sort_values('Attribute')

    fig = px.bar(
        mismatch, x='Attribute', y='Average Points Allocated', color='Perspective',
        barmode='group', title=title,
        color_discrete_map={stated_label: stated_color, guess_label: guess_color},
        text=mismatch['Average Points Allocated'].apply(lambda x: f'{x:.1f}')
    )
    fig.update_traces(textposition='outside')
    fig.update_layout(yaxis_range=[0, 45], xaxis_title="Attribute", yaxis_title="Mean Points (out of 100)")
    fig.show()
    fig.write_html(os.path.join(ASSETS_DIR, filename))
    return mismatch, fig


# Compare Female Stated (1_1) vs Male Guess (4_1)
mismatch_women, fig_mismatch_w = _plot_perception_mismatch(
    melted_prefs, melted_guesses,
    stated_gender='Female', guesser_gender='Male',
    stated_label="What Women Actually Want", guess_label="What Men THINK Women Want",
    stated_color='#EF553B', guess_color='#FECB52',
    title="Perception Mismatch: Decoding Women's Preferences",
    filename="P02-bar_chart-Perception_Mismatch_Women.html",
)

# Compare Male Stated (1_1) vs Female Guess (4_1)
mismatch_men, fig_mismatch_m = _plot_perception_mismatch(
    melted_prefs, melted_guesses,
    stated_gender='Male', guesser_gender='Female',
    stated_label="What Men Actually Want", guess_label="What Women THINK Men Want",
    stated_color='#00CC96', guess_color='#636EFA',
    title="Perception Mismatch: Decoding Men's Preferences",
    filename="P03-bar_chart-Perception_Mismatch_Men.html",
)

### Visualisation Interpretations: Preferences & Market Perception
* **Stated Preferences:** Men heavily prioritize physical attractiveness in their stated profiles, whereas women claim to balance priorities across intelligence, sincerity, and attractiveness.
* **Decoding Preferences (Perception Mismatch):** Information asymmetry plagues the dating market. Men significantly overestimate how much women care about physical attractiveness and underestimate how much they value intelligence. Conversely, women accurately predict that men prioritize physical attractiveness, but wildly overestimate its point value. Both genders suffer from stereotyping, leading to poorly optimized profiles.

In [17]:
# ==========================================
# TRACK 1 & 2: Stated vs. Revealed Preferences (with normalization)
# ==========================================
import pandas as pd
import plotly.express as px
import os

# Export directory for the HTML versions of the figures.
ASSETS_DIR = "../assets"
os.makedirs(ASSETS_DIR, exist_ok=True)

# Reload both checkpoints so the cell does not depend on frames from other cells.
df_pref = pd.read_csv("../data/processed/speed_dating_preferences.csv")
df_behavior = pd.read_csv("../data/processed/speed_dating_behavior.csv")

# Add readable gender labels to whichever frame lacks them.
for frame in (df_pref, df_behavior):
    if 'gender_label' not in frame.columns:
        frame['gender_label'] = frame['gender'].map({0: 'Female', 1: 'Male'})

attributes = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']
attr_labels = ['ATTR', 'SINC', 'INTEL', 'FUN', 'AMB', 'SHAR']

# Step 1: stated preferences — mean points (out of 100) per gender, long format.
pref_cols = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
stated = (
    df_pref
    .groupby('gender_label')[pref_cols]
    .mean()
    .reset_index()
    .rename(columns=dict(zip(pref_cols, attr_labels)))
)
stated_melted = stated.melt(
    id_vars='gender_label', var_name='Attribute', value_name='Score'
).assign(Metric='Stated (What they SAY they want)')

# Step 2: revealed behaviour — per gender, each attribute's correlation with the
# decision, rescaled so that the six correlations of a gender add up to 100.
# NOTE(review): the rescaling divides by the plain sum of the correlations, so it
# presumably assumes they are all positive — confirm for this data.
revealed_data = []
for gender in ['Female', 'Male']:
    subset = df_behavior[df_behavior['gender_label'] == gender]
    corrs = [subset[attr].corr(subset['dec']) for attr in attributes]
    total_corr = sum(corrs)
    revealed_data.extend(
        {'gender_label': gender, 'Attribute': attr_label, 'Score': (c / total_corr) * 100}
        for attr_label, c in zip(attr_labels, corrs)
    )

revealed_df = pd.DataFrame(revealed_data)
revealed_df['Metric'] = 'Revealed (What ACTUALLY drives "Yes")'

# Step 3: stack both views and fix the attribute order for plotting.
combined_df = pd.concat([stated_melted, revealed_df])
combined_df['Attribute'] = pd.Categorical(
    combined_df['Attribute'], categories=attr_labels, ordered=True
)
combined_df = combined_df.sort_values(['Attribute', 'Metric'])

# Women: stated points vs. normalized correlation share, side by side per attribute.
women_df = combined_df[combined_df['gender_label'].eq('Female')]
fig_w = px.bar(
    women_df,
    x='Attribute',
    y='Score',
    color='Metric',
    barmode='group',
    color_discrete_map={
        'Stated (What they SAY they want)': '#EF553B',
        'Revealed (What ACTUALLY drives "Yes")': '#3366CC',
    },
    text=women_df['Score'].map('{:.1f}'.format),
    title="Women: Stated vs. Revealed Preferences (100-Point Equivalent)",
)
fig_w.update_traces(textposition='outside')
fig_w.update_layout(yaxis_title="Relative Importance (%)", yaxis_range=[0, 45])
fig_w.show()
women_chart_path = os.path.join(ASSETS_DIR, "P04-bar_chart-Stated_vs_Revealed_Women.html")
fig_w.write_html(women_chart_path)

# Men: same comparison with the male colour scheme.
men_df = combined_df[combined_df['gender_label'].eq('Male')]
fig_m = px.bar(
    men_df,
    x='Attribute',
    y='Score',
    color='Metric',
    barmode='group',
    color_discrete_map={
        'Stated (What they SAY they want)': '#00CC96',
        'Revealed (What ACTUALLY drives "Yes")': '#FF9900',
    },
    text=men_df['Score'].map('{:.1f}'.format),
    title="Men: Stated vs. Revealed Preferences (100-Point Equivalent)",
)
fig_m.update_traces(textposition='outside')
fig_m.update_layout(yaxis_title="Relative Importance (%)", yaxis_range=[0, 45])
fig_m.show()
men_chart_path = os.path.join(ASSETS_DIR, "P05-bar_chart-Stated_vs_Revealed_Men.html")
fig_m.write_html(men_chart_path)

### Visualisation Interpretations: Stated vs. Revealed Preferences
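* **The Reality of Attractiveness:** Women *state* that physical attractiveness makes up only 18% of their criteria, yet it accounts for roughly 27.5% of the normalized attribute–decision correlation weight (its revealed relative importance). Men state it at 26.9%, while their revealed share is 31.7%. These revealed figures are shares of correlation strength, not percentages of decisions.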
* **The Sincerity & Intelligence Illusion:** Both genders allocate roughly 20% of their stated preferences to Sincerity and Intelligence. However, in actual 4-minute dates, these traits accounted for less than 12% of the revealed (normalized-correlation) importance behind positive decisions.
* **Business Conclusion for Tinder:** Users aspire to select for deep personality traits, but primal mechanics revert almost entirely to physical attractiveness and immediate charisma. The platform's UI/UX should heavily prioritize high-quality photos and punchy bios rather than long-form personality questionnaires.