In [None]:
import os
import pandas as pd
import numpy as np

# Two-track preprocessing: load the raw CSV, drop sparse columns, apply a
# baseline imputation, and export two checkpoints (all waves / waves 6-9 removed).

# 1. Load the raw data
RAW_DATA_PATH = "../data/raw/speed_dating_data.csv"
df = pd.read_csv(RAW_DATA_PATH, encoding="latin1")

# 2. Define the structural, target, and in-the-moment rating columns to protect
#    from the sparsity filter below (they are kept however sparse they are).
MUST_KEEP = [
    'iid', 'pid', 'wave', 'gender', 'match', 'dec', 'dec_o',
    'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 'like', 'prob'
]

# Identifier and outcome columns must never be imputed: a median "participant id"
# or a median "decision" is a fabricated value, not an estimate. Any gaps in
# these columns are deliberately left as NaN in the exported checkpoints.
NEVER_IMPUTE = ['iid', 'pid', 'wave', 'gender', 'match', 'dec', 'dec_o']

# 3. Apply Sparsity Filter (50% threshold)
SPARSITY_THRESHOLD_PCT = 50.0
missing_percent = df.isnull().mean() * 100
initial_drop_candidates = missing_percent[missing_percent > SPARSITY_THRESHOLD_PCT].index.tolist()
cols_to_drop = [col for col in initial_drop_candidates if col not in MUST_KEEP]
df_cleaned = df.drop(columns=cols_to_drop)

# 4. Baseline Imputation (Median for numeric, Mode for categorical)
#    Only non-identifier, non-outcome columns are eligible.
imputable_cols = [col for col in df_cleaned.columns if col not in NEVER_IMPUTE]
numeric_cols = df_cleaned[imputable_cols].select_dtypes(include=['number']).columns
categorical_cols = df_cleaned[imputable_cols].select_dtypes(exclude=['number']).columns

num_imputation = df_cleaned[numeric_cols].median()
df_cleaned[numeric_cols] = df_cleaned[numeric_cols].fillna(num_imputation)

# mode() returns an empty frame when there are no categorical columns (or none
# of them has a mode); guard so .iloc[0] cannot raise IndexError in that case.
cat_modes = df_cleaned[categorical_cols].mode()
if not cat_modes.empty:
    cat_imputation = cat_modes.iloc[0]
    df_cleaned[categorical_cols] = df_cleaned[categorical_cols].fillna(cat_imputation)

# 5. Create Track 1: Behavioral Data (All Waves)
# This dataset is used for actual date dynamics, match rates, and in-the-moment ratings
df_behavior = df_cleaned.copy()

# 6. Create Track 2: Stated Preferences (Excluding Waves 6-9)
# This dataset ensures the 100-point allocation scale is mathematically consistent
PREFERENCE_EXCLUDED_WAVES = [6, 7, 8, 9]
df_preferences = df_cleaned[~df_cleaned['wave'].isin(PREFERENCE_EXCLUDED_WAVES)].copy()

# 7. Export the checkpoints
PROCESSED_DATA_DIR = "../data/processed"
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

BEHAVIOR_PATH = os.path.join(PROCESSED_DATA_DIR, "speed_dating_behavior.csv")
PREF_PATH = os.path.join(PROCESSED_DATA_DIR, "speed_dating_preferences.csv")

df_behavior.to_csv(BEHAVIOR_PATH, index=False)
df_preferences.to_csv(PREF_PATH, index=False)

print("Two-track processing complete.")
print(f"Track 1 (Behavior - All Waves) shape: {df_behavior.shape} -> Saved to {BEHAVIOR_PATH}")
print(f"Track 2 (Preferences - Filtered) shape: {df_preferences.shape} -> Saved to {PREF_PATH}")

Two-track processing complete.
Track 1 (Behavior - All Waves) shape: (8378, 136) -> Saved to ../data/processed\speed_dating_behavior.csv
Track 2 (Preferences - Filtered) shape: (6816, 136) -> Saved to ../data/processed\speed_dating_preferences.csv


In [5]:
# `os` is imported here as well: this cell calls os.makedirs / os.path.join and
# would otherwise raise NameError unless an earlier cell had already imported it.
import os

import pandas as pd
import plotly.express as px

# 1. Setup Export Directory
ASSETS_DIR = "../assets"
os.makedirs(ASSETS_DIR, exist_ok=True)

# Load the Track 1 checkpoint (Behavioral Data)
df_behavior = pd.read_csv("../data/processed/speed_dating_behavior.csv")

# Define the in-the-moment rating attributes
attributes = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']

# Map gender for readability if not already mapped
if 'gender_label' not in df_behavior.columns:
    df_behavior['gender_label'] = df_behavior['gender'].map({0: 'Female', 1: 'Male'})

# Calculate correlations between attributes and the decision ('dec') for each gender.
# Each record is the Pearson correlation between one attribute rating and the
# binary decision, computed within one gender subset.
correlations = []
for gender in ['Female', 'Male']:
    subset = df_behavior[df_behavior['gender_label'] == gender]
    for attr in attributes:
        correlations.append({
            'Gender': gender,
            'Attribute': attr.upper(),
            'Correlation with Decision': subset[attr].corr(subset['dec'])
        })

corr_df = pd.DataFrame(correlations)

# Sort attributes to ensure a clean, sorted visualization:
# order by the average correlation across both genders, strongest first.
order = (
    corr_df.groupby('Attribute')['Correlation with Decision']
    .mean()
    .sort_values(ascending=False)
    .index
)
corr_df['Attribute'] = pd.Categorical(corr_df['Attribute'], categories=order, ordered=True)
corr_df = corr_df.sort_values('Attribute')

# Visualize the correlations using a grouped bar chart
fig_attr = px.bar(
    corr_df,
    x='Attribute',
    y='Correlation with Decision',
    color='Gender',
    barmode='group',
    title='Impact of Partner Attributes on the Decision to "Swipe Right"',
    labels={
        'Attribute': 'Rated Attribute',
        'Correlation with Decision': 'Correlation coefficient (r)'
    },
    color_discrete_map={'Female': '#EF553B', 'Male': '#00CC96'},
    text=corr_df['Correlation with Decision'].apply(lambda x: f'{x:.2f}')
)

fig_attr.update_traces(textposition='outside')
fig_attr.update_layout(yaxis_range=[0, 0.6], xaxis_title="Partner Attribute", yaxis_title="Correlation with Positive Decision")
fig_attr.show()

# Persist an interactive copy of the chart next to the other exported assets.
fig_attr.write_html(os.path.join(ASSETS_DIR, "Impact_of_Partner_Attributes.html"))

### Visualisation Interpretations: Attribute Desirability

* **Most Desirable Attributes:** For both genders, **Attractiveness (ATTR)** and **Fun (FUN)** have the strongest positive correlation with the decision to go on a second date. If a partner is rated highly in these categories, a "Yes" decision is highly probable. 
* **Gender Differences in Attractiveness:** The correlation between physical attractiveness and the final decision is noticeably higher for men evaluating women, confirming that visual appeal drives male dating decisions more heavily than female decisions.
* **Least Desirable Attributes:** **Sincerity (SINC)** and **Ambition (AMB)** have the lowest correlation with a positive decision for both genders. While these traits might be valued in long-term relationships, in the context of a 4-minute speed date (or a Tinder profile), they are the least impactful drivers for securing a match.

In [3]:
import pandas as pd
import plotly.express as px

# Load the Track 2 checkpoint (Preferences Data - Waves 6-9 filtered out)
df_pref = pd.read_csv("../data/processed/speed_dating_preferences.csv")

# Ensure gender label exists
if 'gender_label' not in df_pref.columns:
    df_pref['gender_label'] = df_pref['gender'].map({0: 'Female', 1: 'Male'})

# Define the Time 1 stated preference columns
# "What you look for in the opposite sex" at Time 1
pref_cols = {
    'attr1_1': 'ATTR',
    'sinc1_1': 'SINC',
    'intel1_1': 'INTEL',
    'fun1_1': 'FUN',
    'amb1_1': 'AMB',
    'shar1_1': 'SHAR'
}

# The checkpoint holds one row per date, so a participant's Time 1 answers are
# repeated on every one of their rows. Collapse to one row per participant
# before averaging; otherwise people who went on more dates are over-weighted.
participant_prefs = df_pref.drop_duplicates(subset='iid')

# Group by gender and calculate the mean points allocated out of 100,
# then rename the survey columns to their display labels.
mean_prefs = (
    participant_prefs.groupby('gender_label')[list(pref_cols.keys())]
    .mean()
    .reset_index()
    .rename(columns=pref_cols)
)

# Melt the dataframe for Plotly Express compatibility
melted_prefs = pd.melt(
    mean_prefs,
    id_vars=['gender_label'],
    value_vars=list(pref_cols.values()),
    var_name='Attribute',
    value_name='Average Points Allocated (out of 100)'
)

# Sort attributes to match previous charts for consistency
melted_prefs['Attribute'] = pd.Categorical(melted_prefs['Attribute'], categories=['ATTR', 'SINC', 'INTEL', 'FUN', 'AMB', 'SHAR'], ordered=True)
melted_prefs = melted_prefs.sort_values(['Attribute', 'gender_label'])

# Visualize the stated preferences
fig_pref = px.bar(
    melted_prefs,
    x='Attribute',
    y='Average Points Allocated (out of 100)',
    color='gender_label',
    barmode='group',
    title='Stated Preferences: What People Claim They Want (100-Point Allocation)',
    labels={'gender_label': 'Gender', 'Attribute': 'Attribute'},
    color_discrete_map={'Female': '#EF553B', 'Male': '#00CC96'},
    text=melted_prefs['Average Points Allocated (out of 100)'].apply(lambda x: f'{x:.1f}')
)

fig_pref.update_traces(textposition='outside')
fig_pref.update_layout(yaxis_range=[0, 40], xaxis_title="Partner Attribute", yaxis_title="Mean Points (out of 100)")
fig_pref.show()

### Visualisation Interpretations: Stated vs. Revealed Preferences

* **The Behavioral Gap:** Comparing these stated preferences against the previous correlation chart reveals a significant discrepancy between what people *say* they want and how they *act*. 
* **Intelligence & Sincerity:** Both men and women claim that Intelligence (INTEL) and Sincerity (SINC) are highly important to them, allocating a large portion of their 100 points to these traits. However, the Phase 6 correlation analysis showed that these traits have a relatively low impact on actually securing a "Yes" decision.
* **Attractiveness (The Reality):** Men accurately state that attractiveness is their primary criterion (allocating ~27 points on average). Women, however, claim that attractiveness is less important than intelligence or sincerity (~18 points). Yet the behavioral correlation showed that, for women, attractiveness is still the strongest driver for a second date, equal to or greater than fun. 
* **Conclusion:** People are notoriously bad at predicting their own mating criteria. In a Tinder-like environment (rapid evaluation), visual attractiveness and immediate fun heavily override long-term traits like sincerity and ambition, regardless of what users claim on their profiles.

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load Track 1 checkpoint (Behavioral Data)
df_behavior = pd.read_csv("../data/processed/speed_dating_behavior.csv")

# ==========================================
# PHASE 8: Self-Perception vs. Market Value
# ==========================================

# 1. Extract Self-Ratings per Individual (Time 1 surveys ending in '3_1')
self_ratings_cols = ['iid', 'attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1']
# Self-ratings are static per individual across all their dates, so keep exactly
# one row per `iid`; this also guarantees the merge below is one-to-one.
self_ratings = df_behavior[self_ratings_cols].drop_duplicates(subset='iid')
self_ratings.columns = ['iid', 'ATTR_self', 'SINC_self', 'INTEL_self', 'FUN_self', 'AMB_self']

# 2. Calculate Average Received Ratings per Individual (Market Value)
# On each row, attr/sinc/intel/fun/amb are the scores the row's participant
# (`iid`) gave to their partner, so grouping by `iid` would measure ratings
# GIVEN. The ratings a person RECEIVED are the rows where they are the partner,
# hence the grouping key is `pid`.
# NOTE(review): assumes `pid` holds the partner's `iid`, per the dataset's data
# key - confirm against the codebook.
rated_attributes = ['attr', 'sinc', 'intel', 'fun', 'amb']
received_ratings = (
    df_behavior.dropna(subset=['pid'])
    .groupby('pid')[rated_attributes]
    .mean()
    .rename_axis('iid')
    .reset_index()
)
received_ratings.columns = ['iid', 'ATTR_received', 'SINC_received', 'INTEL_received', 'FUN_received', 'AMB_received']

# `pid` may be stored as float (or contain placeholder values); keep only ids that
# match a real participant and align the key dtype with `iid` for a clean merge.
received_ratings = received_ratings[received_ratings['iid'].isin(self_ratings['iid'])]
received_ratings = received_ratings.assign(
    iid=received_ratings['iid'].astype(self_ratings['iid'].dtype)
)

# 3. Merge and Calculate Global Averages for Comparison
perception_df = pd.merge(self_ratings, received_ratings, on='iid')
mean_comparison = perception_df.drop(columns='iid').mean().reset_index()
mean_comparison.columns = ['Metric', 'Average Score (1-10)']

# Parse the metric names to separate the Attribute from the Perspective (Self vs Received)
mean_comparison['Attribute'] = mean_comparison['Metric'].apply(lambda x: x.split('_')[0])
mean_comparison['Perspective'] = mean_comparison['Metric'].apply(lambda x: 'Self-Rating' if 'self' in x else 'Received Rating')

# 4. Visualize Self-Perception vs. Reality
fig_perception = px.bar(
    mean_comparison,
    x='Attribute',
    y='Average Score (1-10)',
    color='Perspective',
    barmode='group',
    title='Self-Perception vs. Reality: Do people know their market value?',
    color_discrete_map={'Self-Rating': '#636EFA', 'Received Rating': '#FECB52'},
    text=mean_comparison['Average Score (1-10)'].apply(lambda x: f'{x:.1f}')
)
fig_perception.update_traces(textposition='outside')
fig_perception.update_layout(yaxis_range=[0, 10])
fig_perception.show()


# ==========================================
# PHASE 9: The Order Effect (First vs. Last)
# ==========================================

# 1. Calculate the 'Yes' decision rate per date order
order_stats = df_behavior.groupby('order')['dec'].mean().reset_index()
order_stats['dec_percentage'] = order_stats['dec'] * 100

# Some participants had up to 22 dates, but sample size drops heavily after ~15.
# Restrict the chart to the first 15 positions so every plotted rate is backed
# by a reasonable number of dates.
MAX_ORDER_SHOWN = 15
order_stats_filtered = order_stats[order_stats['order'] <= MAX_ORDER_SHOWN]

# 2. Visualize the Order Effect
fig_order = px.line(
    order_stats_filtered,
    x='order',
    y='dec_percentage',
    markers=True,
    title='The Order Effect: Does the sequence of the date matter?',
    labels={'order': 'Date Chronological Order (1st, 2nd, 3rd...)', 'dec_percentage': 'Probability of a "Yes" Decision (%)'}
)

# Add a trendline to make the drop-off clear.
# This is an ordinary least-squares line fitted to the per-position rates
# (slope = cov(x, y) / var(x)); each position counts equally in the fit.
# Re-plotting the raw series here would only duplicate the line above.
order_x = order_stats_filtered['order']
rate_y = order_stats_filtered['dec_percentage']
if order_x.nunique() > 1:  # a slope is undefined with fewer than two distinct positions
    slope = order_x.cov(rate_y) / order_x.var()
    intercept = rate_y.mean() - slope * order_x.mean()
    fig_order.add_trace(
        go.Scatter(
            x=order_x,
            y=intercept + slope * order_x,
            mode='lines',
            line=dict(dash='dot', color='red'),
            name='Trend'
        )
    )

fig_order.update_layout(xaxis=dict(tickmode='linear', tick0=1, dtick=1), yaxis_range=[30, 55])
fig_order.show()

### Visualisation Interpretations: Self-Perception & Order Dynamics

* **The Overconfidence Effect (Self-Perception):** People are generally terrible at predicting their own value. Across every single attribute, the average self-rating is significantly higher than the average rating received from partners. This is most extreme in "Intelligence" (where people rate themselves nearly an 8.5/10, but receive a 7.3/10) and "Sincerity". Participants consistently exhibit a superiority illusion.
* **The Order Effect (First vs. Last Date):** The line chart reveals a clear downward trend in the "Yes" decision rate as the night progresses. 
    * **First Date Advantage:** Participants are most likely to say "Yes" during their very first dates of the evening (approaching a 48-50% positive decision rate).
    * **Decision Fatigue:** By the 15th date, the positive decision rate drops sharply toward 35-40%. 
    * **Tinder Translation:** In dating app mechanics, this mirrors "swipe fatigue." Users are more optimistic and liberal with right-swipes at the beginning of a session, but become progressively more critical (or exhausted) as they review more profiles. The data therefore suggest it is advantageous to be among someone's first dates of the night.