In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
df_ext = pd.read_csv('../data/raw/dating_app_behavior_dataset_extended1.csv')

In [3]:
df = df_ext.copy()

In [4]:
df['match_outcome'].unique()

array(['Mutual Match', 'Chat Ignored', 'Date Happened', 'No Action',
       'One-sided Like', 'Blocked', 'Relationship Formed', 'Catfished',
       'Ghosted', 'Instant Match'], dtype=object)

Positive: -> 1
- Mutual Match
- Date Happened
- Relationship Formed
- Instant Match

Negative:-> 0
- No Action
- One-sided Like
- Chat Ignored
- Ghosted
- Blocked
- Catfished

In [5]:
# change target to numerical
df['target_binary'] = df['match_outcome'].isin([
    'Mutual Match',
    'Date Happened',
    'Relationship Formed',
    'Instant Match'
]).astype(int)

In [6]:
df['target_binary'].value_counts(normalize=True)

target_binary
0    0.603
1    0.397
Name: proportion, dtype: float64

**Target**
match_outcome   → categorical target
target_binary   → binary target（0/1）

**Features**

1.Numerical -> StandardScaler

app_usage_time_min
swipe_right_ratio
likes_received
mutual_matches
profile_pics_count
bio_length
message_sent_count
emoji_usage_rate
last_active_hour
age
height_cm
weight_kg

2.Ordinal -> Ordinal Encoding

income_bracket
education_level
app_usage_time_label
swipe_right_label

3. Nominal -> One-Hot Encoding

gender
sexual_orientation
location_type
swipe_time_of_day
zodiac_sign
body_type
relationship_intent
interest_tags


In [7]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [8]:
numerical_cols

['app_usage_time_min',
 'swipe_right_ratio',
 'likes_received',
 'mutual_matches',
 'profile_pics_count',
 'bio_length',
 'message_sent_count',
 'emoji_usage_rate',
 'last_active_hour',
 'age',
 'height_cm',
 'weight_kg',
 'target_binary']

In [9]:
#VIF methond
X = df[numerical_cols].copy()

In [10]:
X = X.drop(columns=['target_binary'])

In [16]:
from statsmodels.tools.tools import add_constant

X_const = add_constant(X)

In [15]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [14]:
vif_df = pd.DataFrame()
vif_df["feature"] = X_const.columns
vif_df["VIF"] = [
    variance_inflation_factor(X_const.values, i)
    for i in range(X_const.shape[1])
]

vif_df

Unnamed: 0,feature,VIF
0,const,198.174957
1,app_usage_time_min,1.000271
2,swipe_right_ratio,1.000121
3,likes_received,1.044663
4,mutual_matches,1.044624
5,profile_pics_count,1.000733
6,bio_length,1.00018
7,message_sent_count,1.000341
8,emoji_usage_rate,1.000182
9,last_active_hour,1.000451


In [17]:
vif_df[vif_df['VIF'] > 5].sort_values('VIF', ascending=False)

Unnamed: 0,feature,VIF
0,const,198.174957


Variance Inflation Factor (VIF) was calculated to assess multicollinearity among numerical features.
All features showed VIF values close to 1, with the exception of height and weight, which remained well below the commonly used threshold of 5.
This indicates a low level of multicollinearity, suggesting that the features provide largely independent information and that the logistic regression coefficients are stable and interpretable.

In [26]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

In [27]:
categorical_cols = [col for col in categorical_cols if col != 'match_outcome']

In [28]:
from scipy.stats import chi2_contingency

for col in categorical_cols:
    contingency = pd.crosstab(df[col], df['target_binary'])
    chi2, p, dof, expected = chi2_contingency(contingency)

    print(f"{col:25s} p-value = {p:.4e}")

gender                    p-value = 6.4590e-01
sexual_orientation        p-value = 4.8733e-01
location_type             p-value = 6.3417e-01
income_bracket            p-value = 7.2094e-01
education_level           p-value = 1.4193e-01
interest_tags             p-value = 3.5538e-01
app_usage_time_label      p-value = 9.2974e-01
swipe_right_label         p-value = 2.5878e-01
swipe_time_of_day         p-value = 6.5360e-02
zodiac_sign               p-value = 3.3349e-01
body_type                 p-value = 8.1485e-01
relationship_intent       p-value = 3.5005e-01


In [29]:
for col in categorical_cols:
    print(f"\n{col}")
    display(
        pd.crosstab(df[col], df['target_binary'], normalize='index')
    )


gender


target_binary,0,1
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.604127,0.395873
Genderfluid,0.60455,0.39545
Male,0.596002,0.403998
Non-binary,0.609371,0.390629
Prefer Not to Say,0.6012,0.3988
Transgender,0.602658,0.397342



sexual_orientation


target_binary,0,1
sexual_orientation,Unnamed: 1_level_1,Unnamed: 2_level_1
Asexual,0.603614,0.396386
Bisexual,0.593537,0.406463
Demisexual,0.606412,0.393588
Gay,0.597778,0.402222
Lesbian,0.600633,0.399367
Pansexual,0.601732,0.398268
Queer,0.607188,0.392812
Straight,0.612709,0.387291



location_type


target_binary,0,1
location_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Metro,0.603039,0.396961
Remote Area,0.607583,0.392417
Rural,0.60641,0.39359
Small Town,0.599305,0.400695
Suburban,0.596191,0.403809
Urban,0.605378,0.394622



income_bracket


target_binary,0,1
income_bracket,Unnamed: 1_level_1,Unnamed: 2_level_1
High,0.601724,0.398276
Low,0.595345,0.404655
Lower-Middle,0.603681,0.396319
Middle,0.610441,0.389559
Upper-Middle,0.604438,0.395562
Very High,0.601142,0.398858
Very Low,0.604305,0.395695



education_level


target_binary,0,1
education_level,Unnamed: 1_level_1,Unnamed: 2_level_1
Associate’s,0.614839,0.385161
Bachelor’s,0.601665,0.398335
Diploma,0.599229,0.400771
High School,0.605114,0.394886
MBA,0.616108,0.383892
Master’s,0.603126,0.396874
No Formal Education,0.594182,0.405818
PhD,0.592778,0.407222
Postdoc,0.599854,0.400146



interest_tags


target_binary,0,1
interest_tags,Unnamed: 1_level_1,Unnamed: 2_level_1
"Anime, Art, Binge-Watching",0.333333,0.666667
"Anime, Art, Cars",0.500000,0.500000
"Anime, Art, Coding",0.000000,1.000000
"Anime, Art, DIY",0.000000,1.000000
"Anime, Art, Gaming",1.000000,0.000000
...,...,...
"Yoga, Writing, Music",1.000000,0.000000
"Yoga, Writing, Photography",1.000000,0.000000
"Yoga, Writing, Reading",0.000000,1.000000
"Yoga, Writing, Running",0.000000,1.000000



app_usage_time_label


target_binary,0,1
app_usage_time_label,Unnamed: 1_level_1,Unnamed: 2_level_1
Addicted,0.602483,0.397517
Barely,0.593431,0.406569
Extreme User,0.602135,0.397865
High,0.601857,0.398143
Low,0.60355,0.39645
Moderate,0.607121,0.392879
Very Low,0.615527,0.384473



swipe_right_label


target_binary,0,1
swipe_right_label,Unnamed: 1_level_1,Unnamed: 2_level_1
Balanced,0.597101,0.402899
Choosy,0.595002,0.404998
Optimistic,0.606631,0.393369
Swipe Maniac,0.602586,0.397414



swipe_time_of_day


target_binary,0,1
swipe_time_of_day,Unnamed: 1_level_1,Unnamed: 2_level_1
After Midnight,0.60183,0.39817
Afternoon,0.607263,0.392737
Early Morning,0.59375,0.40625
Evening,0.596082,0.403918
Late Night,0.604079,0.395921
Morning,0.615068,0.384932



zodiac_sign


target_binary,0,1
zodiac_sign,Unnamed: 1_level_1,Unnamed: 2_level_1
Aquarius,0.616663,0.383337
Aries,0.608426,0.391574
Cancer,0.605372,0.394628
Capricorn,0.593742,0.406258
Gemini,0.597475,0.402525
Leo,0.61182,0.38818
Libra,0.604673,0.395327
Pisces,0.594788,0.405212
Sagittarius,0.603159,0.396841
Scorpio,0.592495,0.407505



body_type


target_binary,0,1
body_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Athletic,0.601513,0.398487
Average,0.604929,0.395071
Curvy,0.599168,0.400832
Muscular,0.607962,0.392038
Plus Size,0.605321,0.394679
Slim,0.599291,0.400709



relationship_intent


target_binary,0,1
relationship_intent,Unnamed: 1_level_1,Unnamed: 2_level_1
Casual Dating,0.609016,0.390984
Exploring,0.608385,0.391615
Friends Only,0.597073,0.402927
Hookups,0.601844,0.398156
Networking,0.595857,0.404143
Serious Relationship,0.605739,0.394261


Chi-square tests of independence were conducted to examine the relationship between categorical features and the binary match outcome.
Normalized contingency tables were additionally used to explore the direction and magnitude of observed differences.
The results indicate that most demographic and profile-related categorical variables, such as gender, sexual orientation, education level and relationship intent, do not show a statistically significant association with the likelihood of a mutual match.
This suggests that behavioral features may play a more important role than static personal attributes in predicting match outcomes on dating platforms.