In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
df_ext = pd.read_csv('../data/raw/dating_app_behavior_dataset_extended1.csv')

In [3]:
df = df_ext.copy()

In [4]:
df

Unnamed: 0,gender,sexual_orientation,location_type,income_bracket,education_level,interest_tags,app_usage_time_min,app_usage_time_label,swipe_right_ratio,swipe_right_label,...,emoji_usage_rate,last_active_hour,swipe_time_of_day,match_outcome,age,height_cm,weight_kg,zodiac_sign,body_type,relationship_intent
0,Prefer Not to Say,Gay,Urban,High,Bachelor’s,"Fitness, Politics, Traveling",52,Moderate,0.60,Optimistic,...,0.36,13,Early Morning,Mutual Match,56,149,40.6,Taurus,Curvy,Friends Only
1,Male,Bisexual,Suburban,Upper-Middle,No Formal Education,"Languages, Fashion, Parenting",279,Extreme User,0.56,Optimistic,...,0.42,0,Morning,Chat Ignored,40,155,69.7,Leo,Plus Size,Hookups
2,Non-binary,Pansexual,Suburban,Low,Master’s,"Movies, Reading, DIY",49,Moderate,0.41,Optimistic,...,0.41,1,After Midnight,Date Happened,30,185,96.9,Sagittarius,Curvy,Serious Relationship
3,Genderfluid,Gay,Metro,Very Low,Postdoc,"Coding, Podcasts, History",185,Extreme User,0.32,Balanced,...,0.07,21,Morning,No Action,57,154,49.3,Taurus,Slim,Exploring
4,Male,Bisexual,Urban,Middle,Bachelor’s,"Clubbing, Podcasts, Cars",83,High,0.32,Balanced,...,0.11,22,After Midnight,One-sided Like,24,149,40.0,Libra,Slim,Casual Dating
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Transgender,Gay,Metro,Very High,Postdoc,"Gaming, Writing, Painting",130,Addicted,0.69,Optimistic,...,0.19,15,Afternoon,Ghosted,45,195,102.7,Pisces,Plus Size,Casual Dating
49996,Female,Lesbian,Small Town,Low,Diploma,"Traveling, Fitness, Stand-up Comedy",277,Extreme User,0.43,Optimistic,...,0.23,2,Evening,Ghosted,26,185,79.7,Leo,Athletic,Friends Only
49997,Male,Bisexual,Remote Area,High,High School,"Gaming, Motorcycling, Art",73,High,0.50,Optimistic,...,0.36,5,After Midnight,Blocked,45,176,69.1,Pisces,Athletic,Serious Relationship
49998,Transgender,Queer,Urban,Low,MBA,"Skating, Astrology, Hiking",100,High,0.34,Balanced,...,0.36,16,Evening,One-sided Like,29,171,83.6,Libra,Athletic,Friends Only


In [5]:
# change target to numerical
df['target_binary'] = df['match_outcome'].isin([
    'Mutual Match',
    'Date Happened',
    'Relationship Formed',
    'Instant Match'
]).astype(int)

In [6]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [7]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

In [8]:
df['income_bracket'].unique()

array(['High', 'Upper-Middle', 'Low', 'Very Low', 'Middle',
       'Lower-Middle', 'Very High'], dtype=object)

In [9]:
df['education_level'].unique()

array(['Bachelor’s', 'No Formal Education', 'Master’s', 'Postdoc',
       'Associate’s', 'High School', 'Diploma', 'PhD', 'MBA'],
      dtype=object)

In [10]:
df['app_usage_time_label'].unique()

array(['Moderate', 'Extreme User', 'High', 'Addicted', 'Barely',
       'Very Low', 'Low'], dtype=object)

In [11]:
df['swipe_right_label'].unique()

array(['Optimistic', 'Balanced', 'Swipe Maniac', 'Choosy'], dtype=object)

In [13]:
income_order = ['Very Low','Low','Lower-Middle','Middle','Upper-Middle','High','Very High']

In [14]:
education_order = ['No Formal Education','High School','Diploma','Associate’s','Bachelor’s','MBA','Master’s','PhD','Postdoc']

In [15]:
usage_order = ['Very Low','Barely','Low', 'Moderate','High','Extreme User','Addicted']

In [16]:
swipe_order = ['Choosy','Balanced','Optimistic','Swipe Maniac']

In [17]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_cols = [
    'income_bracket',
    'education_level',
    'app_usage_time_label',
    'swipe_right_label'
]

encoder = OrdinalEncoder(
    categories=[
        income_order,
        education_order,
        usage_order,
        swipe_order
    ],
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

df[ordinal_cols] = encoder.fit_transform(df[ordinal_cols])

In [25]:
df[ordinal_cols] 

Unnamed: 0,income_bracket,education_level,app_usage_time_label,swipe_right_label
0,5.0,4.0,3.0,2.0
1,4.0,0.0,5.0,2.0
2,1.0,6.0,3.0,2.0
3,0.0,8.0,5.0,1.0
4,3.0,4.0,4.0,1.0
...,...,...,...,...
49995,6.0,8.0,6.0,2.0
49996,1.0,2.0,5.0,2.0
49997,5.0,1.0,4.0,2.0
49998,1.0,5.0,4.0,1.0


In [18]:
df

Unnamed: 0,gender,sexual_orientation,location_type,income_bracket,education_level,interest_tags,app_usage_time_min,app_usage_time_label,swipe_right_ratio,swipe_right_label,...,last_active_hour,swipe_time_of_day,match_outcome,age,height_cm,weight_kg,zodiac_sign,body_type,relationship_intent,target_binary
0,Prefer Not to Say,Gay,Urban,5.0,4.0,"Fitness, Politics, Traveling",52,3.0,0.60,2.0,...,13,Early Morning,Mutual Match,56,149,40.6,Taurus,Curvy,Friends Only,1
1,Male,Bisexual,Suburban,4.0,0.0,"Languages, Fashion, Parenting",279,5.0,0.56,2.0,...,0,Morning,Chat Ignored,40,155,69.7,Leo,Plus Size,Hookups,0
2,Non-binary,Pansexual,Suburban,1.0,6.0,"Movies, Reading, DIY",49,3.0,0.41,2.0,...,1,After Midnight,Date Happened,30,185,96.9,Sagittarius,Curvy,Serious Relationship,1
3,Genderfluid,Gay,Metro,0.0,8.0,"Coding, Podcasts, History",185,5.0,0.32,1.0,...,21,Morning,No Action,57,154,49.3,Taurus,Slim,Exploring,0
4,Male,Bisexual,Urban,3.0,4.0,"Clubbing, Podcasts, Cars",83,4.0,0.32,1.0,...,22,After Midnight,One-sided Like,24,149,40.0,Libra,Slim,Casual Dating,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Transgender,Gay,Metro,6.0,8.0,"Gaming, Writing, Painting",130,6.0,0.69,2.0,...,15,Afternoon,Ghosted,45,195,102.7,Pisces,Plus Size,Casual Dating,0
49996,Female,Lesbian,Small Town,1.0,2.0,"Traveling, Fitness, Stand-up Comedy",277,5.0,0.43,2.0,...,2,Evening,Ghosted,26,185,79.7,Leo,Athletic,Friends Only,0
49997,Male,Bisexual,Remote Area,5.0,1.0,"Gaming, Motorcycling, Art",73,4.0,0.50,2.0,...,5,After Midnight,Blocked,45,176,69.1,Pisces,Athletic,Serious Relationship,0
49998,Transgender,Queer,Urban,1.0,5.0,"Skating, Astrology, Hiking",100,4.0,0.34,1.0,...,16,Evening,One-sided Like,29,171,83.6,Libra,Athletic,Friends Only,0


In [19]:
df['swipe_right_label'].unique()

array([2., 1., 3., 0.])

In [20]:
df['education_level'].unique()

array([4., 0., 6., 8., 3., 1., 2., 7., 5.])

In [22]:
df['app_usage_time_label'].unique()

array([3., 5., 4., 6., 1., 0., 2.])

In [23]:
df['swipe_right_label'].unique()

array([2., 1., 3., 0.])

In [24]:
df

Unnamed: 0,gender,sexual_orientation,location_type,income_bracket,education_level,interest_tags,app_usage_time_min,app_usage_time_label,swipe_right_ratio,swipe_right_label,...,last_active_hour,swipe_time_of_day,match_outcome,age,height_cm,weight_kg,zodiac_sign,body_type,relationship_intent,target_binary
0,Prefer Not to Say,Gay,Urban,5.0,4.0,"Fitness, Politics, Traveling",52,3.0,0.60,2.0,...,13,Early Morning,Mutual Match,56,149,40.6,Taurus,Curvy,Friends Only,1
1,Male,Bisexual,Suburban,4.0,0.0,"Languages, Fashion, Parenting",279,5.0,0.56,2.0,...,0,Morning,Chat Ignored,40,155,69.7,Leo,Plus Size,Hookups,0
2,Non-binary,Pansexual,Suburban,1.0,6.0,"Movies, Reading, DIY",49,3.0,0.41,2.0,...,1,After Midnight,Date Happened,30,185,96.9,Sagittarius,Curvy,Serious Relationship,1
3,Genderfluid,Gay,Metro,0.0,8.0,"Coding, Podcasts, History",185,5.0,0.32,1.0,...,21,Morning,No Action,57,154,49.3,Taurus,Slim,Exploring,0
4,Male,Bisexual,Urban,3.0,4.0,"Clubbing, Podcasts, Cars",83,4.0,0.32,1.0,...,22,After Midnight,One-sided Like,24,149,40.0,Libra,Slim,Casual Dating,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Transgender,Gay,Metro,6.0,8.0,"Gaming, Writing, Painting",130,6.0,0.69,2.0,...,15,Afternoon,Ghosted,45,195,102.7,Pisces,Plus Size,Casual Dating,0
49996,Female,Lesbian,Small Town,1.0,2.0,"Traveling, Fitness, Stand-up Comedy",277,5.0,0.43,2.0,...,2,Evening,Ghosted,26,185,79.7,Leo,Athletic,Friends Only,0
49997,Male,Bisexual,Remote Area,5.0,1.0,"Gaming, Motorcycling, Art",73,4.0,0.50,2.0,...,5,After Midnight,Blocked,45,176,69.1,Pisces,Athletic,Serious Relationship,0
49998,Transgender,Queer,Urban,1.0,5.0,"Skating, Astrology, Hiking",100,4.0,0.34,1.0,...,16,Evening,One-sided Like,29,171,83.6,Libra,Athletic,Friends Only,0
