In [9]:
import pandas as pd

In [10]:
profiles = pd.read_csv("./data/profiles.csv")
viewers = pd.read_csv("./data/viewers.csv")
interactions = pd.read_csv("./data/interactions.csv", names=['datetime', 'viewer_id', 'viewer_name', 'profile_id','profile_name', 'status', 'score'])

### Feature

In [11]:
interactions['target'] = interactions['status'].apply(lambda r: 1 if r.lower() in ['like', 'superlike', 'message', 'match'] else 0)

In [12]:
data = interactions.merge(viewers, how='left', on='viewer_id', suffixes=('', '_viewer'))

In [13]:
profiles = profiles.rename(columns={
    'id': 'profile_id'
})

In [14]:
data = data.merge(profiles, how='left', on='profile_id', suffixes=('', '_profile'))

In [15]:
import numpy as np

In [16]:
data['age_diff'] = np.abs(data['age'] - data['age_profile'])

In [17]:
data['age_diff_norm'] = 1 - (data['age_diff'] / 20).clip(0, 1)

In [18]:
data['same_city'] = (data['city'].str.lower() == data['city_profile'].str.lower()).astype(int)

In [19]:
data['distance_km'] = pd.to_numeric(data['distance_km'], errors='coerce').fillna(9999)

In [20]:
data['distance_score'] = np.exp(-data['distance_km'] / 50)

In [21]:
data['top_interests'].unique()

array([nan, '["Startups", "Board Games", "Tech"]'], dtype=object)

In [22]:
import ast

def safe_parse_list(x):
    try:
        if pd.isna(x):
            return []
        if isinstance(x, list):
            return x
        return ast.literal_eval(x)
    except Exception:
        return []


In [23]:
data['top_interests_list'] = data['top_interests'].apply(safe_parse_list)
data['interests_list'] = data['interests'].apply(safe_parse_list)

In [24]:
def jaccard_similarity(list1, list2):
    set1, set2 = set(list1), set(list2)
    if not set1 or not set2:
        return 0
    return len(set1 & set2) / len(set1 | set2)

data['interest_overlap'] = data.apply(
    lambda row: jaccard_similarity(row['top_interests_list'], row['interests_list']),
    axis=1
)

In [25]:
data['seeking_list'] = data['seeking'].apply(safe_parse_list)

In [26]:
def gender_match(row):
    gender = row.get('gender')
    seeking = row.get('seeking_list')

    if not isinstance(seeking, list):
        seeking = []
    if not isinstance(gender, str):
        return 0

    gender = gender.strip().lower()
    seeking = [s.lower().strip() for s in seeking if isinstance(s, str)]

    return 1 if gender in seeking else 0

data['gender_match'] = data.apply(gender_match, axis=1)

In [27]:
data.columns

Index(['datetime', 'viewer_id', 'viewer_name', 'profile_id', 'profile_name',
       'status', 'score', 'target', 'name', 'age', 'city', 'seeking',
       'age_min', 'age_max', 'top_interests', 'w_age', 'w_distance',
       'w_interests', 'created_at', 'updated_at', 'name_profile',
       'age_profile', 'gender', 'region', 'country', 'city_profile',
       'distance_km', 'ethnicity', 'languages', 'religion', 'interests',
       'about', 'photo_url', 'age_diff', 'age_diff_norm', 'same_city',
       'distance_score', 'top_interests_list', 'interests_list',
       'interest_overlap', 'seeking_list', 'gender_match'],
      dtype='object')

## Feature Encoding

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb_features = []
mlb = MultiLabelBinarizer(sparse_output=False)
for col in ['top_interests_list','interests_list','seeking_list']:
    if col in data.columns:
        transformed = mlb.fit_transform(data[col])
        temp_df = pd.DataFrame(transformed, columns=[f"{col}_{cls}" for cls in mlb.classes_])
        data = pd.concat([data.reset_index(drop=True), temp_df.reset_index(drop=True)], axis=1)
        mlb_features.extend(temp_df.columns)

In [29]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = []
for c in ['gender','city','religion','ethnicity','region','country','time_bucket']:
    if c in data.columns:
        categorical_cols.append(c)

enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
if categorical_cols:
    cat_encoded = enc.fit_transform(data[categorical_cols].astype(str))
    cat_cols = enc.get_feature_names_out(categorical_cols)
    cat_df = pd.DataFrame(cat_encoded, columns=cat_cols)
    data = pd.concat([data.reset_index(drop=True), cat_df.reset_index(drop=True)], axis=1)


In [30]:
ignore_cols = ['viewer_id','profile_id','timestamp','timestamp_parsed','name','photo_url','outcome','target','target_binary']

In [31]:
numeric_cols = [c for c in data.columns if c not in ignore_cols + categorical_cols + mlb_features and pd.api.types.is_numeric_dtype(data[c])]

In [32]:
X_num = data[numeric_cols + mlb_features + list(cat_df.columns)].copy()
X_num = X_num.fillna(0)

In [33]:
X_num = X_num.loc[:, ~X_num.columns.duplicated()]

In [39]:
y = data['target']

In [34]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_num)

X_scaled = pd.DataFrame(X_scaled, columns=X_num.columns, index=X_num.index)

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)

In [44]:
X_train

Unnamed: 0,score,age,age_min,age_max,w_age,w_distance,w_interests,age_profile,distance_km,age_diff,...,interests_list_Standup Comedy,interests_list_Startups,interests_list_Tech,interests_list_Theatre,interests_list_Travel,interests_list_Volunteering,interests_list_Yoga,seeking_list_Man,seeking_list_Non-binary,seeking_list_Woman
37,0.264075,0.632456,0.632456,0.632456,0.632456,0.632456,0.632456,0.485071,-0.266123,0.0,...,-0.27735,0.522233,0.595683,-0.156174,-0.156174,-0.324443,-0.447214,0.632456,0.632456,2.061553
6,0.264075,-1.581139,-1.581139,-1.581139,-1.581139,-1.581139,-1.581139,-2.061553,-0.266123,0.0,...,-0.27735,-1.914854,-1.678744,-0.156174,-0.156174,-0.324443,2.236068,-1.581139,-1.581139,-0.485071
35,0.264075,0.632456,0.632456,0.632456,0.632456,0.632456,0.632456,0.485071,-0.266123,0.0,...,-0.27735,0.522233,0.595683,-0.156174,-0.156174,-0.324443,-0.447214,0.632456,0.632456,2.061553
34,0.264075,0.632456,0.632456,0.632456,0.632456,0.632456,0.632456,0.485071,-0.266123,0.0,...,-0.27735,0.522233,0.595683,-0.156174,-0.156174,-0.324443,-0.447214,0.632456,0.632456,2.061553
10,-1.815518,-1.581139,-1.581139,-1.581139,-1.581139,-1.581139,-1.581139,0.485071,1.969311,0.0,...,3.605551,-1.914854,-1.678744,-0.156174,-0.156174,3.082207,-0.447214,-1.581139,-1.581139,-0.485071
40,0.264075,0.632456,0.632456,0.632456,0.632456,0.632456,0.632456,0.485071,-0.266123,0.0,...,-0.27735,0.522233,0.595683,-0.156174,-0.156174,-0.324443,-0.447214,0.632456,0.632456,2.061553
18,0.264075,0.632456,0.632456,0.632456,0.632456,0.632456,0.632456,0.485071,-0.266123,0.0,...,-0.27735,0.522233,0.595683,-0.156174,-0.156174,-0.324443,-0.447214,0.632456,0.632456,-0.485071
8,0.264075,-1.581139,-1.581139,-1.581139,-1.581139,-1.581139,-1.581139,0.485071,-0.266123,0.0,...,-0.27735,-1.914854,-1.678744,6.403124,-0.156174,-0.324443,2.236068,-1.581139,-1.581139,-0.485071
20,0.264075,0.632456,0.632456,0.632456,0.632456,0.632456,0.632456,0.485071,-0.266123,0.0,...,-0.27735,0.522233,0.595683,-0.156174,-0.156174,-0.324443,-0.447214,0.632456,0.632456,-0.485071
17,0.264075,0.632456,0.632456,0.632456,0.632456,0.632456,0.632456,0.485071,-0.266123,0.0,...,-0.27735,0.522233,0.595683,-0.156174,-0.156174,-0.324443,-0.447214,0.632456,0.632456,-0.485071


In [53]:
[each for each in X_scaled.columns if each.startswith('a')]

[]

In [45]:
profile, embedding
sudhr   ]......]
mony    [.......]

37    0
6     1
35    1
34    0
10    1
40    1
18    1
8     0
20    1
17    0
31    1
26    0
33    1
0     0
25    0
27    0
14    1
4     1
1     1
7     1
9     1
3     1
19    1
5     0
13    0
23    1
22    1
15    0
21    0
Name: target, dtype: int64