In [1]:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import sklearn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import pandas as pd

from utils import *

In [2]:
import matplotlib as mpl
import scienceplots

# Start from matplotlib's defaults, then layer the SciencePlots styles on top.
# The reset must come first: it overwrites every rcParam, so any setting made
# before it (such as text.usetex) would be silently discarded.
mpl.rcParams.update(mpl.rcParamsDefault)
plt.style.use(['science', 'grid'])
# Render text through LaTeX; the figure labels in this notebook are raw TeX strings.
mpl.rcParams['text.usetex'] = True
# Shared seed passed as random_state to scikit-learn estimators.
sklearn_rand_state = 0

In [3]:
# Read the two CSV halves, then stack them row-wise into one frame with a fresh 0..n-1 index.
first_half, second_half = (
    pd.read_csv(csv_path)
    for csv_path in ('turning-the-tables/v-1.csv', 'turning-the-tables/v-2.csv')
)

df = pd.concat((first_half, second_half), ignore_index=True)
df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month,x1,x2
0,0,0.1,0.244166,214,3,50,20.66231,15.930899,AA,1922,...,0,INTERNET,8.513914,linux,1,1,0,0,-0.600198,-0.296015
1,0,0.8,0.03569,-1,242,50,6.134627,20.574086,AA,4614,...,0,INTERNET,4.399087,windows,1,1,0,0,-1.406937,2.136468
2,0,0.9,0.030506,-1,95,50,0.001012,-0.591869,AA,1231,...,0,INTERNET,6.98968,linux,0,1,0,0,-0.688785,0.68664
3,0,0.9,0.840711,-1,105,60,0.012292,106.800603,AA,1874,...,0,INTERNET,50.544274,linux,0,1,0,0,-0.865885,-2.078178
4,0,0.7,0.275118,-1,374,60,1.081235,-0.365989,AD,5070,...,0,INTERNET,4.847442,linux,1,1,0,0,-0.339504,-2.032053


In [4]:
# Summary statistics for every column; include='all' also reports the non-numeric columns.
df.describe(include='all')

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month,x1,x2
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000,1000000.0,...,1000000.0,1000000,1000000.0,1000000,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
unique,,,,,,,,,5,,...,,2,,5,,,,,,
top,,,,,,,,,AB,,...,,INTERNET,,linux,,,,,,
freq,,,,,,,,,399073,,...,,992330,,337150,,,,,,
mean,0.01103,0.578958,0.4875269,14.744036,99.187295,41.34948,0.9164764,8.571482,,1517.471615,...,0.023991,,7.817692,,0.556301,1.022276,0.0,3.658708,0.010499,0.008503
std,0.104443,0.288226,0.2913674,43.134138,94.070293,13.75192,5.068976,20.54464,,965.945989,...,0.153021,,8.259055,,0.49682,0.192862,0.0,2.116726,1.010044,1.009068
min,0.0,0.1,7.898994e-07,-1.0,-1.0,10.0,1.414624e-07,-15.710457,,1.0,...,0.0,,-1.0,,0.0,-1.0,0.0,0.0,-4.977864,-4.846414
25%,0.0,0.3,0.2145294,-1.0,27.0,30.0,0.007451458,-1.178938,,885.0,...,0.0,,3.151601,,0.0,1.0,0.0,2.0,-0.669626,-0.670979
50%,0.0,0.6,0.4858928,-1.0,64.0,50.0,0.01567347,-0.833821,,1208.0,...,0.0,,5.246432,,1.0,1.0,0.0,4.0,0.005114,0.003902
75%,0.0,0.8,0.7545309,-1.0,154.0,50.0,0.0269882,-0.052483,,1846.0,...,0.0,,9.362126,,1.0,1.0,0.0,5.0,0.683212,0.680875


In [5]:
# Split the frame: the first column is the target, everything else is a feature.
# NOTE(review): the displayed head suggests the first column is `fraud_bool` — confirm.
X = df[df.columns[1:]]
y = df[df.columns[0]]

# One-hot encode the categorical columns in a single pass. The dummy columns are
# appended after the remaining features, in the order listed here. Column names
# are prefixed with the source column, so identical category labels in two
# different columns cannot collide.
categorical_cols = ['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']
X = pd.get_dummies(X, columns=categorical_cols)

# Standardize every column; X becomes a NumPy array from here on.
scaler = sklearn.preprocessing.StandardScaler()
X = scaler.fit_transform(X)
# Re-code the target as y*2 - 1 (0 -> -1, 1 -> +1, assuming 0/1 labels — TODO confirm).
y = y * 2 - 1

In [6]:
# Sanity check: fit a class-weighted SGD linear classifier on all samples and
# report accuracy overall, then on the -1 class, then on the +1 class.
svm = sklearn.linear_model.SGDClassifier(
    random_state=sklearn_rand_state,
    class_weight='balanced',
).fit(X, y)
(svm.score(X, y),) + tuple(svm.score(X[y == label], y[y == label]) for label in (-1.0, 1.0))

(0.817679, 0.8178266277035704, 0.8044424297370807)

In [7]:
# Dimensions of the feature matrix (rows = samples, columns = features after preprocessing).
X.shape

(1000000, 54)

In [8]:
# Keep only the samples that the currently fitted classifier (`svm`) labels correctly.
# A single batched predict replaces one predict call per row.
y = np.array(y)
corrects = np.flatnonzero(svm.predict(X) == y)
X = X[corrects]
y = y[corrects]

num_each = 5_000 # Arbitrary max number of pos&neg to reduce runtime
# Take the first `num_each` samples of each class, negatives first, then positives.
idx_pos = np.where(y == 1)[0][:num_each]
idx_neg = np.where(y == -1)[0][:num_each]
shortened = np.concatenate([idx_neg, idx_pos])
X = X[shortened]
y = y[shortened]
# Recompute the class index arrays so they refer to positions in the reduced X / y.
idx_pos = np.where(y == 1)[0][:num_each]
idx_neg = np.where(y == -1)[0][:num_each]

In [None]:
# Sanity check on the reduced data: refit an SGD linear classifier and report
# accuracy overall, on the -1 class, and on the +1 class.
# random_state is fixed (as in the earlier SGD check) so the result is reproducible.
svm = sklearn.linear_model.SGDClassifier(random_state=sklearn_rand_state)
svm.fit(X, y=y)
svm.score(X, y=y), svm.score(X[y==-1.0], y[y==-1.0]), svm.score(X[y==1.0], y[y==1.0])

In [9]:
# Sanity check: linear-kernel SVC with a very large C (little tolerance for
# margin violations). Reports accuracy overall, on the -1 class, on the +1 class.
svm = sklearn.svm.SVC(kernel='linear', C=1e10)
svm.fit(X, y)
(svm.score(X, y),) + tuple(svm.score(X[y == label], y[y == label]) for label in (-1.0, 1.0))

(1.0, 1.0, 1.0)

In [None]:
# Sanity check: RBF-kernel SVC with default settings.
# Reports accuracy on all samples, then on the -1 class, then on the +1 class.
svm = sklearn.svm.SVC(kernel='rbf').fit(X, y)
tuple(svm.score(X[rows], y[rows]) for rows in (slice(None), y == -1.0, y == 1.0))

# All At Once

In [None]:
# Initialization

SEED = 100
m = 5          # number of models trained per epoch
p = 0.0        # forwarded to cache_memory
q = 2.0        # forwarded to opt_alpha
epochs = 15
n, d = X.shape
rng = np.random.default_rng(SEED)


def _collect_scores(fitted_models):
    """Return a float tensor with one column of capped decision values per model.

    Each model's ``decision_function(X)`` is passed through ``cap_utilities``;
    the per-model results are stacked and transposed so that models index the
    last axis (assumes ``cap_utilities`` returns one value per sample — TODO confirm).
    """
    per_model = [cap_utilities(mdl.decision_function(X)) for mdl in fitted_models]
    return torch.tensor(np.stack(per_model).T, dtype=torch.float)


# Seed every model by fitting it on one randomly drawn positive and one
# randomly drawn negative sample.
models = [sklearn.svm.SVC(C=1e10, kernel='linear') for _ in range(m)]
for j in range(m):
    i_pos = idx_pos[rng.integers(len(idx_pos))]
    i_neg = idx_neg[rng.integers(len(idx_neg))]
    models[j].fit(X[[i_pos, i_neg]], y=[1, -1])
# Collect scores
y_hats = _collect_scores(models)
M = np.zeros_like(y_hats)

# Setup End-Of-Run Stats
y_hats_eor = []
alphas_eor = []
Ms_eor = []
models_eor = []

# Iterable
for e in range(epochs):
    y_hats_eor.append(y_hats)
    # Optimize alpha
    alpha = opt_alpha(y_hats, quiet=False, round_lr=True, q=q)
    alphas_eor.append(alpha)
    # Update memory
    M = cache_memory(alpha=alpha, mem=M, p=p)
    Ms_eor.append(M)
    # Train new models
    # Keep a handle on the models currently in use. In the first epoch these are
    # the initial models (models_eor is still empty then); afterwards they are
    # the previous epoch's models.
    prev_models = models
    models = []
    for j in range(m):
        sample_weight = M.T[j]
        signed_weight = sample_weight.numpy() * y
        # Refit only when weight * label has at least one positive and one
        # negative entry; otherwise carry the previous model j forward unchanged.
        if not (np.any(signed_weight > 0) and np.any(signed_weight < 0)):
            models.append(prev_models[j])
            continue
        models.append(sklearn.svm.SVC(C=1e10, kernel='linear').fit(X, y=y, sample_weight=sample_weight))
    models_eor.append(models)
    # Collect new scores
    y_hats = _collect_scores(models)

  loss = 0.5 * (alpha @ one_m).T @ (alpha @ one_m) - \
Loss: -4503.05/Max: 1.02:  22%|███████████▋                                         | 221/1000 [04:39<16:12,  1.25s/it]

In [None]:
# Plot, for every model j, the summed alpha entries over the positive samples
# and over the negative samples at each recorded epoch.
plt.rcParams.update({'font.size': 20})
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']

str_p = repr(str(p))[1:-1]  # not used in this cell; kept in case other cells reference it
title = r'Usages $A_{i,j}$ for' + str_var('p', p) + str_var('m', m) + str_var('q', q) + str_var('s', SEED)[:-1]
ax.set_title(title)

# Use the number of epochs actually recorded, so the x-axis always matches the
# data even if the training loop stopped before reaching `epochs`.
epochs_run = len(alphas_eor)
epoch_axis = range(epochs_run)

lines = []
# Positive-sample curves are drawn first, then negative-sample curves; both
# share the per-model colour and differ only in marker and label.
for sign, sample_idx, marker in (('+', idx_pos, 7), ('-', idx_neg, '.')):
    for j in range(m):
        usage = [torch.sum(a.T[j][sample_idx]) for a in alphas_eor]
        label = r'$i \in [n^' + sign + r'], j=' + str(j) + r'$'
        # Wrap the colour index so more models than colours does not raise IndexError.
        lines += ax.plot(epoch_axis, usage, label=label, marker=marker,
                         color=colors[j % len(colors)], alpha=0.5, markersize=15)

ax.set_xlabel(r'Epochs (t)')
ax.set_xticks([0] + list(range(1, epochs_run, 3)))
# ax.legend(handles = lines[:m+1], loc=7)
plt.show()