In [1]:
import scipy.io as sio
from scipy.misc import imread
import numpy as np
import pandas as pd
import sys
import os
import scipy.io
from find_largest_image import find_largest
import tqdm 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier
import random
from numpy.random import choice
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_curve
import sklearn.pipeline as pipeline
import sklearn.preprocessing as preprocessing

In [2]:
# Folder that contains data_features.csv and visual_features.csv.
# Set the SIGVER_DATA_FOLDER environment variable to run the notebook on
# another machine; when it is unset the original local path is used.
data_folder = os.environ.get(
    'SIGVER_DATA_FOLDER',
    'C:\\Users\\Mert\\Documents\\GitHub\\sigver_bmg\\data\\downloaded_pp_features\\mcyt_signet',
)

In [3]:
# Kernel name passed to the SVC that is fitted for each user further down.
user_kernel = 'linear'

In [4]:
# Read both CSV tables from data_folder in one pass.
# NOTE(review): later cells index visual_f with data_f's row labels, so the
# two files are presumably row-aligned -- confirm against the export script.
data_f, visual_f = (
    pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in ('data_features.csv', 'visual_features.csv')
)

In [5]:
# Largest signature index present in the metadata; the training cell selects
# sig_id 1-10 for training and 11-15 for validation.
np.max(data_f['sig_id'].unique())

15.0

# MODEL SELECTION & TRAINING

In [6]:
# Decision scores collected over all folds: one array per (fold, user) pair,
# appended to both lists in the same order.
fakes_preds = []
gens_preds = []

# Dedicated seeded generator so the per-fold user order -- and therefore the
# order of the score lists -- is the same on every Restart & Run All.
fold_rng = np.random.RandomState(42)

for fold in np.arange(0,10):
    user_numbers = data_f['user_id'].unique()
    fold_rng.shuffle(user_numbers)
    # NOTE(review): the three id sets below are the same shuffled array, so no
    # user is held out and, as written, every fold fits and scores on the same
    # rows (only the user order changes). Confirm whether a real
    # dev/validation/exploitation split was intended.
    dev_user_ids = user_numbers
    validation_user_ids = user_numbers
    exploitation_user_ids = user_numbers

    # Metadata rows per set, plus the feature rows sharing the same index labels.
    dev_df = data_f.loc[data_f['user_id'].isin(dev_user_ids)]
    dev_vf = visual_f.loc[dev_df.index]
    val_df = data_f.loc[data_f['user_id'].isin(validation_user_ids)]
    val_vf = visual_f.loc[val_df.index]
    exp_df = data_f.loc[data_f['user_id'].isin(exploitation_user_ids)]
    exp_vf = visual_f.loc[exp_df.index]

    # fakeness == 0 marks genuine signatures, fakeness == 1 forgeries.
    dev_df_gen = dev_df.loc[dev_df['fakeness']==0]
    dev_df_fake = dev_df.loc[dev_df['fakeness']==1]
    dev_df_gen_10 = dev_df_gen.loc[dev_df_gen['sig_id'].isin(np.arange(1,11))]

    val_df_gen = val_df.loc[val_df['fakeness']==0]
    val_df_fake = val_df.loc[val_df['fakeness']==1]
    # Genuine signatures 1-10 are used for training, 11-15 are held back for scoring.
    val_df_gen_10 = val_df_gen.loc[val_df_gen['sig_id'].isin(np.arange(1,11))]
    val_df_valid_gen_5 = val_df_gen.loc[val_df_gen['sig_id'].isin(np.arange(11,16))]

    for user_id in tqdm.tqdm(validation_user_ids, ascii=True):
        clf = SVC(C=1,gamma='scale',class_weight='balanced', probability=False, kernel=user_kernel)

        # Positives: this user's genuine signatures 1-10.
        # Negatives: every genuine signature of all other users.
        train_meta = pd.concat([
            val_df_gen_10.loc[val_df_gen_10['user_id']==user_id],
            dev_df_gen.loc[dev_df_gen['user_id']!=user_id],
        ])
        y_train = train_meta['user_id']==user_id
        X_train = visual_f.loc[y_train.index]
        clf.fit(X_train, y_train)

        # Score this user's forgeries ...
        y_valid_fakes = val_df_fake.loc[(val_df_fake['user_id']==user_id)]
        X_valid_f = visual_f.loc[y_valid_fakes.index]
        fakes_preds.append(clf.decision_function(X_valid_f))

        # ... and the held-back genuine signatures 11-15.
        y_valid_gens = val_df_valid_gen_5.loc[val_df_valid_gen_5['user_id']==user_id]
        X_valid_g = visual_f.loc[y_valid_gens.index]
        gens_preds.append(clf.decision_function(X_valid_g))


100%|##################################################################################| 75/75 [00:12<00:00,  5.59it/s]
100%|##################################################################################| 75/75 [00:12<00:00,  5.97it/s]
100%|##################################################################################| 75/75 [00:12<00:00,  6.18it/s]
100%|##################################################################################| 75/75 [00:12<00:00,  5.67it/s]
100%|##################################################################################| 75/75 [00:12<00:00,  6.56it/s]
100%|##################################################################################| 75/75 [00:12<00:00,  6.01it/s]
100%|##################################################################################| 75/75 [00:12<00:00,  6.40it/s]
100%|##################################################################################| 75/75 [00:12<00:00,  6.03it/s]
100%|###################################

# GLOBAL THRESHOLD SELECTION

In [7]:
# Pool the per-user score arrays into one column vector per class.
flat_fakes_preds = np.array([score for user_scores in fakes_preds for score in user_scores])[:, np.newaxis]
flat_gens_preds = np.array([score for user_scores in gens_preds for score in user_scores])[:, np.newaxis]

# Forgeries are labelled 0 and genuine signatures 1, stacked in the same order as the scores.
all_preds = np.concatenate((flat_fakes_preds, flat_gens_preds), axis=0)
fake_labels = np.zeros((len(flat_fakes_preds), 1))
gen_labels = np.ones((len(flat_gens_preds), 1))
all_labels = np.concatenate((fake_labels, gen_labels), axis=0)

fpr,tpr,threshold = roc_curve(all_labels,all_preds)
fnr = 1 - tpr
# ROC point where the false-negative and false-positive rates are closest;
# the FPR at that point is reported as the EER, its threshold as the global one.
eer_idx = np.nanargmin(np.abs(fnr - fpr))
EER = fpr[eer_idx]
eer_th = threshold[eer_idx]
print('EER_glob : ', EER*100,'\nEER_Threshold_glob : ', eer_th)

EER_glob :  8.799999999999999 
EER_Threshold_glob :  0.3022383641116666


In [8]:
# The two lists are paired entry by entry, so they must have the same length.
assert len(fakes_preds)==len(gens_preds)
EER_accum=0
for user_fakes, user_gens in zip(fakes_preds, gens_preds):
    # Column vectors of this entry's forgery and genuine scores.
    user_fakes_preds = np.array(user_fakes)[:, np.newaxis]
    user_gens_preds = np.array(user_gens)[:, np.newaxis]
    all_user_preds = np.concatenate((user_fakes_preds, user_gens_preds), axis=0)
    # Forgeries are labelled 0, genuine signatures 1.
    all_user_labels = np.concatenate(
        (np.zeros((len(user_fakes_preds), 1)), np.ones((len(user_gens_preds), 1))),
        axis=0,
    )
    fpr,tpr,threshold = roc_curve(all_user_labels,all_user_preds)
    fnr = 1 - tpr
    # FPR at the ROC point where FNR and FPR are closest (per-entry EER).
    EER = fpr[np.nanargmin(np.abs(fnr - fpr))]
    EER_accum = EER_accum + EER

# Mean per-entry EER, in percent.
print('EER_user : ', (EER_accum*100)/len(fakes_preds))

EER_user :  3.822222222222214
