In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from scipy.optimize import brentq
from scipy.interpolate import interp1d
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from scipy.spatial.distance import euclidean, cityblock

plt.style.use('ggplot')
%matplotlib inline

In [2]:
DATASET_2 = 'DSL-StrongPasswordData.csv'
df = pd.read_csv(DATASET_2)
subject = df['subject']
df.head()

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


In [3]:
def load_data(file_name, col_to_remove):
    df = pd.read_csv(file_name)

    intruders = 5
    sps = 400
    ending = df.shape[0] - intruders*sps

    df2= df.iloc[ending:]
    df = df.iloc[:ending]

    

    H_columns_genuine  = [col for col in df.columns if col.startswith('H')]
    DD_columns_genuine = [col for col in df.columns if col.startswith('DD')]
    UD_columns_genuine = [col for col in df.columns if col.startswith('UD')]

    data = {}
    data['total'] = df.drop(columns=col_to_remove)
    data['H']     = df[H_columns_genuine]
    data['DD']    = df[DD_columns_genuine]
    data['UD']    = df[UD_columns_genuine]
    data['pca3']  = pd.DataFrame(PCA(n_components=3).fit_transform(data['total']))
   # data['pca10'] = pd.DataFrame(PCA(n_components=10).fit_transform(data['total']))
    

    H_columns_imposter  = [col for col in df.columns if col.startswith('H')]
    DD_columns_imposter = [col for col in df.columns if col.startswith('DD')]
    UD_columns_imposter = [col for col in df.columns if col.startswith('UD')]

    data_imposter = {}
    data_imposter['total'] = df2.drop(columns=col_to_remove)
    data_imposter['H']     = df2[H_columns_imposter]
    data_imposter['DD']    = df2[DD_columns_imposter]
    data_imposter['UD']    = df2[UD_columns_imposter]
    data_imposter['pca3']  = pd.DataFrame(PCA(n_components=3).fit_transform(data_imposter['total']))
    

    
    return data, df['subject'].values, data_imposter, df2['subject'].values


def calculate_KMeans(data, y):
    """
    Funzione che addestra il modello KNN su tutti i dati e restituisce il modello addestrato.
    """
    X = data['total']
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
    
    # Ricerca dei migliori iperparametri per KNN
    n_neighbors = [2, 3, 4, 5, 6, 7, 8, 9, 10]
    parameters = dict(n_neighbors=n_neighbors)
    clf = KNeighborsClassifier()
    grid = GridSearchCV(clf, parameters, cv=5)
    grid.fit(X_train, Y_train)
    
    # Ottieni il miglior modello KNN
    best_model = grid.best_estimator_

    # Predizioni per valutazione ---> restituisce il soggetto predetto
    Y_pred = grid.predict(X_test)
    

    return best_model, X_test, Y_test



def test_KMeans(data, y,model):
    """
    Funzione che addestra il modello KNN su tutti i dati e restituisce il modello addestrato.
    """
    X = data['total']
    
    

    # Predizioni per valutazione ---> restituisce il soggetto predetto
    Y_pred = model.predict(X)
    test= model.predict_proba(X)
    #print(', '.join(map(str, test)))
    #print(grid.predict(X_test))
    

    # ROC Curve per il modello
    #fpr, tpr, threshold = roc_curve(pd.get_dummies(Y_test).values.ravel(), pd.get_dummies(Y_pred).values.ravel())

    #return best_model, fpr, tpr, threshold, X, Y_test
    print(', '.join(map(str, test)))
    



def identify_user(model, samples ,threshold=0.5):
    """
    Identifica il soggetto a partire da un campione dato.
    
    Parameters:
    - model: modello KNN allenato.
    - sample: array con le metriche di keystroke dell'utente (1xN).
    
    Returns:
    - Predizione del soggetto più simile.
    """

    
    subject = model.predict(samples)  # Predice il soggetto
   
    probe = np.max(model.predict_proba(samples),axis=1)  # Probabilità di appartenenza
    subject = np.where(probe <= threshold, 'Unknown', subject) # Se la probabilità è bassa, il soggetto è sconosciuto

    return subject,probe




In [4]:
data, y,data_imposter,y_imposter = load_data(DATASET_2, ['subject', 'sessionIndex', 'rep'])
Y = pd.get_dummies(y).values

#eer2_2 = brentq(lambda x : 1. - x - interp1d(fpr2_2, tpr2_2)(x), 0., 1.)
# Otteniamo il modello e i dati di testing
knn_model, X_test, Y_test = calculate_KMeans(data, y)

identify_user(knn_model, X_test)
#test_KMeans(data_imposter, y_imposter,knn_model)


(array(['s007', 'Unknown', 's017', ..., 's017', 'Unknown', 's012'],
       dtype=object),
 array([0.8, 0.4, 1. , ..., 0.8, 0.4, 0.6]))

METRICHE

In [5]:
X_imp = data_imposter['total']

gen_acc=0
imp_acc=0
gen_total= X_test.shape[0]
imp_total= X_imp.shape[0]
id ='pippo'

genuine_y_pred=identify_user(knn_model, X_test)[0]
impostor_y_pred=identify_user(knn_model, X_imp)[0]

gen_acc=np.count_nonzero(np.not_equal(genuine_y_pred, "Unknown"))
imp_acc=np.count_nonzero(np.not_equal(impostor_y_pred, "Unknown"))


print(np.not_equal(genuine_y_pred, "Unknown"))
print('TAR:',gen_acc/gen_total)
print('FRR:',1-gen_acc/gen_total)
print('FAR:',imp_acc/imp_total)
print('TRR:',1-imp_acc/imp_total)   


[ True False  True ...  True False  True]
TAR: 0.7845108695652174
FRR: 0.21548913043478257
FAR: 0.58
TRR: 0.42000000000000004


In [6]:
def test_with_testing_set(X_test, Y_test, model):
    """
    Funzione per testare solo il set di testing.
    """
    correct_predictions = 0
    total_predictions = 0

    for subject_id in np.unique(Y_test):  # Itera su tutti gli ID unici dei soggetti nel testing set
        try:
            # Trova gli indici dei campioni relativi al soggetto corrente
            indices = np.where(Y_test == subject_id)[0]
            
            # Calcola la media delle caratteristiche per il soggetto corrente
            sample_input = X_test.iloc[indices].mean(axis=0).values

            
            # Predici il soggetto usando il modello
            predicted_subject = identify_user(model, sample_input)
            
            # Incrementa il contatore per le predizioni corrette se la predizione è corretta
            if predicted_subject == subject_id:
                correct_predictions += 1
            total_predictions += 1

            # Stampa per ogni soggetto (opzionale)
            print(f"Subject {subject_id}: Predicted = {predicted_subject}, Actual = {subject_id}")

        except ValueError as e:
            print(f"Error for subject {subject_id}: {e}")

    # Calcola la precisione del modello
    accuracy = correct_predictions / total_predictions
    print(f"\nAccuracy for testing set: {accuracy:.4f}")

# Esegui il test
#test_with_testing_set(X_test, Y_test, knn_model)


def test_single_subject(X_test, Y_test, model, subject_id):
    """
    Funzione per testare il modello su un singolo soggetto.
    
    Parameters:
    - X_test: Feature del set di testing.
    - Y_test: Etichette del set di testing.
    - model: Modello KNN addestrato.
    - subject_id: ID del soggetto da testare.
    """
    try:
        # Trova gli indici dei campioni relativi al soggetto specifico
        indices = np.where(Y_test == subject_id)[0]
        
        if len(indices) == 0:
            print(f"Soggetto {subject_id} non presente nel set di testing.")
            return
        
        # Calcola la media delle caratteristiche per il soggetto
        #sample_input = X_test.iloc[indices].mean(axis=0).values
        
        sample_input = X_test.iloc[indices].sample(n=1).values[0]

        # Predici il soggetto usando il modello
        predicted_subject = identify_user(model, sample_input)
        
        # Valuta la predizione
        print(f"Subject {subject_id}: Predicted = {predicted_subject}, Actual = {subject_id}")
        
        # Accuratezza per il soggetto specifico
        if predicted_subject == subject_id:
            print(f"Prediction for Subject {subject_id[0]} is CORRECT!")
        else:
            print(f"Prediction for Subject {subject_id[0]} is INCORRECT.")

    except ValueError as e:
        print(f"Error for subject {subject_id}: {e}")

test_single_subject(X_imp, y_imposter, knn_model, 's054')



def test_single_sample(sample, model, subject_id):
    """
    Funzione per testare il modello su un singolo sample.
    
    Parameters:
    - X_test: Feature del set di testing.
    - Y_test: Etichette del set di testing.
    - model: Modello KNN addestrato.
    - subject_id: ID del soggetto da testare.
    """
    
                

    # Predici il soggetto usando il modello
    predicted_subject,probe = identify_user(model, sample)
    
    # Valuta la predizione
    print(f"Subject {subject_id}: Predicted = {predicted_subject}, Actual = {subject_id}")
    
    # Accuratezza per il soggetto specifico
    if predicted_subject == subject_id:
        return 1

    elif predicted_subject=='Unknown':
        return -1
    else:
        return 0

    

#test_single_subject(X_imp, y_imposter, knn_model, 's054')
test_single_sample(pippo,model)



Error for subject s054: Expected 2D array, got 1D array instead:
array=[ 0.0681  0.2827  0.2146  0.0818  0.1064  0.0246  0.1055  0.0931 -0.0124
  0.0855  0.2653  0.1798  0.0697  0.333   0.2633  0.0628  0.1085  0.0457
  0.0847  0.1357  0.051   0.1058  0.1032 -0.0026  0.062   0.1685  0.1065
  0.0944  0.2341  0.1397  0.1   ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.




NameError: name 'pippo' is not defined