In [2]:
#DO NOT CHANGE THIS CODE
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, StratifiedShuffleSplit

import matplotlib.pyplot as plt
from auxiliar_funcs import *

import pmtools2 as pm
import kmodes
%matplotlib inline

In [3]:
# Load data
url_file = '../00-Data/cav_policies.csv'
cav_data = pd.read_csv(url_file)

# Get a smaller sample: 15K positive and 15k negative.
# cav_data = cav_data.groupby('result').sample(n=15000)
print("Columns: ", cav_data.columns)
print("Lenght: ", len(cav_data)); print()

user_attr = ['control', 'monitoring', 'fallback', 'weather', 'visibility', 
        'traffic_congestion']
#user_attr = ['control', 'monitoring', 'fallback']
rsrc_attr = ['driving_task_loa', 'vehicle_loa', 'region_loa']
cav_data = cav_data[user_attr + rsrc_attr + ['result']]

# Change string values to numerical
mapping = {'system': 10101, 'human': 10201, 'human and system': 10301} # Control
cav_data.control = cav_data.control.replace(mapping)

mapping = {'system': 20102, 'human': 20202} # monitoring
cav_data.monitoring = cav_data.monitoring.replace(mapping)

mapping = {'system': 30103, 'human': 30203} # fallbacj
cav_data.fallback = cav_data.fallback.replace(mapping)

mapping = {0: 40004, 1: 40104, 2: 40204, 3: 40304, 4: 40404, 5: 40504}
cav_data.driving_task_loa = cav_data.driving_task_loa.replace(mapping)

mapping = {0: 50005, 1: 50105, 2: 50205, 3: 50305, 4: 50405, 5: 50505}
cav_data.vehicle_loa = cav_data.vehicle_loa.replace(mapping)

mapping = {0: 60006, 1: 60106, 2: 60206, 3: 60306, 4: 60406, 5: 60506}
cav_data.region_loa = cav_data.region_loa.replace(mapping)


print("# User attr:", len(user_attr))
print("# Rsrc attr:", len(rsrc_attr)); print()

# Data statictics:
n_users = len(cav_data[user_attr].drop_duplicates())
n_rscrc = len(cav_data[rsrc_attr].drop_duplicates())
print("|U| =", n_users)
print("|R| =", n_rscrc); print()

# Add user and resource id columns
user_dict = get_user_res(cav_data, user_attr, True)
rsrc_dict = get_user_res(cav_data, rsrc_attr, False)
cav_data = add_col(cav_data, user_dict, user_attr, "USRID")
cav_data = add_col(cav_data, rsrc_dict, rsrc_attr, "RESID")

# Accepted and rejected requests
cav_pos = cav_data[cav_data.result == 'approved']
cav_neg = cav_data[cav_data.result == 'rejected']
print("|L+| =", len(cav_pos), "{:.2f}%".format((len(cav_pos) 
        / len(cav_data) ) * 100))
print("|L-| =", len(cav_neg), "{:.2f}%".format((len(cav_neg) 
        / len(cav_data) ) * 100))

# Cross validation
k = 10
test_size = 0.2
kfold = StratifiedShuffleSplit(n_splits=k, test_size=test_size, random_state=1)

data_partition = kfold.split(cav_data, cav_data.result)
data_curpus = [] # A list to storage the k folds

for train_data, test_data in data_partition:
    X_train, X_test = cav_data.iloc[train_data], cav_data.iloc[test_data]
    data_curpus.append([X_train, X_test])

print("Done!")  
print(" - k =", k)
print(" - Train-Test size: ", len(data_curpus[0][0]), "(", (1-test_size)*100, ") \t", len(data_curpus[0][1]), "(", test_size*100, ")")

Columns:  Index(['driving_task_type', 'control', 'monitoring', 'fallback', 'weather',
       'visibility', 'traffic_congestion', 'environmental_weighted_average',
       'driving_task_loa', 'vehicle_loa', 'region_loa', 'result'],
      dtype='object')
Lenght:  239580

# User attr: 6
# Rsrc attr: 3

|U| = 6655
|R| = 216

|L+| = 118975 49.66%
|L-| = 120605 50.34%
Done!
 - k = 10
 - Train-Test size:  191664 ( 80.0 ) 	 47916 ( 20.0 )


In [4]:
id_kfold = 1

cav_train, cav_test = data_curpus[id_kfold][0], data_curpus[id_kfold][1]
print("# Train access request =", len(cav_train), "{:.2f}%".format(
    len(cav_train)/(len(cav_train)+len(cav_test))*100))
print("# Train access request =", len(cav_test), "{:.2f}%".format(
    len(cav_train)/(len(cav_train)+len(cav_test))*100))
print("Total =", len(cav_train)+len(cav_test)); print()

#### **** SELECT FUNCTIONAL ATTRIBUTES **** ####
cav_train = cav_train[user_attr + rsrc_attr + ['USRID', 'RESID', 'result']]
cav_test = cav_test[user_attr + rsrc_attr + ['USRID', 'RESID', 'result']]

##### ***** Task 1: Null and uknwokn values ***** #####
print("TASK 1: Done!"); print() # NA


##### ***** TASK 2: convert continuous values to categorical values ***** #####
print("TASK 2: Done!"); print() # NA 

##### ***** TASK 3: Drop duplicates access requests ***** #####
print("TASK 3: Drop duplicates access requests")

positive_cav_train = cav_train[cav_train.result=='approved']
positive_cav_test = cav_test[cav_test.result=='approved']
negative_cav_train = cav_train[cav_train.result=='rejected']
negative_cav_test = cav_test[cav_test.result=='rejected']


print(" -TRAIN DATA: Removing", 
    len(positive_cav_train.drop_duplicates()) - 
    len(positive_cav_train), "positive access requests")
print(" -TRAIN DATA: Removing", 
    len(negative_cav_train.drop_duplicates()) - 
    len(negative_cav_train), "negative access requests")
print(" -TEST DATA: Removing", 
    len(positive_cav_test.drop_duplicates()) - 
    len(positive_cav_test), "positive access requests")
print(" -TEST DATA: Removing", 
    len(negative_cav_test.drop_duplicates()) - 
    len(negative_cav_test), "negative access requests")

# Filter resources
#bolean_series = negative_cav_train.RESID.isin(top_list)
#negative_cav_train = negative_cav_train[bolean_series]
print("Hecho!")

# Train access request = 191664 80.00%
# Train access request = 47916 80.00%
Total = 239580

TASK 1: Done!

TASK 2: Done!

TASK 3: Drop duplicates access requests
 -TRAIN DATA: Removing 0 positive access requests
 -TRAIN DATA: Removing 0 negative access requests
 -TEST DATA: Removing 0 positive access requests
 -TEST DATA: Removing 0 negative access requests
Hecho!


In [5]:
###Select the number of clusters###
num_clusters = 20

#DO NOT CHANGE THIS CODE
# seed = 29

#Compute centroids and labels
# num_init = 5
centroids = []
kmodes_huang = kmodes.KModes(n_clusters=num_clusters, init='Huang', verbose=0)
cluster_labels = kmodes_huang.fit_predict(positive_cav_train)
centroids = kmodes_huang.cluster_centroids_

print('Ready!')    

Ready!


In [6]:
positive_cav_train["cls"] = cluster_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
def freq(value, attribute, dataplace):
    """
    Calculate the frequency of the value in the dataplace.

    Parameters
    ----------
    value : int
        Value to compute its frequency.

    attribute : string
        Name of the attribute.

    dataplace : DataFrame pandas
        Data to search.
    
    Returns
    -------
    float [0-1]
        Returns the value of frequency of the value in the data.
    """
    value_freq = dataplace[dataplace[attribute] == value].drop_duplicates()
    return len(value_freq) / len(dataplace)

def freq_rels(attrA, attrB, dataplace):
    """
    Compute the frequency of the attribute relation.

    Parameters
    ----------
    attrA : string
        Name of the attribute A to compare.
    attrB : string
        Name of the attribute B to compare.
    dataplace : DataFrame pandas
        Data to search.

    Returns
    -------
    float [0-1]
        Returns the value of frequency of the range of values in the data.
    """
    # Get the range of values of attribute A.
    range_val_A = set(dataplace[attrA].values.tolist())

    # Get the range of values of attribute B.
    range_val_B = set(dataplace[attrB].values.tolist())

    # Check if the len
    if len(range_val_A) == len(range_val_B):
        # Compute the intersection
        inter_A_B = range_val_A.intersection(range_val_B)
        if len(inter_A_B) == len(range_val_A):
            boolean_series = dataplace[attrA].isin(inter_A_B)
            frac_log = dataplace[boolean_series]
            return len(frac_log) / len(dataplace) # Return the fraction
        return 0
    else:
        return 0    

def extract_attributes_filters(C_i, A, L, posThr, negThr):
    """
    Effective attribute extraction algorithm. Generate a rule for each cluster.

    Parameters
    ----------
    C_i : DataFrame pandas
        Access request in the Cluster i.

    A : List
        List of attributes.

    V : List
        Values of attributes.

    L : DataFrame
        Complete Access Log.

    PosThr : float
        Positive Threshold to the effective positive attribute.

    NegThr : float
        Negative Threshold to the effective negative attribute.

    Returns
    -------
    list
        Returns the rule with the effective attributes for the cluster i.
    """
    filter_to_ret = [] # Rule
    for a in A:        
        a_values = C_i[a].drop_duplicates().tolist()        
        for v in a_values:
            if freq(v, a, C_i) - freq(v, a, L) > posThr:
                if not [a, v] in filter_to_ret:
                    filter_to_ret.append([a, v])
            if freq(v, a, L) - freq(v, a, C_i) > negThr:
                if not [a, -1*v] in filter_to_ret:
                    filter_to_ret.append([a, v*-1])
    return filter_to_ret

def extract_relations(C_i, A, L, posThr, negThr):
    """
    Extract the effective relation. For each cluster.

    Parameters
    ----------
    C_i : DataFrame pandas
        Access request in the Cluster i.

    A : List
        List of attributes.

    L : DataFrame
        Complete Access Log.

    posThr : float
        Positive Threshold to the effective positive relation.

    negThr : float
        Negative Threshold to the effective negative relation.

    Returns
    -------
    list
        Returns the rule with the effective relation for the cluster i.
    """
    relation_to_ret = []
    for a in A:
        for b in A:
            if a != b:
                if freq_rels(a, b, C_i) - freq_rels(a, b, L) > posThr:
                    if not [a, b] in relation_to_ret:
                        relation_to_ret.append([a, b])
                if freq_rels(a, b, L) - freq_rels(a, b, C_i) > negThr:
                    if not [a, '!'+b] in relation_to_ret:                        
                        relation_to_ret.append([a, '!'+b])
                    #print()

def rule_inference(data_, pos_attr_thr, 
    neg_attr_thr, pos_rel_thr, neg_rel_thr):
    rule_list = [] # All rules
    n_cluster = len(data_["cls"].drop_duplicates()) # N clusters
    attrs = data_.columns[:-1] # Name of the columns

    for C_i in range(n_cluster):
        #print(C_i)
        rule_i = []
        data_cluster = data_[data_["cls"] == C_i]
        
        # Effective attributes
        attr_filters = extract_attributes_filters(data_cluster, attrs, data_, 
            pos_attr_thr, neg_attr_thr)    
        rule_i.append(attr_filters)        

        # Relations
        attr_relation = extract_relations(data_cluster, attrs, data_, 
            pos_rel_thr, neg_rel_thr)
        rule_i.append(attr_relation)
        #print(rule_i)

        rule_list.append([C_i, rule_i])    

    return rule_list

In [8]:
df_test = positive_cav_train[['monitoring', 'fallback', 'weather', 'visibility', 'traffic_congestion',
       'driving_task_loa', 'vehicle_loa', 'region_loa', 'cls']]
df_test.columns

Index(['monitoring', 'fallback', 'weather', 'visibility', 'traffic_congestion',
       'driving_task_loa', 'vehicle_loa', 'region_loa', 'cls'],
      dtype='object')

In [9]:
pos_attr_thr = 0.3
neg_attr_thr = 0.2
pos_rel_thr = 0.2
neg_rel_thr = 0.2
test = rule_inference(df_test, pos_attr_thr, neg_attr_thr, pos_rel_thr, neg_attr_thr)
len(test)

20

In [10]:
only_rules = []
for rule in test:
    only_rules.append(rule[1][0])
len(only_rules)

20

In [11]:
# Compute how many rules has a lenght equal to 1.
for idx, rule in enumerate(only_rules):
    if len(rule) < 2:
        print(idx)

In [None]:
del only_rules[11]

In [12]:
false_neg  = []
for i,row in positive_cav_train.iterrows():
    
    # Evaluación
    denies_count = 0    
    for rule in only_rules:                                      
        # En esta parte se evalua la regla completa
        res = True
        
        #for idx_r, attr_val in enumerate(rule):
        for attr_val in rule:            
            if attr_val[1] < 0:
                if row[attr_val[0]] == attr_val[1]*-1:
                    res = False
                    break
            else:
                if row[attr_val[0]] != attr_val[1]:
                    res = False
                    break
        if res == False:
            denies_count += 1
    
    if denies_count == len(only_rules):
        false_neg.append(row)
        #print("FP-2")

FN = len(false_neg)
print("Tasa FN: {:.2f}".format((FN/ len(positive_cav_train))*100))
print("FN: ", FN, " de ", len(positive_cav_train))

Tasa FN: 24.07
FN:  22912  de  95180


In [13]:
false_pos  = []
for i,row in negative_cav_train.iterrows():
    # Evaluación
    denies_count = 0
    temp_rules_n = 0
    for rule in only_rules:                                      
        # En esta parte se evalua la regla completa
        res = True                        
        for attr_val in rule:
            if attr_val[1] < 0:
                if row[attr_val[0]] == attr_val[1]*-1:
                    res = False
                    break
            else:
                if row[attr_val[0]] != attr_val[1]:
                    res = False
                    break
        if res == False:
            denies_count += 1                                
    #print("XXX-", denies_count, temp_rules_n, res)
    if denies_count < len(only_rules):
        false_pos.append(row)
        #print("FP-2")    
    #else:
    #    print("ENtra PAPA")
    
FP = len(false_pos)
print("Tasa FP: {:.2f}".format((FP/ len(negative_cav_train))*100))
print("FN: ", FP, " de ", len(negative_cav_train))

Tasa FP: 40.68
FN:  39251  de  96484


In [14]:

TP = len(positive_cav_train) - FN
#TP = 50 - FN
TN = len(negative_cav_train) - FP
#TN = 50 - FP

precision = TP / (TP + FP)

recall = TP / (TP + FN)

fscore = 2*(precision*recall)/(precision+recall)

print("FN:", FN, " - {:.2f}".format((FN/len(positive_cav_train))*100))
#print("FN:", FN, " - {:.2f}".format((FN/50)*100))
print("FP:", FP, " - {:.2f}".format((FP/len(negative_cav_train))*100))
#print("FP:", FP, " - {:.2f}".format((FP/50)*100))
print("Precision:", precision)
print("Recall:", recall)
print("F-score", fscore)

def compute_wsc(policy):
    return sum([len(rule) for rule in policy])

print("# Rules:", len(only_rules))
print("WSC:", compute_wsc(only_rules))

FN: 22912  - 24.07
FP: 39251  - 40.68
Precision: 0.6480330705978353
Recall: 0.7592771590670309
F-score 0.699258341840067
# Rules: 20
WSC: 79


In [15]:
def convert_rule_to_str(rule):
    rule_str = []
    #print(rule)
    for item in rule:
        rule_str.append(item[0]+'*'+str(item[1]))
    return set(rule_str)

def convert_set_to_list(rule):
    rule_list = []    
    for item in rule:
        rule_list.append([item.split('*')[0], int(item.split('*')[1])])
    return rule_list

def jaccard_similarity(rule_i, rule_j):

    #transofrm data
    rule_i_str = convert_rule_to_str(rule_i)
    rule_j_str = convert_rule_to_str(rule_j)

    intersection = len(list(set(rule_i_str).intersection(rule_j_str)))
    union = len(list(set(rule_i_str).union(rule_j_str)))
    return float( intersection / union ) 

def get_similar_rules(rule_i, all_rules):
    similar_rules = []
    for rule_j in all_rules:
        # Jaccard similarity
        jaccard_sim = jaccard_similarity(rule_i, rule_j)
        if jaccard_sim > 0.5:
            similar_rules.append(rule_j)

    return similar_rules

def fn_refine_policy(fn_rules, all_rules):
    new_rules = all_rules
    for rule in fn_rules:
        
        similar_rules = get_similar_rules(rule, all_rules)

        if len(similar_rules) == 0:
            new_rules.append(rule)
        else:
            for sim_rule in similar_rules:
                rule_str = convert_rule_to_str(rule)
                rule_str_sim = convert_rule_to_str(sim_rule)

                new_filter = rule_str_sim.difference((rule_str_sim.difference(rule_str)))
                print(new_filter)
                new_list_filter = convert_set_to_list(new_filter)
                idx_to_del = new_rules.index(sim_rule)
                del new_rules[idx_to_del]
                new_rules.append(new_list_filter)
    
    return new_rules
                

def fp_refine_policy(fp_rules, all_rules):
    new_rules = all_rules
    for rule in fp_rules:
        
        similar_rules = get_similar_rules(rule, all_rules)

        if len(similar_rules) != 0:                    
            for sim_rule in similar_rules:
                rule_str = convert_rule_to_str(rule)
                rule_str_sim = convert_rule_to_str(sim_rule)

                new_filter = rule_str_sim.difference((rule_str_sim.difference(rule_str)))
                new_list_filter = convert_set_to_list(new_filter)
                idx_to_del = new_rules.index(sim_rule)
                del new_rules[idx_to_del]
                new_rules.append(new_list_filter)
    
    return new_rules

In [16]:
false_neg = pd.DataFrame(false_neg)

In [17]:
###Select the number of clusters###
num_clusters = 5

#DO NOT CHANGE THIS CODE
# seed = 29

#Compute centroids and labels
# num_init = 5
centroids = []
kmodes_huang = kmodes.KModes(n_clusters=num_clusters, init='Huang', verbose=0)
cluster_labels = kmodes_huang.fit_predict(false_neg)
centroids = kmodes_huang.cluster_centroids_

print('Ready!')    

Ready!


In [18]:
false_neg["cls"] = cluster_labels

In [19]:
df_test_2 = false_neg[['monitoring', 'fallback', 'weather', 'visibility', 'traffic_congestion',
       'driving_task_loa', 'vehicle_loa', 'region_loa', 'cls']]

In [20]:
pos_attr_thr = 0.3
neg_attr_thr = 0.2
pos_rel_thr = 0.2
neg_rel_thr = 0.2
test = rule_inference(df_test_2, pos_attr_thr, neg_attr_thr, pos_rel_thr, neg_attr_thr)
len(test)

5

In [21]:
only_rules_2 = []
for rule in test:
    only_rules_2.append(rule[1][0])
len(only_rules_2)

5

In [22]:
# Compute how many rules has a lenght equal to 1.
for idx, rule in enumerate(only_rules_2):
    if len(rule) < 2:
        print(idx)

In [23]:
new_rules = fn_refine_policy(only_rules_2, only_rules)
len(new_rules)

{'driving_task_loa*40004', 'driving_task_loa*-40104', 'region_loa*-60506'}
{'driving_task_loa*40004', 'driving_task_loa*-40104', 'region_loa*-60506'}
{'driving_task_loa*40004', 'driving_task_loa*-40104', 'region_loa*60306', 'region_loa*-60506'}


24

In [27]:
false_neg  = []
for i,row in positive_cav_train.iterrows():
    
    # Evaluación
    denies_count = 0    
    for rule in new_rules:                                      
        # En esta parte se evalua la regla completa
        res = True
        
        #for idx_r, attr_val in enumerate(rule):
        for attr_val in rule:            
            if attr_val[1] < 0:
                if row[attr_val[0]] == attr_val[1]*-1:
                    res = False
                    break
            else:
                if row[attr_val[0]] != attr_val[1]:
                    res = False
                    break
        if res == False:
            denies_count += 1
    
    if denies_count == len(new_rules):
        false_neg.append(row)
        #print("FP-2")

FN = len(false_neg)
print("Tasa FN: {:.2f}".format((FN/ len(positive_cav_train))*100))
print("FN: ", FN, " de ", len(positive_cav_train))

Tasa FN: 8.68
FN:  8262  de  95180


In [28]:
false_pos  = []
for i,row in negative_cav_train.iterrows():
    # Evaluación
    denies_count = 0
    temp_rules_n = 0
    for rule in new_rules:                                      
        # En esta parte se evalua la regla completa
        res = True                        
        for attr_val in rule:
            if attr_val[1] < 0:
                if row[attr_val[0]] == attr_val[1]*-1:
                    res = False
                    break
            else:
                if row[attr_val[0]] != attr_val[1]:
                    res = False
                    break
        if res == False:
            denies_count += 1                                
    #print("XXX-", denies_count, temp_rules_n, res)
    if denies_count < len(new_rules):
        false_pos.append(row)
        #print("FP-2")    
    #else:
    #    print("ENtra PAPA")
FP = len(false_pos)
print("Tasa FP: {:.2f}".format((FP/ len(negative_cav_train))*100))
print("FN: ", FP, " de ", len(negative_cav_train))

Tasa FP: 47.34
FN:  45679  de  96484


In [29]:
TP = len(positive_cav_train) - FN
#TP = 50 - FN
TN = len(negative_cav_train) - FP
#TN = 50 - FP

precision = TP / (TP + FP)

recall = TP / (TP + FN)

fscore = 2*(precision*recall)/(precision+recall)

print("FN:", FN, " - {:.2f}".format((FN/len(positive_cav_train))*100))
#print("FN:", FN, " - {:.2f}".format((FN/50)*100))
print("FP:", FP, " - {:.2f}".format((FP/len(negative_cav_train))*100))
#print("FP:", FP, " - {:.2f}".format((FP/50)*100))
print("Precision:", precision)
print("Recall:", recall)
print("F-score", fscore)

def compute_wsc(policy):
    return sum([len(rule) for rule in policy])

print("# Rules:", len(only_rules))
print("WSC:", compute_wsc(only_rules))

FN: 8262  - 8.68
FP: 45679  - 47.34
Precision: 0.6555050265088954
Recall: 0.91319604959025
F-score 0.7631850450221049
# Rules: 24
WSC: 98
