# Noisy Detection

In [3]:
### Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import igraph as ig
from sklearn.model_selection import StratifiedShuffleSplit

In [5]:
### User and resource attributes for each dataset

DS_ID = "IoT" # Availables => user_attributes.keys()
# OUT_FILE = open(DS_ID+".output", "w") # File to save the outputs

USER_ATTRS = {"IoT": ["role", "age", "health", "uname"],
              "AMZ": ["MGR_ID","ROLE_ROLLUP_1","ROLE_ROLLUP_2","ROLE_DEPTNAME","ROLE_TITLE","ROLE_FAMILY_DESC","ROLE_FAMILY","ROLE_CODE"],
              "HC": ["position", "uward", "specialties", "teams", "agentfor", "uname"],
              "UN": ["position","departmentu","crsTaken","crsTaught","isChair", "uname"],
              "PM": ["adminroles","projects","departmentu","projectsled","expertiseu","isemployee","task","uname"]}

RES_ATTRS = {"IoT": ["area", "mode", "temperature", "lockstatus", "rname"],
             "AMZ":["RID"],
             "HC": ["type", "author", "patient", "topics", "tratingTeam", "rward", "rname"],
             "UN": ["type","student","department","csr", "rname"],
             "PM": ["type","project","department","expertise","proprietary", "rname"]}

EDG_ATTRS = {"IoT": ["uname", "rname", "location", "timeday", "op", "ACTION"],
             "AMZ": ["uname", "rname"],
             "HC": ["uname", "rname"],
             "UN":["uname", "rname"],
             "PM":["uname", "rname"]}


In [6]:
### Load dataset file

filename = "../data/IOT-AL-universal.csv"

acc_log = pd.read_csv(filename)

# Remove first column (is the old index)
acc_log = acc_log[acc_log.columns[1:]]

# Change the uname string values to int values
acc_log = acc_log.replace(acc_log.uname.unique(), range(len(acc_log.uname.unique())))

# Change the resource id
acc_log["rname"] = acc_log["rname"] + max(acc_log.uname.unique()) + 1

print("Done!")
print(acc_log.head(5)) # Show the first five rows

# OUT_FILE.write("***** "+DS_ID+" Output information *****\n\n") # Title of the document

# Delete some variables
del filename

Done!
    role  age  health  uname         type        area   mode  temperature  \
0  child    3  health      0  Smart locks  livingroom  armed          -10   
1  child    3  health      0  Smart locks  livingroom  armed          -10   
2  child    3  health      0  Smart locks  livingroom  armed          -10   
3  child    3  health      0  Smart locks  livingroom  armed          -10   
4  child    3  health      0  Smart locks  livingroom  armed          -10   

  lockstatus  rname    location timeday       op  ACTION  
0     locked    120  livingroom     day   access       1  
1     locked    120  livingroom     day  control       1  
2     locked    120  livingroom     day      arm       1  
3     locked    120  livingroom  midday   access       1  
4     locked    120  livingroom  midday  control       1  


In [8]:
print("DENY=", len(acc_log[acc_log.ACTION == 0]), " \t PERMIT=",
      len(acc_log[acc_log.ACTION == 1]))

DENY= 1425600  	 PERMIT= 4276800


In [9]:
### Noisy Generation

def get_noisy_al(access_log, fraction, random_state=None, previous_al=None):
    """ Compute the noisy access log (NAL) version. """
    acc_log = access_log
    
    if isinstance(previous_al, pd.DataFrame): # idk
        acc_log = previous_al

    noysi_al = None
    
    if random_state != None:
        noisy_separation = StratifiedShuffleSplit(n_splits = 1,
                                                    train_size = fraction,
                                                    random_state = random_state)
    else:
        noisy_separation = StratifiedShuffleSplit(n_splits = 1,
                                                    train_size = fraction)
    
    noisy_acc_log_selection = noisy_separation.split(acc_log, acc_log.ACTION)
                    
    noisy_acc_req = None
    normal_acc_req = None
    for noisy_idx, normal_idx, in noisy_acc_log_selection:
        noisy_acc_req = acc_log.iloc[noisy_idx]
        normal_acc_req = acc_log.iloc[normal_idx]

    # Change the resolution in the noisy_selection dataset
    noisy_acc_req["ACTION"] = np.where(noisy_acc_req["ACTION"] == 0, 1, 0)

    noysi_al = pd.concat([noisy_acc_req, normal_acc_req])

    return noysi_al

In [10]:
test = get_noisy_al(acc_log, fraction=0.1)

print("DENY=", len(test[test.ACTION == 0]), " \t PERMIT=",
      len(test[test.ACTION == 1]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


DENY= 1710720  	 PERMIT= 3991680


In [32]:
# Edges attributes

edges_attributes = EDG_ATTRS[DS_ID]
#edges = acc_log[edges_attributes].drop_duplicates()
#edges = acc_log.value_counts(edges_attributes, normalize=True)
edges = test.value_counts(["uname", "rname", "ACTION"], normalize=True)
edges = edges.reset_index().rename(columns={0: 'weight'}).to_dict(orient='records')
# edges = edges.reset_index(drop=True)
edges = pd.DataFrame(edges)
edges = edges.rename(columns={'proportion': 'weight'})
#edges["weight"] = edges.value_counts(edges_attributes, normalize=True).values
#OUT_FILE.write("|E| = "+ str(len(edges))+ "\n")
edges.head()

Unnamed: 0,uname,rname,ACTION,weight
0,44,194,1,1.3e-05
1,111,194,1,1.3e-05
2,40,161,1,1.3e-05
3,98,161,1,1.3e-05
4,71,194,1,1.3e-05


In [33]:
edges

Unnamed: 0,uname,rname,ACTION,weight
0,44,194,1,1.315236e-05
1,111,194,1,1.297699e-05
2,40,161,1,1.280163e-05
3,98,161,1,1.280163e-05
4,71,194,1,1.280163e-05
...,...,...,...,...
236875,78,410,0,1.227553e-06
236876,13,368,0,1.052189e-06
236877,24,856,0,1.052189e-06
236878,61,121,0,8.768238e-07


In [34]:
edges[["uname", "rname"]].drop_duplicates()

Unnamed: 0,uname,rname
0,44,194
1,111,194
2,40,161
3,98,161
4,71,194
...,...,...
118761,104,664
118767,21,1058
118769,93,560
118776,108,303


In [35]:

def eliminar_repetidos_con_mayor_peso(df):
    # Ordenar el DataFrame por la columna "weight" en orden descendente
    df_ordenado = df.sort_values(by='weight', ascending=False)
    
    # Mantener la primera aparición de cada combinación única de "name" y "resource"
    df_sin_repetidos = df_ordenado.drop_duplicates(subset=['uname', 'rname'], keep='first')
    
    return df_sin_repetidos

In [36]:
azucar = eliminar_repetidos_con_mayor_peso(edges)
azucar

Unnamed: 0,uname,rname,ACTION,weight
0,44,194,1,0.000013
1,111,194,1,0.000013
2,40,161,1,0.000013
4,71,194,1,0.000013
5,15,194,1,0.000013
...,...,...,...,...
118730,41,281,1,0.000004
118793,96,152,0,0.000004
118792,21,1058,0,0.000004
118776,108,303,1,0.000004


In [37]:
azucar.ACTION.value_counts()

1    118438
0         2
Name: ACTION, dtype: int64

In [38]:
cols_comparar = ['uname', 'rname']
condicion = azucar[cols_comparar].isin(acc_log[cols_comparar])
condicion

Unnamed: 0,uname,rname
0,False,False
1,False,False
2,False,False
4,False,False
5,False,False
...,...,...
118730,False,False
118793,False,False
118792,False,False
118776,False,False


In [39]:
result = azucar[condicion]
result

Unnamed: 0,uname,rname,ACTION,weight
0,,,,
1,,,,
2,,,,
4,,,,
5,,,,
...,...,...,...,...
118730,,,,
118793,,,,
118792,,,,
118776,,,,


In [41]:
acc_log[cols_comparar+["ACTION"]]

Unnamed: 0,uname,rname,ACTION
0,0,120,1
1,0,120,1
2,0,120,1
3,0,120,1
4,0,120,1
...,...,...,...
5702395,119,1109,0
5702396,119,1109,0
5702397,119,1109,0
5702398,119,1109,0
