# Phase 1: *Data Preprocessing*

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

## Auxiliar functions

In [7]:
file_path = "../data/HC-ALL-positive_ar.csv"
acc_log = pd.read_csv(file_path)
acc_log = acc_log[acc_log.columns[1:]]
acc_log["ACTION"] = 1

print("### *** Access Log loaded *** ###")
print(acc_log.info())

  acc_log = pd.read_csv(file_path)


### *** Access Log loaded *** ###
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686400 entries, 0 to 686399
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   position     499200 non-null  object
 1   uward        499200 non-null  object
 2   specialties  572000 non-null  object
 3   teams        599040 non-null  object
 4   agentfor     524160 non-null  object
 5   uname        686400 non-null  int64 
 6   type         686400 non-null  object
 7   author       686400 non-null  object
 8   patient      686400 non-null  object
 9   topics       686400 non-null  object
 10  tratingTeam  686400 non-null  object
 11  rward        686400 non-null  object
 12  rname        686400 non-null  int64 
 13  ACTION       686400 non-null  int64 
dtypes: int64(3), object(11)
memory usage: 73.3+ MB
None


## 1 Data Splitting

### Auxiliar

In [11]:
def apply_cross_validation(acc_log, k, testSize, randomState=None):
    """Get all indices to split data."""
    if randomState != None:
        kfold = StratifiedShuffleSplit(n_splits=k, test_size=testSize,
                                       random_state=randomState)
    else:
        kfold = StratifiedShuffleSplit(n_splits=k, test_size=testSize)

    data_partition = kfold.split(acc_log, acc_log.ACTION)
    data_corpus = []  # List with all data partitions
    for train_data, test_data in data_partition:
        X_train, X_test = acc_log.iloc[train_data], acc_log.iloc[test_data]
        data_corpus.append([X_train, X_test])

    print("Cross-Validation - DONE")
    print("- k =", k)
    print("- Percentage Train-Test:", (1-testSize)*100, "-", testSize*100, "\n")

    return data_corpus

### Main

In [15]:
cross_validation_settings = {
    "k": 10,
    "test_size": 0.2,
    "random_state": 1
}

data_corpus = apply_cross_validation(acc_log, cross_validation_settings["k"],
                                     cross_validation_settings["test_size"],
                                     cross_validation_settings["random_state"])


# Selection of a split with the id_k ID.
id_k = 0
train_acc_log = data_corpus[id_k][0]
test_acc_log = data_corpus[id_k][1]

print("# Access requests in Train:", len(train_acc_log),
        " %: {:.2f}".format((len(train_acc_log)/(len(train_acc_log)+len(test_acc_log)))*100))
print("# Access requests in Test:", len(test_acc_log),
        " %: {:.2f}".format((len(test_acc_log)/(len(train_acc_log)+len(test_acc_log)))*100))
print("# Access requests:", len(train_acc_log)+len(test_acc_log))

del id_k, cross_validation_settings

Cross-Validation - DONE
- k = 10
- Percentage Train-Test: 80.0 - 20.0 

# Access requests in Train: 549120  %: 80.00
# Access requests in Test: 137280  %: 20.00
# Access requests: 686400


# 2 User and resource identification

## 2 Missing and null data handle

### Auxiliar

In [None]:
def data_cleaning(acc_log, freq_th):
    """Missing and null data handle. Addtionally selection of more frequent
    Access Requests."""
    
    # Loop over column names.
    for col in acc_log.columns:
        

In [36]:
print(acc_log["position"].value_counts(dropna=False).items)

<bound method Series.items of position
nurse     312000
doctor    187200
NaN       187200
Name: count, dtype: int64>


In [37]:
for col in acc_log.columns:
    print(acc_log[col].value_counts(dropna=False))

position
nurse     312000
doctor    187200
NaN       187200
Name: count, dtype: int64
uward
oncward    249600
carward    249600
NaN        187200
Name: count, dtype: int64
specialties
NaN               114400
oncology          114400
cardiology        114400
anesthesiology    114400
pediatrics        114400
neurology         114400
Name: count, dtype: int64
teams
oncTeam2    149760
oncTeam1    149760
carTeam1    149760
carTeam2    149760
NaN          87360
Name: count, dtype: int64
agentfor
oncPat2    262080
carPat2    262080
NaN        162240
Name: count, dtype: int64
uname
143    1495
52     1495
131    1495
83     1495
82     1495
       ... 
318     520
567     520
564     520
321     520
399     520
Name: count, Length: 768, dtype: int64
type
HR    686400
Name: count, dtype: int64
author
oncDoc1      52800
oncNurse2    52800
oncPat1      52800
none         52800
doc1         52800
oncNurse1    52800
oncAgent1    52800
carDoc2      52800
carNurse1    52800
carPat1      52800
doc2  

### Main