# 01 HC Resource Threshold
-------------
In this notebook, I'll analyze the number of access requests to all resources.
The goal is identify what is the minimun of access requests that each resource
must to have to consider it important.



In [1]:
# Libraries
import numpy as np
import pandas as pd
import igraph as ig
import networkx as nx
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from auxiliar_funcs import *
import matplotlib.pyplot as plt
from collections import Counter
from aux_network import build_network_model, bipartite_projection
from aux_plot import calculate_log_binning
from aux_coms import sub_community_detection, add_type_commts
from math import log2, ceil

### Load Data

In [2]:
# Re-order columns by user attributes and resources attributes
user_attr = ['role', 'specialty', 'team', 'uward', 'agentfor']
rsrc_attr = ['type', 'patient', 'treatingteam', 'oward', 'author', 'topic']
print("# User attr:", len(user_attr))
print("# Rsrc attr:", len(rsrc_attr))
print()

##### Load positive access log. #####
url_file = "../00-Data/HC-AccessLog.csv"
df = pd.read_csv(url_file)
df = df[df.columns[:-1]].drop_duplicates()
print("***** POSITIVE HC *****")
print("Columns: ", df.columns)
print("Length: ", len(df))
print(df.info())
print(df.head(5))
print(); print()

# User attr: 5
# Rsrc attr: 6

***** POSITIVE HC *****
Columns:  Index(['action', 'role', 'type', 'oward', 'uward', 'team', 'treatingteam',
       'patient', 'author', 'topic', 'specialty', 'agentfor', 'user'],
      dtype='object')
Length:  17888
<class 'pandas.core.frame.DataFrame'>
Int64Index: 17888 entries, 0 to 17999
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   action        17888 non-null  object
 1   role          17888 non-null  object
 2   type          17888 non-null  object
 3   oward         17888 non-null  object
 4   uward         17888 non-null  object
 5   team          17888 non-null  object
 6   treatingteam  17888 non-null  object
 7   patient       17888 non-null  object
 8   author        17888 non-null  object
 9   topic         17888 non-null  object
 10  specialty     17888 non-null  object
 11  agentfor      17888 non-null  object
 12  user          17888 non-null  object
dtypes: objec

In [11]:
df.columns[1:-1]

Index(['role', 'type', 'oward', 'uward', 'team', 'treatingteam', 'patient',
       'author', 'topic', 'specialty', 'agentfor'],
      dtype='object')

In [14]:
sum_values = 0
for col in df.columns:
    sum_values += len(df[col].drop_duplicates())
sum_values

77

In [2]:
# Re-order columns by user attributes and resources attributes
user_attr = ['role', 'specialty', 'team', 'uward', 'agentfor']
rsrc_attr = ['type', 'patient', 'treatingteam', 'oward', 'author', 'topic']
print("# User attr:", len(user_attr))
print("# Rsrc attr:", len(rsrc_attr))
print()

##### Load positive access log. #####
url_file = "../00-Data/hc-positive.csv"
df = pd.read_csv(url_file)
df = df[df.columns[:-1]].drop_duplicates()
print("***** POSITIVE HC *****")
print("Columns: ", df.columns)
print("Length: ", len(df))
print(df.info())
print(df.head(5))
print(); print()

##### Load positive access log. #####
url_file = "../00-Data/hc-negative.csv"
df_neg = pd.read_csv(url_file)
df_neg = df_neg[df_neg.columns[:-2]].drop_duplicates()
df_neg = df_neg.replace("?", "none")
df_neg["action"] = 0
print("***** NEGATIVE HC *****")
print("Columns: ", df_neg.columns)
print("Length: ", len(df_neg))
print(df_neg.info())
print(df_neg.head(5))
print(); print()

##### Concatenar solicitudes + y - #####
df_total = pd.concat([df, df_neg])
print("Columns: ", df_total.columns)
print("Length: ", len(df_total))
print()

##### Calcular porcentaje de solicitudes + y - #####
print("% Solicitudes (+): {:.2f}".format((len(df)/len(df_total))*100))
print("% Solicitudes (-): {:.2f}".format((len(df_neg)/len(df_total))*100))
print(); print()

# Cross-Validation
k = 10
test_size = 0.2
kfold = StratifiedShuffleSplit(n_splits=k, test_size=test_size, random_state=1)

data_partition = kfold.split(df_total, df_total.action)
data_corpus = [] # Lista donde se almacenan los k fols

for train_data, test_data in data_partition:        
    X_train, X_test = df_total.iloc[train_data], df_total.iloc[test_data]
    data_corpus.append([X_train, X_test])

print("Hecho!")
print("- k =",k)
print("- Porcentaje Train-Test:", (1-test_size)*100, "-", test_size*100)

print("|U|:", len(df[user_attr].drop_duplicates()))
print("|R|:", len(df[rsrc_attr].drop_duplicates()))

# User attr: 5
# Rsrc attr: 6

***** POSITIVE HC *****
Columns:  Index(['action', 'role', 'type', 'oward', 'uward', 'team', 'treatingteam',
       'patient', 'author', 'topic', 'specialty', 'agentfor'],
      dtype='object')
Length:  8735
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8735 entries, 0 to 8998
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   action        8735 non-null   int64 
 1   role          8735 non-null   object
 2   type          8735 non-null   object
 3   oward         8735 non-null   object
 4   uward         8735 non-null   object
 5   team          8735 non-null   object
 6   treatingteam  8735 non-null   object
 7   patient       8735 non-null   object
 8   author        8735 non-null   object
 9   topic         8735 non-null   object
 10  specialty     8735 non-null   object
 11  agentfor      8735 non-null   object
dtypes: int64(1), object(11)
memory usage: 887.1+ KB
None
   actio

### Data pre-processing

In [3]:
id_kfold = 0

df_train_k, df_test_k = data_corpus[id_kfold][0], data_corpus[id_kfold][1]
print("# Solicitudes Train:", len(df_train_k), " %: {:.2f}".format((len(df_train_k)/(len(df_train_k)+len(df_test_k)))*100))
print("# Solicitudes Test:", len(df_test_k), " %: {:.2f}".format((len(df_test_k)/(len(df_train_k)+len(df_test_k)))*100))
print("# Solicitudes:", len(df_train_k)+len(df_test_k))
print()

n_rsrcs = len(df_train_k[rsrc_attr].drop_duplicates())

##### ***** TAREA 1: Tratar valores desconocidos y nulos ***** #####
print("Tarea 1: Tratar valores desconocidos y nulos.")
# Cambiar valores string a numeros
mapping = {"none": 10, "doctor": 11, "nurse": 12} # role
df_train_k["role"] = df_train_k["role"].replace(mapping)
df_test_k["role"] = df_test_k["role"].replace(mapping)

mapping = {"note": 110, "cardiology": 111, "nursing": 112, "oncology": 113, "none": 114} # speacialty
df_train_k["specialty"] = df_train_k["specialty"].replace(mapping)
df_test_k["specialty"] = df_test_k["specialty"].replace(mapping)

mapping = {"oncteam1": 1101, "carteam1": 1111,
           "carteam2": 1121, "oncteam2": 1131, "none": 1141} # tem
df_train_k["team"] = df_train_k["team"].replace(mapping)
df_test_k["team"] = df_test_k["team"].replace(mapping)

mapping = {"carward": 11011, "oncward": 11111, "none": 11211} # uward
df_train_k["uward"] = df_train_k["uward"].replace(mapping)
df_test_k["uward"] = df_test_k["uward"].replace(mapping)

mapping = {"oncpat1": 111011, "carpat1": 111111, # agentfor
           "oncpat2": 111211, "carpat2": 111311, "none": 111411}
df_train_k["agentfor"] = df_train_k["agentfor"].replace(mapping)
df_test_k["agentfor"] = df_test_k["agentfor"].replace(mapping)

mapping = {"hr": 1110111, "hritem": 1111111} # type
df_train_k["type"] = df_train_k["type"].replace(mapping)
df_test_k["type"] = df_test_k["type"].replace(mapping)

mapping = {"oncpat1": 211012, "carpat1": 211112, # patient
           "oncpat2": 211212, "carpat2": 211312, "none": 211412}
df_train_k["patient"] = df_train_k["patient"].replace(mapping)
df_test_k["patient"] = df_test_k["patient"].replace(mapping)

mapping = {"oncteam1": 2102, "carteam1": 2112,
           "carteam2": 2122, "oncteam2": 2132, "none": 2142} # treatingteam
df_train_k["treatingteam"] = df_train_k["treatingteam"].replace(mapping)
df_test_k["treatingteam"] = df_test_k["treatingteam"].replace(mapping)

mapping = {"carward": 21012, "oncward": 21112, "none": 21212} # oward
df_train_k["oward"] = df_train_k["oward"].replace(mapping)
df_test_k["oward"] = df_test_k["oward"].replace(mapping)

mapping = {"note": 210, "cardiology": 211, "nursing": 212, "oncology": 213, "none": 214} # topic
df_train_k["topic"] = df_train_k["topic"].replace(mapping)
df_test_k["topic"] = df_test_k["topic"].replace(mapping)

mapping = {"oncdoc2": 11110111, "carnurse1": 11111111, "oncnurse2": 11112111, # author
           "carnurse2": 11113111, "oncdoc1": 11114111, "oncnurse1": 11115111, "none": 11116111}
df_train_k["author"] = df_train_k["author"].replace(mapping)
df_test_k["author"] = df_test_k["author"].replace(mapping)
print("Tarea 1: Hecha!.")
print()

##### ***** TAREA 2: Convertir valores continuas a categoricos ***** #####
print("Tarea 2: No aplica!.")
print()

##### ***** TAREA 3: Eliminar solicitudes duplicadas ***** #####
##### Dataframe solicitudes positivas y negativas #####
print("Tarea 3: Eliminar solicitudes duplicadas.")
df_train_k_pos = df_train_k[df_train_k.action==1]   # Train Pos
df_train_k_neg = df_train_k[df_train_k.action==0]   # Train Neg
df_test_k_pos = df_test_k[df_test_k.action==1]      # Test Pos
df_test_k_neg = df_test_k[df_test_k.action==0]      # Test Neg
df_train_k_pos = df_train_k_pos[df_train_k_pos.columns[1:]].drop_duplicates()
df_train_k_neg = df_train_k_neg[df_train_k_neg.columns[1:]].drop_duplicates()
df_test_k_pos = df_test_k_pos[df_test_k_pos.columns[1:]].drop_duplicates()
df_test_k_neg = df_test_k_neg[df_test_k_neg.columns[1:]].drop_duplicates()
print()

##### Agregar la columna de usuarios y recursos #####
user_dict = get_user_res(df_train_k_pos, user_attr, True)
res_dict = get_user_res(df_train_k_pos, rsrc_attr, False)
df_train_k_pos = add_col(df_train_k_pos, user_dict, user_attr, "USRID")
df_train_k_pos = add_col(df_train_k_pos, res_dict, rsrc_attr, "RESID")
df_train_k_neg = add_col(df_train_k_neg, user_dict, user_attr, "USRID")
df_train_k_neg = add_col(df_train_k_neg, res_dict, rsrc_attr, "RESID")
df_test_k_pos =  add_col(df_test_k_pos, user_dict, user_attr, "USRID")
df_test_k_pos =  add_col(df_test_k_pos, res_dict, user_attr, "RESID")
df_test_k_neg = add_col(df_test_k_neg, user_dict, user_attr, "USRID")
df_test_k_neg = add_col(df_test_k_neg, res_dict, rsrc_attr, "RESID")

print("# Solicitudes Train (+):", len(df_train_k_pos), " %: {:.2f}".format((len(df_train_k_pos)/len(df_train_k))*100))
print("# Solicitudes Train (-):", len(df_train_k_neg), " %: {:.2f}".format((len(df_train_k_neg)/len(df_train_k))*100))
print("# Solicitudes Test (+):", len(df_test_k_pos), " %: {:.2f}".format((len(df_test_k_pos)/len(df_test_k))*100))
print("# Solicitudes Test (-):", len(df_test_k_neg), " %: {:.2f}".format((len(df_test_k_neg)/len(df_test_k))*100))
print("# Train Users (+): ", len(df_train_k_pos.USRID.drop_duplicates()))
print("# Train Resrc (+): ", len(df_train_k_pos.RESID.drop_duplicates()))
print("# Train Users (-): ", len(df_train_k_neg.USRID.drop_duplicates()))
print("# Train Resrc (-): ", len(df_train_k_neg.RESID.drop_duplicates()))
print("# Test Users (+): ", len(df_test_k_pos.USRID.drop_duplicates()))
print("# Test Resrc (+): ", len(df_test_k_pos.RESID.drop_duplicates()))
print("# Test Users (-): ", len(df_test_k_neg.USRID.drop_duplicates()))
print("# Test Resrc (-): ", len(df_test_k_neg.RESID.drop_duplicates()))

task4 = False
if task4:
    # Filter resources
    n1 = 0
    n2 = 210
    top_list = df_train_k_pos.RESID.value_counts()[:n_rsrcs].index.tolist()
    # Filter the interval between n1 and n2
    top_list = top_list[n1:n2+1]
    print('#Filtered resources:', len(top_list))

    # B_II Data splitting
    boolean_series = df_train_k_pos.RESID.isin(top_list)
    df_train_k_pos = df_train_k_pos[boolean_series]
    bolean_series = df_train_k_neg.RESID.isin(top_list)
    df_train_k_neg = df_train_k_neg[bolean_series]

### Create the user and resources dataframes with their attributes.
df_user_attrs = df_train_k_pos[user_attr+["USRID"]].drop_duplicates()
df_rscs_attrs = df_train_k_pos[rsrc_attr+["RESID"]].drop_duplicates()

# Solicitudes Train: 14186  %: 80.00
# Solicitudes Test: 3547  %: 20.00
# Solicitudes: 17733

Tarea 1: Tratar valores desconocidos y nulos.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in

Tarea 1: Hecha!.

Tarea 2: No aplica!.

Tarea 3: Eliminar solicitudes duplicadas.

# Solicitudes Train (+): 6988  %: 49.26
# Solicitudes Train (-): 7197  %: 50.73
# Solicitudes Test (+): 1747  %: 49.25
# Solicitudes Test (-): 1800  %: 50.75
# Train Users (+):  1046
# Train Resrc (+):  2090
# Train Users (-):  1045
# Train Resrc (-):  1239
# Test Users (+):  748
# Test Resrc (+):  1
# Test Users (-):  826
# Test Resrc (-):  407


In [4]:
bip_network = build_network_model(df_train_k_pos, 'USRID', 'RESID',
                                  df_user_attrs, df_rscs_attrs)

IGRAPH UN-- 3136 6988 -- 
+ attr: name (v)
<igraph.VertexSeq object at 0x0000022B14F2DB88>


ARBN builded!
IGRAPH UN-- 3136 6988 -- 
+ attr: name (v), typen (v)
|U-Nodes| = 1046
|R-Nodes| = 2090


In [6]:
user_network = bipartite_projection(bip_network, 0)
print(user_network.summary())

IGRAPH UNW- 1032 19509 -- 
+ attr: name (v), rsrcs (v), weight (e)


In [None]:
### Add features to the user network

user_data_attrs = df_train_k_pos[user_attr+["USRID"]].drop_duplicates()
#user_data_attrs[user_data_attrs.USRID == 101]
role_attr = []
specialty_attr = []
team_attr = []
uward_attr = []
agentfor_attr = []
for user in user_network.vs():
    user_row = user_data_attrs[user_data_attrs.USRID == user['name']]    
    role_attr.append(user_row['role'].values[0])
    specialty_attr.append(user_row['specialty'].values[0])
    team_attr.append(user_row['team'].values[0])
    uward_attr.append(user_row['uward'].values[0])
    agentfor_attr.append(user_row['agentfor'].values[0])
    
user_network.vs['role'] = role_attr
user_network.vs['specialty'] = specialty_attr
user_network.vs['team'] = team_attr
user_network.vs['uward'] = uward_attr
user_network.vs['agentfor'] = agentfor_attr
user_network.summary()