In [1]:
# Mount Google Drive inside the Colab runtime so the project files are
# reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd drive/My\ Drive/Tesis

/content/drive/My Drive/Tesis


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Seed value kept as a notebook-level constant.
# NOTE(review): no visible cell consumes it yet — confirm it is passed on to
# whatever stochastic step needs it.
random_state = 42

# Render every matplotlib figure with the dark theme.
plt.style.use('dark_background')

#### Leer los datos

In [4]:
# Read the raw visit table (a cell holding a single blank counts as missing),
# give the outcome and adherence columns their working names, and discard the
# columns that are not used further on.
tesis = (
    pd.read_csv("Datos/Datos_Tesis_Septiembre2020.csv", na_values=' ')
    .rename(columns={"TAS_noe": "TAS", "Morisky": "Adherencia"})
    .drop(
        columns=[
            "DSCVisita", "fechavisita", "fecha_visitabasal", "IMC_25", "TA_ALTA",
            "Cintura", "Cadera", "IMC_30", "ICC_alto", "Tabaquismo", "ant_HTA_trat",
            "n_drogas_actual_categ",
        ]
    )
)

# Recode to 0/1 indicators: Sexo goes from codes 1/2 to 1/0, and Fuma merges
# codes 1 and 2 into a single 1. Any value outside the mapping becomes NaN.
# NOTE(review): the meaning of each original code is not visible here — confirm
# against the data dictionary.
tesis = tesis.assign(
    Sexo=tesis["Sexo"].map({1: 1, 2: 0}),
    Fuma=tesis["Fuma"].map({0: 0, 1: 1, 2: 1}),
)

In [5]:
# Preview the first rows to check the renamed and recoded columns.
tesis.head()

Unnamed: 0,idPaciente,tpo_programa,TAS,Adherencia,Peso,Altura,IMC,DBT,Sexo,Edad,Fuma,ant_HTA,tas_basal,ICC
0,4017,0,106,1,72.0,154.0,30.0,1.0,0,51.0,0.0,1,139,0.943396
1,4017,1,116,1,72.0,154.0,30.0,1.0,0,51.0,0.0,1,139,0.943396
2,4017,2,123,1,72.0,154.0,30.0,1.0,0,51.0,0.0,1,139,0.943396
3,4017,3,130,1,72.0,154.0,30.0,1.0,0,51.0,0.0,1,139,0.943396
4,4017,4,121,1,72.0,154.0,30.0,1.0,0,51.0,0.0,1,139,0.943396


#### Valores faltantes

In [6]:
# Number of missing values in each column.
tesis.isnull().sum()

idPaciente        0
tpo_programa      0
TAS               0
Adherencia        0
Peso            252
Altura          256
IMC             260
DBT             215
Sexo              0
Edad            455
Fuma            215
ant_HTA           0
tas_basal         0
ICC             874
dtype: int64

#### Estadísticas básicas covariables continuas

In [7]:
# Summary statistics for every numeric column (this includes the 0/1 indicators).
tesis.describe()

Unnamed: 0,idPaciente,tpo_programa,TAS,Adherencia,Peso,Altura,IMC,DBT,Sexo,Edad,Fuma,ant_HTA,tas_basal,ICC
count,8592.0,8592.0,8592.0,8592.0,8340.0,8336.0,8332.0,8377.0,8592.0,8137.0,8377.0,8592.0,8592.0,7718.0
mean,16805.614758,5.738245,132.418645,0.842993,85.795084,163.854726,31.455833,0.147547,0.489525,59.048789,0.4695,0.90014,148.743017,0.944068
std,2012.288717,4.678866,14.207203,0.363828,17.413438,10.184401,5.494585,0.354671,0.499919,10.165842,0.499099,0.299831,19.307981,0.077841
min,4017.0,0.0,100.0,0.0,43.0,0.0,16.0,0.0,0.0,20.0,0.0,0.0,100.0,0.588832
25%,16629.5,2.0,123.0,1.0,73.0,157.0,27.0,0.0,0.0,53.0,0.0,1.0,135.0,0.894231
50%,17168.0,5.0,132.0,1.0,85.0,164.0,31.0,0.0,0.0,59.0,0.0,1.0,147.0,0.944954
75%,17664.0,9.0,139.0,1.0,97.0,171.0,35.0,0.0,1.0,66.0,1.0,1.0,161.0,1.0
max,18666.0,24.0,198.0,1.0,180.0,191.0,54.0,1.0,1.0,88.0,1.0,1.0,198.0,1.191667


Altura tiene un valor mínimo de 0, lo cual es imposible; por lo tanto, voy a reemplazar esos valores por NaN para tratarlos como datos faltantes.

In [8]:
# A height of 0 is not a real measurement: blank those entries out so they are
# treated as missing values from here on.
tesis["Altura"] = tesis["Altura"].mask(tesis["Altura"].eq(0), np.nan)

In [9]:
# Re-check the summary after replacing zero heights with NaN
# (the minimum of Altura should no longer be 0).
tesis.describe()

Unnamed: 0,idPaciente,tpo_programa,TAS,Adherencia,Peso,Altura,IMC,DBT,Sexo,Edad,Fuma,ant_HTA,tas_basal,ICC
count,8592.0,8592.0,8592.0,8592.0,8340.0,8332.0,8332.0,8377.0,8592.0,8137.0,8377.0,8592.0,8592.0,7718.0
mean,16805.614758,5.738245,132.418645,0.842993,85.795084,163.933389,31.455833,0.147547,0.489525,59.048789,0.4695,0.90014,148.743017,0.944068
std,2012.288717,4.678866,14.207203,0.363828,17.413438,9.532828,5.494585,0.354671,0.499919,10.165842,0.499099,0.299831,19.307981,0.077841
min,4017.0,0.0,100.0,0.0,43.0,138.0,16.0,0.0,0.0,20.0,0.0,0.0,100.0,0.588832
25%,16629.5,2.0,123.0,1.0,73.0,157.0,27.0,0.0,0.0,53.0,0.0,1.0,135.0,0.894231
50%,17168.0,5.0,132.0,1.0,85.0,164.0,31.0,0.0,0.0,59.0,0.0,1.0,147.0,0.944954
75%,17664.0,9.0,139.0,1.0,97.0,171.0,35.0,0.0,1.0,66.0,1.0,1.0,161.0,1.0
max,18666.0,24.0,198.0,1.0,180.0,191.0,54.0,1.0,1.0,88.0,1.0,1.0,198.0,1.191667


#### Tratamiento de la variable Adherencia (Morisky)

In [10]:
# Adherencia_Acumulada: running mean of Adherencia within each patient, i.e. the
# adherence performance index up to time t.
# The visits are ordered by patient and program time before the expanding mean
# (presumably tpo_programa is the visit time index — confirm), and the result is
# written back by row index rather than as a positional list: groupby returns
# its groups sorted by key, so a plain list only lines up with the frame when
# the rows already happen to be sorted by idPaciente.
# Relies on the row index of `tesis` being unique.
tesis["Adherencia_Acumulada"] = (
    tesis.sort_values(["idPaciente", "tpo_programa"])
    .groupby("idPaciente")["Adherencia"]
    .expanding()
    .mean()
    .reset_index(level=0, drop=True)
)

# Adherencia_Total: time-invariant covariate holding each patient's mean
# Adherencia over all of their visits, repeated on every row of that patient.
tesis["Adherencia_Total"] = tesis.groupby("idPaciente")["Adherencia"].transform("mean")

#### Crear covariable TAS_Media_Acumulada (TAS media hasta el tiempo actual)

In [11]:
# TAS_Media_Acumulada: running mean of TAS within each patient up to time t.
# Rows are ordered by patient and program time before the expanding mean
# (presumably tpo_programa is the visit time index — confirm), and the result is
# aligned back on the row index instead of being assigned as a positional list,
# which would only be correct if the frame were already sorted by idPaciente
# (groupby returns its groups sorted by key).
# Relies on the row index of `tesis` being unique.
tesis["TAS_Media_Acumulada"] = (
    tesis.sort_values(["idPaciente", "tpo_programa"])
    .groupby("idPaciente")["TAS"]
    .expanding()
    .mean()
    .reset_index(level=0, drop=True)
)

#### Acotar el dataset

In [12]:
# Follow-up horizon, in the units of tpo_programa (presumably months — confirm).
mes = 6

# Drop every visit recorded after the horizon.
tesis = tesis.loc[tesis["tpo_programa"].le(mes)]

# Keep only the patients whose latest remaining visit falls exactly on the
# horizon, i.e. those who have a record at tpo_programa == mes.
mask = tesis.groupby("idPaciente")["tpo_programa"].max().eq(mes)
ids = mask.index[mask.to_numpy()]
tesis = tesis.loc[tesis["idPaciente"].isin(ids)]

#### Guardar dataset limpio

In [13]:
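# NOTE(review): disabled subsampling experiment, kept for reference only. It
# refers to an "Adherencia_Perf" column that no visible cell creates (the
# per-patient column built above is "Adherencia_Total"), so it must be updated
# before being re-enabled.
# mask = tesis.groupby("idPaciente")["Adherencia_Perf", "TAS"].mean()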
# mask = mask[mask["Adherencia_Perf"] == 1]
# mask.sort_values("TAS", inplace=True, ascending=False)
# mask.reset_index(inplace=True)
# ids_to_remove = mask["idPaciente"].tolist()[:int(len(mask)/16*12)]
# tesis = tesis[~tesis["idPaciente"].isin(ids_to_remove)]
# ids_to_remove = mask["idPaciente"].tolist()[int(len(mask)/16*13):]
# tesis = tesis[~tesis["idPaciente"].isin(ids_to_remove)]

In [14]:
# Mean TAS at program time 0, broken down by each patient's overall adherence.
(
    tesis.loc[tesis["tpo_programa"].eq(0)]
    .groupby("Adherencia_Total")["TAS"]
    .mean()
)

Adherencia_Total
0.000000    137.000000
0.100000    134.000000
0.142857    103.000000
0.166667    163.000000
0.200000    130.000000
               ...    
0.950000    138.500000
0.952381    140.000000
0.954545    135.666667
0.956522    135.000000
1.000000    133.061983
Name: TAS, Length: 91, dtype: float64

In [15]:
# Number of patients at each overall-adherence level.
# Adherencia_Total is already constant within a patient, so the stored value is
# taken with first() instead of being averaged again: re-averaging repeated
# floats can shift the last bits and split one level into several bins.
tesis.groupby("idPaciente")["Adherencia_Total"].first().value_counts()

1.000000    242
0.857143     13
0.866667     13
0.750000     12
0.666667     12
           ... 
0.956522      1
0.952381      1
0.352941      1
0.533333      1
0.473684      1
Name: Adherencia_Total, Length: 91, dtype: int64

In [16]:
# Write the cleaned, horizon-restricted table to disk without the row index.
tesis.to_csv('Datos/tesis_final.csv', index=False)