# Metodología de privatización

## Manipulación de datos

Primero se carga la base que nos suministro el ministerio de salud

In [1]:
import pickle
import pandas as pd

archivo_pickle = 'Entrenamiento_cronicas_N.pkl'


with open(archivo_pickle, 'rb') as file:
    df = pickle.load(file)


In [6]:
df.dtypes

EPS                                        object
NUM_IDE                                    object
FECHA_NACI                                 object
SEXO                                       object
DPTO                                       object
MUNI                                       object
DIAS_COMP                                  object
V8                                         object
id                                         object
Valor_Neto_Chunk                          float64
Valor_Neto                                float64
DIAS_ESTAN_Neto_Chunk                     float64
DIAS_ESTAN_Neto                           float64
Edad                                      float64
SIDA_VIH                                  float64
TUBERCULOSIS                              float64
CANCER_OTROSCANCER                        float64
CANCER_ORGANOSDIGESTIVOS                  float64
CANCER_ORGANOSRESPIRATORIOS               float64
CANCER_MELANOMAYDELAPIEL                  float64


In [44]:
import torch
import pandas as pd

class AlgorithmDP:
    def __init__(self, df, epsilon):
        self.df = df.copy()
        self.epsilon = epsilon
        self.privacy_preserving_instances = {}
    
    def add_privacy_instance(self, column, column_type):
        """
        Add a PrivacyPreserving instance for a specific column.
        
        Parameters:
        column (str): The column name to which privacy should be applied.
        column_type (str): The type of column, either 'categorical' or 'numerical'.
        """
        if column_type == 'categorical':
            self.privacy_preserving_instances[column] = PrivacyPreserving(self.df, column, self.epsilon)
    
    def apply_privacy_to_column(self, column, method_name):
        """
        Apply the selected privacy method to a specific column.

        Parameters:
        column (str): The column name to apply the privacy method.
        method_name (str): The name of the privacy method to use.
                           For categorical: "direct_encoding_privacy", "optimized_unary_encoding_privacy", "rappor_privacy".
                           For numerical: "duchi_solution", "piecewise_mechanism".

        Returns:
        pd.DataFrame: The dataframe with the specified column privatized.
        """
        if column in self.privacy_preserving_instances:
            # Handle categorical columns
            instance = self.privacy_preserving_instances[column]
            if method_name in ["direct_encoding_privacy", "optimized_unary_encoding_privacy", "rappor_privacy"]:
                self.df[column] = instance.apply_privacy_method(method_name)
            else:
                raise ValueError(f"Invalid method name {method_name} provided for categorical column {column}.")
        else:
            # Handle numerical columns
            if method_name == "duchi_solution":
                self.df[column] = AlgorithmDP.duchi_solution(self.df[column].values, self.epsilon)
            elif method_name == "piecewise_mechanism":
                self.df[column] = AlgorithmDP.piecewise_mechanism(self.df[column].values, self.epsilon)
            else:
                raise ValueError(f"Invalid method name {method_name} provided for numerical column {column}.")

        return self.df

    @staticmethod
    def duchi_solution(t_i_vector, epsilon):
        t_i_tensor = torch.tensor(t_i_vector, dtype=torch.float32)
        t_i_tensor = torch.clamp(t_i_tensor, -1, 1)
        e_epsilon = torch.exp(torch.tensor(epsilon, dtype=torch.float32))
        prob = (e_epsilon - 1) / (2 * e_epsilon + 2) * t_i_tensor + 0.5
        u = torch.bernoulli(prob.clone().detach())
        t_i_star = torch.where(u == 1, 
                               (e_epsilon + 1) / (e_epsilon - 1),
                               (1 - e_epsilon) / (e_epsilon + 1))
        return t_i_star.numpy()

    @staticmethod
    def piecewise_mechanism(t_i_vector, epsilon):
        t_i_tensor = torch.tensor(t_i_vector, dtype=torch.float32)
        t_i_tensor = torch.clamp(t_i_tensor, -1, 1)
        C = (torch.exp(torch.tensor(epsilon / 2)) + 1) / (torch.exp(torch.tensor(epsilon / 2)) - 1)
        def l(t_i): return (C + 1) / 2 * t_i - (C - 1) / 2
        def r(t_i): return l(t_i) + C - 1
        x = torch.rand(t_i_tensor.shape)
        threshold = torch.exp(torch.tensor(epsilon / 2)) / (torch.exp(torch.tensor(epsilon / 2)) + 1)
        t_i_star = torch.empty(t_i_tensor.shape, dtype=torch.float32)
        for i in range(t_i_tensor.shape[0]):
            l_val, r_val = l(t_i_tensor[i]), r(t_i_tensor[i])
            if x[i] < threshold:
                if l_val >= r_val: r_val = l_val + 1e-5
                t_i_star[i] = torch.distributions.Uniform(l_val, r_val).sample()
            else:
                if torch.rand(1) < 0.5:
                    if -C >= l_val: l_val = -C + 1e-5
                    t_i_star[i] = torch.distributions.Uniform(-C, l_val).sample()
                else:
                    if r_val >= C: r_val = C - 1e-5
                    t_i_star[i] = torch.distributions.Uniform(r_val, C).sample()
        return t_i_star.numpy()

class PrivacyPreserving:
    def __init__(self, df, column, epsilon):
        self.df = df.copy()
        self.column = column
        self.epsilon = epsilon
        self.categories = df[column].unique()
        self.category_to_index = {category: i for i, category in enumerate(self.categories)}
        self.index_to_category = {i: category for i, category in enumerate(self.categories)}
        self.d = len(self.categories)
    
    def apply_privacy_method(self, method_name):
        if method_name == "direct_encoding_privacy":
            return self.direct_encoding_privacy()[self.column]  # Return only the column
        elif method_name == "optimized_unary_encoding_privacy":
            return self.optimized_unary_encoding_privacy()[self.column]  # Return only the column
        elif method_name == "rappor_privacy":
            return self.rappor_privacy()[self.column]  # Return only the column
        else:
            raise ValueError(f"Invalid privacy method name: {method_name}")

    def direct_encoding_privacy(self):
        def direct_encoding(value, d, epsilon):
            p = np.exp(epsilon) / (np.exp(epsilon) + d - 1)
            q = 1 / (np.exp(epsilon) + d - 1)
            
            probabilities = np.full(d, q)
            probabilities[self.category_to_index[value]] = p
            privatized_index = np.random.choice(d, p=probabilities)
            return self.index_to_category[privatized_index]
        
        df_priv = self.df.copy()
        df_priv[self.column] = df_priv[self.column].apply(lambda x: direct_encoding(x, self.d, self.epsilon))
        return df_priv
    
    def optimized_unary_encoding_privacy(self):
        def optimized_unary_encoding(value, d, epsilon):
            p = 0.5
            q = 1 / (np.exp(epsilon) + 1)
            
            binary_vector = np.zeros(d)
            binary_vector[self.category_to_index[value]] = 1
            
            perturbed_vector = np.zeros(d)
            for i in range(d):
                if binary_vector[i] == 1:
                    perturbed_vector[i] = np.random.choice([1, 0], p=[p, 1 - p])
                else:
                    perturbed_vector[i] = np.random.choice([1, 0], p=[q, 1 - q])
            
            indices_positivos = np.where(perturbed_vector == 1)[0]
            if len(indices_positivos) == 0:
                indices_positivos = [self.category_to_index[value]]  # Si no hay 1s, devolvemos el índice original
            
            privatized_index = np.random.choice(indices_positivos)
            return self.index_to_category[privatized_index]
        
        df_priv = self.df.copy()
        df_priv[self.column] = df_priv[self.column].apply(lambda x: optimized_unary_encoding(x, self.d, self.epsilon))
        return df_priv
    
    def rappor_privacy(self):
        def rappor_encode(value, d, f=0.5):
            binary_vector = np.zeros(d)
            binary_vector[self.category_to_index[value]] = 1
            
            perturbed_vector = np.zeros(d)
            for i in range(d):
                if binary_vector[i] == 1:
                    perturbed_vector[i] = np.random.choice([1, 0], p=[1 - f, f])
                else:
                    perturbed_vector[i] = np.random.choice([1, 0], p=[f, 1 - f])
            
            indices_positivos = np.where(perturbed_vector == 1)[0]
            if len(indices_positivos) == 0:
                indices_positivos = [self.category_to_index[value]]  # Si no hay 1s, devolvemos el índice original
            
            privatized_index = np.random.choice(indices_positivos)
            return self.index_to_category[privatized_index]
        
        df_priv = self.df.copy()
        df_priv[self.column] = df_priv[self.column].apply(lambda x: rappor_encode(x, self.d))
        return df_priv






In [45]:
epsilon = 1.0 
categorical_columns = ['SEXO', 'Grupo_Edad']  
numerical_columns = ['Valor_Neto']  

dp_algorithm = AlgorithmDP(df, epsilon)

for col in categorical_columns:
    dp_algorithm.add_privacy_instance(col, 'categorical')

privatized_df = dp_algorithm.apply_privacy_to_column('SEXO', 'direct_encoding_privacy')
privatized_df = dp_algorithm.apply_privacy_to_column('Grupo_Edad', 'direct_encoding_privacy')
privatized_df = dp_algorithm.apply_privacy_to_column('Valor_Neto', 'duchi_solution')

In [46]:
privatized_df.head(2)

Unnamed: 0_level_0,EPS,NUM_IDE,FECHA_NACI,SEXO,DPTO,MUNI,DIAS_COMP,V8,id,Valor_Neto_Chunk,...,ARTROSIS,RENAL_OTRA,RENAL_RENALDELARGADURACION,TRANSPLANTE,RENAL_INSUFICIENCIARENALOTRA,RENAL_INSUFICIENCIARENALCRONICA,ANOMALIASGENETICASYCONGENITAS,CANCER_TERAPIACANCER,Grupo_Edad,Zona
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1184873,EPS002,7B7119A8-7D83-4DAF-ABC9-D33642AF26FD,1999-05-14 00:00:00.000,M,50,1,360,B,EPS002-7B7119A8-7D83-4DAF-ABC9-D33642AF26FD,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,N
1587467,EPS013,D36643BD-6CCC-4608-94D8-6B167B226CB5,1988-08-12 00:00:00.000,F,76,1,336,C,EPS013-D36643BD-6CCC-4608-94D8-6B167B226CB5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,C


In [16]:
df.head()

Unnamed: 0_level_0,EPS,NUM_IDE,FECHA_NACI,SEXO,DPTO,MUNI,DIAS_COMP,V8,id,Valor_Neto_Chunk,...,ARTROSIS,RENAL_OTRA,RENAL_RENALDELARGADURACION,TRANSPLANTE,RENAL_INSUFICIENCIARENALOTRA,RENAL_INSUFICIENCIARENALCRONICA,ANOMALIASGENETICASYCONGENITAS,CANCER_TERAPIACANCER,Grupo_Edad,Zona
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1184873,EPS002,7B7119A8-7D83-4DAF-ABC9-D33642AF26FD,1999-05-14 00:00:00.000,F,50,1,360,B,EPS002-7B7119A8-7D83-4DAF-ABC9-D33642AF26FD,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,N
1587467,EPS013,D36643BD-6CCC-4608-94D8-6B167B226CB5,1988-08-12 00:00:00.000,M,76,1,336,C,EPS013-D36643BD-6CCC-4608-94D8-6B167B226CB5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,C
753062,EPS017,919EBF57-FA41-4DD2-AF8A-CB237B47685E,1996-03-15 00:00:00.000,F,11,1,333,B,EPS017-919EBF57-FA41-4DD2-AF8A-CB237B47685E,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,C
1802563,EPS002,18A7B56B-23F0-4319-BA50-7A497F3FCF18,1998-04-23 00:00:00.000,M,76,1,84,B,EPS002-18A7B56B-23F0-4319-BA50-7A497F3FCF18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,C
1190692,EPS037,C5BD60C4-B826-4706-9610-E01F83A7C6AB,1990-10-20 00:00:00.000,F,15,776,180,B,EPS037-C5BD60C4-B826-4706-9610-E01F83A7C6AB,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,N


In [24]:
import pandas as pd

def compare_distribution(original_df, privatized_df, column):
    """
    Compara la distribución de una variable categórica entre un DataFrame original y uno privatizado.

    Parameters:
    original_df (pd.DataFrame): El DataFrame original sin privatización.
    privatized_df (pd.DataFrame): El DataFrame privatizado.
    column (str): El nombre de la columna categórica a comparar.

    Returns:
    pd.DataFrame: Un DataFrame que muestra la comparación de distribuciones en valor absoluto y porcentaje.
    """
    original_counts = original_df[column].value_counts().sort_index()
    original_percentage = original_counts / original_counts.sum() * 100

    privatized_counts = privatized_df[column].value_counts().sort_index()
    privatized_percentage = privatized_counts / privatized_counts.sum() * 100

    comparison_df = pd.DataFrame({
        'Original Count': original_counts,
        'Original %': original_percentage,
        'Privatized Count': privatized_counts,
        'Privatized %': privatized_percentage
    })

    comparison_df = comparison_df.fillna(0)

    comparison_df['Original %'] = comparison_df['Original %'].round(2)
    comparison_df['Privatized %'] = comparison_df['Privatized %'].round(2)
    
    return comparison_df



In [25]:
comparison_df = compare_distribution(df, privatized_df, 'SEXO')
print(comparison_df)

      Original Count  Original %  Privatized Count  Privatized %
SEXO                                                            
F             504632       50.65            501642         50.35
M             491602       49.35            494592         49.65


In [34]:
privatized_df = dp_algorithm.apply_privacy_to_column('Grupo_Edad', 'direct_encoding_privacy')
comparison_df = compare_distribution(df, privatized_df, 'Grupo_Edad')
print(comparison_df)

            Original Count  Original %  Privatized Count  Privatized %
Grupo_Edad                                                            
1                    11023        1.11          196511.0         19.73
10                   22602        2.27               0.0          0.00
11                   17587        1.77           17587.0          1.77
12                   27224        2.73               0.0          0.00
2                    54074        5.43               0.0          0.00
3                   156481       15.71          183705.0         18.44
4                    67207        6.75          119379.0         11.98
5                   447291       44.90          447291.0         44.90
6                    65305        6.56               0.0          0.00
7                    53909        5.41               0.0          0.00
8                    41770        4.19               0.0          0.00
9                    31761        3.19           31761.0          3.19


In [27]:
privatized_df = dp_algorithm.apply_privacy_to_column('Grupo_Edad', 'optimized_unary_encoding_privacy')
comparison_df = compare_distribution(df, privatized_df, 'Grupo_Edad')
print(comparison_df)

            Original Count  Original %  Privatized Count  Privatized %
Grupo_Edad                                                            
1                    11023        1.11           22602.0          2.27
10                   22602        2.27           64932.0          6.52
11                   17587        1.77               0.0          0.00
12                   27224        2.73               0.0          0.00
2                    54074        5.43          132512.0         13.30
3                   156481       15.71               0.0          0.00
4                    67207        6.75          183705.0         18.44
5                   447291       44.90               0.0          0.00
6                    65305        6.56           17587.0          1.77
7                    53909        5.41           54074.0          5.43
8                    41770        4.19               0.0          0.00
9                    31761        3.19          520822.0         52.28


In [28]:
privatized_df = dp_algorithm.apply_privacy_to_column('Grupo_Edad', 'rappor_privacy')
comparison_df = compare_distribution(df, privatized_df, 'Grupo_Edad')
print(comparison_df)

            Original Count  Original %  Privatized Count  Privatized %
Grupo_Edad                                                            
1                    11023        1.11          159736.0         16.03
10                   22602        2.27               0.0          0.00
11                   17587        1.77           49348.0          4.95
12                   27224        2.73               0.0          0.00
2                    54074        5.43               0.0          0.00
3                   156481       15.71               0.0          0.00
4                    67207        6.75           54074.0          5.43
5                   447291       44.90           11023.0          1.11
6                    65305        6.56               0.0          0.00
7                    53909        5.41               0.0          0.00
8                    41770        4.19          657681.0         66.02
9                    31761        3.19           64372.0          6.46


In [26]:
comparison_df = compare_distribution(df, privatized_df, 'Grupo_Edad')
print(comparison_df)

            Original Count  Original %  Privatized Count  Privatized %
Grupo_Edad                                                            
1                    11023        1.11           11023.0          1.11
10                   22602        2.27          119379.0         11.98
11                   17587        1.77               0.0          0.00
12                   27224        2.73               0.0          0.00
2                    54074        5.43           67207.0          6.75
3                   156481       15.71           67413.0          6.77
4                    67207        6.75          520822.0         52.28
5                   447291       44.90               0.0          0.00
6                    65305        6.56               0.0          0.00
7                    53909        5.41           53909.0          5.41
8                    41770        4.19               0.0          0.00
9                    31761        3.19          156481.0         15.71


# Modelo

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, r2_score

def perform_regression(df, target_column, categorical_columns, numerical_columns):
    X_categorical = pd.get_dummies(df[categorical_columns], drop_first=True)  # One-hot encode categorical variables
    X_numerical = df[numerical_columns]
    X = pd.concat([X_categorical, X_numerical], axis=1)
    y = df[target_column]

    # Dividir en conjunto de entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Configurar y entrenar el modelo de regresión Lasso
    model = Lasso(alpha=0.1)
    model.fit(X_train, y_train)
    
    # Predicciones y métricas
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    metrics = {
        'R2_train': r2_score(y_train, y_train_pred),
        'R2_test': r2_score(y_test, y_test_pred),
        'MAE_train': mean_absolute_error(y_train, y_train_pred),
        'MAE_test': mean_absolute_error(y_test, y_test_pred)
    }
    
    return metrics



In [50]:
epsilon = 1.0 
categorical_columns = ['SEXO', 'Grupo_Edad']  
numerical_columns = ['DIAS_COMP']  
df['DIAS_COMP'] = df['DIAS_COMP'].astype(int)
dp_algorithm = AlgorithmDP(df, epsilon)

for col in categorical_columns:
    dp_algorithm.add_privacy_instance(col, 'categorical')

privatized_df = dp_algorithm.apply_privacy_to_column('SEXO', 'direct_encoding_privacy')
privatized_df = dp_algorithm.apply_privacy_to_column('Grupo_Edad', 'direct_encoding_privacy')
# Convertir DIAS_COMP a int después de la privatización



privatized_df = dp_algorithm.apply_privacy_to_column('DIAS_COMP', 'duchi_solution')

In [53]:

metrics_priv = perform_regression(privatized_df, 'Valor_Neto', categorical_columns, numerical_columns)
metrics_original = perform_regression(df, 'Valor_Neto', categorical_columns, numerical_columns)


results = pd.DataFrame({
    'Métrica': ['R2_train', 'R2_test', 'MAE_train', 'MAE_test'],
    'Original': [metrics_original['R2_train'], metrics_original['R2_test'], metrics_original['MAE_train'], metrics_original['MAE_test']],
    'Privatizado': [metrics_priv['R2_train'], metrics_priv['R2_test'], metrics_priv['MAE_train'], metrics_priv['MAE_test']]
})

print("\nResultados de la regresión lineal (Lasso) comparando datos originales y privatizados:")
print(results)




  model = cd_fast.enet_coordinate_descent(



Resultados de la regresión lineal (Lasso) comparando datos originales y privatizados:
     Métrica       Original    Privatizado
0   R2_train       0.000786       0.000488
1    R2_test       0.057176       0.036358
2  MAE_train  209617.408227  222623.805780
3   MAE_test  203574.063166  216509.863087
