In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('../datasets/socialdiagnosis/data/SocialDiagnosis2011.csv', sep=';')

In [2]:
df.head()

Unnamed: 0,sex,age,marital,income,ls,smoke
0,FEMALE,57,MARRIED,800.0,PLEASED,NO
1,MALE,20,SINGLE,350.0,MOSTLY SATISFIED,NO
2,FEMALE,18,SINGLE,,PLEASED,NO
3,FEMALE,78,WIDOWED,900.0,MIXED,NO
4,FEMALE,54,MARRIED,1500.0,MOSTLY SATISFIED,YES


In [3]:
from synthpop import MissingDataHandler, DataProcessor, CARTMethod

In [4]:
# 1. Initiate metadata
metadata = MissingDataHandler()

# 1.1 Get data types
column_dtypes = metadata.get_column_dtypes(df)
print("Column Data Types:", column_dtypes)

Column Data Types: {'sex': 'categorical', 'age': 'numerical', 'marital': 'categorical', 'income': 'numerical', 'ls': 'categorical', 'smoke': 'categorical'}


In [5]:
# 2. Missing data
print(df.isnull().sum())

sex          0
age          0
marital      9
income     683
ls           8
smoke       10
dtype: int64


In [6]:
# 2.1 Detect type of missingness
missingness_dict = metadata.detect_missingness(df)
print("Detected missingness type:", missingness_dict)

Detected missingness type: {'marital': 'MAR', 'income': 'MAR', 'ls': 'MAR', 'smoke': 'MAR'}


In [8]:
# 2.2 Impute missing values
df_imputed = metadata.apply_imputation(df, missingness_dict)

print(df_imputed.isnull().sum())

sex        0
age        0
marital    0
income     0
ls         0
smoke      0
dtype: int64


In [9]:
# 3. Instantiate the DataProcessor with column_dtypes
processor = DataProcessor(column_dtypes)

# 3.1 Preprocess the data: transforms raw data into a numerical format
processed_data = processor.preprocess(df)
print("Processed Data:")
display(processed_data.head())

Processed Data:


Unnamed: 0,sex,age,marital,income,ls,smoke
0,0,0.503625,3,-0.480608,4,0
1,1,-1.495187,4,-0.834521,3,0
2,0,-1.603231,4,,4,0
3,0,1.638086,5,-0.401961,1,0
4,0,0.341559,3,0.069923,3,1


In [None]:
# 4. Fit the CART method
cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
cart.fit(processed_data)

ERROR:synthpop.method.cart:Error fitting model for column 'income': Input y contains NaN.


In [None]:
from synthpop.metrics import (
    MetricsReport,
    EfficacyMetrics,
    DisclosureProtection
)