In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from interpretableai import iai

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn import preprocessing

In [2]:
def bmi_segment(x):
    if x < 18.5:
        return 'BMI -18'
    if x < 25:
        return 'BMI 18-25'
    if x < 30:
        return 'BMI 25-30'
    return 'BMI 30-'
  
def dxa_levels(x):
  if x < 60: return 0
  return 1
  if x < 25: return '1'
  if x < 50: return '2'
  if x < 75: return '3'
  return '4'
  
def impute_then_predict(method, seed, max_depth):
    imputer = iai.ImputationLearner(
        method=method,
        cluster=True,
        cluster_max_size=100,
        random_seed=seed,
    )
    
    train_X_imputed =  preprocessing.scale(imputer.fit_transform(train_X))
    test_X_imputed =  preprocessing.scale(imputer.transform(test_X))

    logReg = LogisticRegression(random_state=seed, class_weight='balanced').fit(train_X_imputed, train_y)
    results = {
        'method': method,
        'ins_acc':logReg.score(train_X_imputed, train_y),
        'oos_acc': logReg.score(test_X_imputed, test_y),
        'benchmark_oos_acc': (test_y == stats.mode(train_y)[0]).sum() / test_y.shape[0],
        'ins_auc': metrics.roc_auc_score(train_y, logReg.predict_proba(train_X_imputed)[:, 1]),
        'oos_auc': metrics.roc_auc_score(test_y, logReg.predict_proba(test_X_imputed)[:, 1]),
        'benchmark_oos_auc': metrics.roc_auc_score(test_y, [stats.mode(train_y)[0]] * test_y.shape[0]),
    }
    
    confusion_matrix = metrics.confusion_matrix(test_y, logReg.predict(test_X_imputed))
    
    roc_curve = []
#     roc_curve = iai.ROCCurve(grid, test_X_imputed, test_y)
        
    return results, logReg, roc_curve, confusion_matrix
  
def spectrum_mean_kernel(df, window=50, overlap=0):
  aux = df.copy()
  for i in range(int(331 / (window - overlap)) + 1):
    start = (window - overlap) * i
    end = start + window
    if end > 331:
      end = 331
    aux['spectrum_mean_' + str(start) + ':' + str(end)] = aux[spectrum_cols[start:end]].mean(axis=1)
#     segments.append(spectrum_cols[start:end])
  return aux

In [3]:
df = pd.read_csv('data/scio_dxa_densidad_27_11.csv')
df['ubicacion'] = df['ubicacion'].astype(str)

folio_filter = pd.read_csv('folio_filter.csv')['folio']
df = df.loc[~df['folio'].isin(folio_filter)]

In [4]:
target = 'valor_dxa'
bin_target = 'bin_' + target
s_target = 'segment_' + target

features = ['copasosten',
            'edad',
#             'peso',
            'indicedemasacorporal',
#             'talla',
            'fitzpatrickcolor'
           ]

spectrum_cols = [col for col in df.columns if col.startswith('spectrum')]
sample_raw_cols = [col for col in df.columns if col.startswith('sample_raw')]

In [5]:
df_clean = (df
            .dropna(subset=[target])
            .loc[lambda x: ~x['ubicacion'].str.contains('Pezon')]
            [['folio', 'mama', target] + features + spectrum_cols]
            .rename(columns={'indicedemasacorporal': 'BMI'})
            .assign(**{bin_target: lambda x: x[target] > 50,#np.quantile(x[target], 0.75),
                       s_target: lambda x: x[target].map(dxa_levels),
                       "BMI_segment": lambda x: x['BMI'].map(bmi_segment)})
            .drop(columns=[target])
           )

In [6]:
features_final = [f for f in features + ['BMI', 'BMI_segment'] if f != 'indicedemasacorporal']

categorical_feats = ['copasosten', 
                     'fitzpatrickcolor',
                     'BMI_segment'
                    ]

agg_dict = {**{f: 'first' for f in features_final + [bin_target, s_target]},
            **{s: 'mean' for s in spectrum_cols}}

df_clean_agg = (
    df_clean
        .groupby(['folio', 'mama'], as_index=False)
        .agg(agg_dict)
        .assign(**{i: (lambda y: lambda x: x[y].astype('category'))(i) for i in categorical_feats})
        .assign(spectrum_mean = lambda x: x[spectrum_cols].mean(axis=1))
        .pipe(spectrum_mean_kernel, overlap=25)
)

In [7]:
# Select features

features_final = ['copasosten',
                  'edad',
                  'fitzpatrickcolor',
                  'BMI'] +\
                [i for i in df_clean_agg.columns if i.startswith('spectrum_mean_')]

df_clean_agg = (
  df_clean_agg[features_final + [s_target]]
  .join(pd.get_dummies(df_clean_agg[[i for i in categorical_feats if i in features_final]]))
  .drop(columns=[i for i in categorical_feats if i in features_final])
)

In [8]:
X = df_clean_agg.drop(columns=[s_target])
y = df_clean_agg[s_target]

(train_X, train_y), (test_X, test_y) = iai.split_data('classification', X, y, seed=1)

In [9]:
results, model, roc_curve, confusion_matrix = impute_then_predict('mean', 1, range(2, 8))

In [13]:
results, confusion_matrix

({'method': 'mean',
  'ins_acc': 0.7692307692307693,
  'oos_acc': 0.7704918032786885,
  'benchmark_oos_acc': 0.7377049180327869,
  'ins_auc': 0.8470774091627171,
  'oos_auc': 0.8538194444444445,
  'benchmark_oos_auc': 0.5},
 array([[66, 24],
        [ 4, 28]], dtype=int64))

In [10]:
results, confusion_matrix

({'method': 'mean',
  'ins_acc': 0.7692307692307693,
  'oos_acc': 0.7704918032786885,
  'benchmark_oos_acc': 0.7377049180327869,
  'ins_auc': 0.8470774091627171,
  'oos_auc': 0.8538194444444445,
  'benchmark_oos_auc': 0.5},
 array([[66, 24],
        [ 4, 28]], dtype=int64))

In [10]:
results, confusion_matrix

({'method': 'mean',
  'ins_acc': 0.7097902097902098,
  'oos_acc': 0.7622950819672131,
  'benchmark_oos_acc': 0.7377049180327869,
  'ins_auc': 0.8203159557661928,
  'oos_auc': 0.8289930555555556,
  'benchmark_oos_auc': 0.5},
 array([[65, 25],
        [ 4, 28]], dtype=int64))

In [11]:
pd.DataFrame({'feature': list(train_X.columns), 'coefficient': model.feature_importances_})

AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'

In [43]:
results

{'method': 'mean',
 'ins_acc': 0.7692307692307693,
 'oos_acc': 0.7704918032786885,
 'benchmark_oos_acc': 0.7377049180327869,
 'ins_auc': 0.8470774091627171,
 'oos_auc': 0.8538194444444445,
 'benchmark_oos_auc': 0.5}

In [44]:
confusion_matrix

array([[66, 24],
       [ 4, 28]], dtype=int64)

In [53]:
pd.DataFrame({'feature': list(train_X.columns), 'coefficient': model.coef_[0]})

Unnamed: 0,feature,coefficient
0,edad,0.144199
1,BMI,-1.464124
2,copasosten_A,0.203707
3,copasosten_B,0.298666
4,copasosten_C,-0.434942
5,copasosten_D,-0.185124
6,fitzpatrickcolor_II,0.213402
7,fitzpatrickcolor_III,-0.312149
8,fitzpatrickcolor_IV,0.184857
9,fitzpatrickcolor_V,-0.240752


In [46]:
model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 1,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}