In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from interpretableai import iai

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import preprocessing

In [2]:
def bmi_segment(x):
    if x < 18.5:
        return 'BMI -18'
    if x < 25:
        return 'BMI 18-25'
    if x < 30:
        return 'BMI 25-30'
    return 'BMI 30-'
  
def dxa_levels(x):
  if x < 60: return 0
  return 1
  if x < 25: return '1'
  if x < 50: return '2'
  if x < 75: return '3'
  return '4'
  
def impute_then_predict(method, seed, kernel):
    imputer = iai.ImputationLearner(
        method=method,
        cluster=True,
        cluster_max_size=100,
        random_seed=seed,
    )
    
    train_X_imputed =  preprocessing.scale(imputer.fit_transform(train_X))
    test_X_imputed =  preprocessing.scale(imputer.transform(test_X))

    model = SVC(kernel=kernel, 
                random_state=seed, 
                class_weight='balanced')
    model.fit(train_X_imputed, train_y)
    
    results = {
        'method': method,
        'ins_acc': model.score(train_X_imputed, train_y),
        'oos_acc': model.score(test_X_imputed, test_y),
        'benchmark_oos_acc': (test_y == stats.mode(train_y)[0]).sum() / test_y.shape[0],
#         'ins_auc': metrics.roc_auc_score(train_y, model.predict_proba(train_X_imputed)[:, 1]),
#         'oos_auc': metrics.roc_auc_score(test_y, model.predict_proba(test_X_imputed)[:, 1]),
#         'benchmark_oos_auc': metrics.roc_auc_score(test_y, [stats.mode(train_y)[0]] * test_y.shape[0]),
    }
    
    confusion_matrix = metrics.confusion_matrix(test_y, model.predict(test_X_imputed))
    
    roc_curve = []
#     roc_curve = iai.ROCCurve(grid, test_X_imputed, test_y)
        
    return results, model, roc_curve, confusion_matrix
  
def spectrum_mean_kernel(df, window=50, overlap=0):
  aux = df.copy()
  for i in range(int(331 / (window - overlap)) + 1):
    start = (window - overlap) * i
    end = start + window
    if end > 331:
      end = 331
    aux['spectrum_mean_' + str(start) + ':' + str(end)] = aux[spectrum_cols[start:end]].mean(axis=1)
#     segments.append(spectrum_cols[start:end])
  return aux

In [3]:
df = pd.read_csv('data/scio_dxa_densidad_27_11.csv')
df['ubicacion'] = df['ubicacion'].astype(str)

folio_filter = pd.read_csv('folio_filter.csv')['folio']
df = df.loc[~df['folio'].isin(folio_filter)]

In [4]:
target = 'valor_dxa'
bin_target = 'bin_' + target
s_target = 'segment_' + target

features = ['copasosten',
            'edad',
#             'peso',
            'indicedemasacorporal',
#             'talla',
            'fitzpatrickcolor'
           ]

spectrum_cols = [col for col in df.columns if col.startswith('spectrum')]
sample_raw_cols = [col for col in df.columns if col.startswith('sample_raw')]

In [5]:
df_clean = (df
            .dropna(subset=[target])
            .loc[lambda x: ~x['ubicacion'].str.contains('Pezon')]
            [['folio', 'mama', target] + features + spectrum_cols]
            .rename(columns={'indicedemasacorporal': 'BMI'})
            .assign(**{bin_target: lambda x: x[target] > 50,#np.quantile(x[target], 0.75),
                       s_target: lambda x: x[target].map(dxa_levels),
                       "BMI_segment": lambda x: x['BMI'].map(bmi_segment)})
            .drop(columns=[target])
           )

In [6]:
features_final = [f for f in features + ['BMI', 'BMI_segment'] if f != 'indicedemasacorporal']

categorical_feats = ['copasosten', 
                     'fitzpatrickcolor',
                     'BMI_segment'
                    ]

agg_dict = {**{f: 'first' for f in features_final + [bin_target, s_target]},
            **{s: 'mean' for s in spectrum_cols}}

df_clean_agg = (
    df_clean
        .groupby(['folio', 'mama'], as_index=False)
        .agg(agg_dict)
        .assign(**{i: (lambda y: lambda x: x[y].astype('category'))(i) for i in categorical_feats})
        .assign(spectrum_mean = lambda x: x[spectrum_cols].mean(axis=1))
        .pipe(spectrum_mean_kernel, window=3, overlap=1)
)

In [16]:
# Select features

features_final = [i for i in df_clean_agg.columns if i.startswith('spectrum_mean_')] +\
                 [
                  'copasosten',
                  'edad',
                  'fitzpatrickcolor',
                  'BMI'
                  ]
                

df_final = (
  df_clean_agg[features_final + [s_target]]
  .join(pd.get_dummies(df_clean_agg[[i for i in categorical_feats if i in features_final]]))
  .drop(columns=[i for i in categorical_feats if i in features_final])
)

In [17]:
X = df_final.drop(columns=[s_target])
y = df_final[s_target]

(train_X, train_y), (test_X, test_y) = iai.split_data('classification', X, y, seed=1)

In [18]:
results, model, roc_curve, confusion_matrix = impute_then_predict('mean', 1, 'linear')

In [19]:
results, confusion_matrix

({'method': 'mean',
  'ins_acc': 0.6993006993006993,
  'oos_acc': 0.7377049180327869,
  'benchmark_oos_acc': 0.7377049180327869},
 array([[61, 29],
        [ 3, 29]], dtype=int64))

In [88]:
results, confusion_matrix

{'method': 'mean',
 'ins_acc': 0.8006993006993007,
 'oos_acc': 0.8278688524590164,
 'benchmark_oos_acc': 0.7377049180327869}

array([[72, 18],
       [ 3, 29]], dtype=int64)

In [90]:
pd.DataFrame({'feature': list(train_X.columns), 'coefficient': model.coef_[0]})

Unnamed: 0,feature,coefficient
0,edad,1.522897e-01
1,BMI,-1.061765e+00
2,spectrum_mean_0:3,-7.064979e-02
3,spectrum_mean_2:5,-2.112009e-02
4,spectrum_mean_4:7,9.373945e-03
...,...,...
171,copasosten_D,-7.315558e-02
172,fitzpatrickcolor_II,6.775120e-02
173,fitzpatrickcolor_III,-3.143846e-01
174,fitzpatrickcolor_IV,2.634483e-01
