In [11]:
import pandas as pd
import numpy as np
from scipy import stats

from interpretableai import iai

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn import preprocessing

In [26]:
def bmi_segment(x):
    if x < 18.5:
        return 'BMI -18'
    if x < 25:
        return 'BMI 18-25'
    if x < 30:
        return 'BMI 25-30'
    return 'BMI 30-'
  
def dxa_levels(x):
  if x < 60: return 0
  return 1
  if x < 25: return '1'
  if x < 50: return '2'
  if x < 75: return '3'
  return '4'
  
def impute_then_predict(method, seed, max_depth):
    imputer = iai.ImputationLearner(
        method=method,
        cluster=True,
        cluster_max_size=100,
        random_seed=seed,
    )
    
    train_X_imputed =  preprocessing.scale(imputer.fit_transform(train_X))
    test_X_imputed =  preprocessing.scale(imputer.transform(test_X))

    logReg = LogisticRegression(random_state=seed).fit(train_X_imputed, train_y)
    results = {
        'method': method,
        'ins_acc':logReg.score(train_X_imputed, train_y),
        'oos_acc': logReg.score(test_X_imputed, test_y),
        'benchmark_oos_acc': (test_y == stats.mode(train_y)[0]).sum() / test_y.shape[0],
        'ins_auc': metrics.roc_auc_score(train_y, logReg.predict_proba(train_X_imputed)[:, 1]),
        'oos_auc': metrics.roc_auc_score(test_y, logReg.predict_proba(test_X_imputed)[:, 1]),
        'benchmark_oos_auc': metrics.roc_auc_score(test_y, [stats.mode(train_y)[0]] * test_y.shape[0]),
    }
    
    confusion_matrix = metrics.confusion_matrix(test_y, logReg.predict(test_X_imputed))
    
    roc_curve = []
#     roc_curve = iai.ROCCurve(grid, test_X_imputed, test_y)
        
    return results, logReg, roc_curve, confusion_matrix
  
def spectrum_mean_kernel(df, window=50, overlap=0):
  aux = df.copy()
  for i in range(int(331 / (window - overlap)) + 1):
    start = (window - overlap) * i
    end = start + window
    if end > 331:
      end = 331
    aux['spectrum_mean_' + str(start) + ':' + str(end)] = aux[spectrum_cols[start:end]].mean(axis=1)
#     segments.append(spectrum_cols[start:end])
  return aux

In [4]:
df = pd.read_csv('data/scio_dxa_densidad_27_11.csv')
df['ubicacion'] = df['ubicacion'].astype(str)

folio_filter = pd.read_csv('folio_filter.csv')['folio']
df = df.loc[~df['folio'].isin(folio_filter)]

In [5]:
target = 'valor_dxa'
bin_target = 'bin_' + target
s_target = 'segment_' + target

features = ['copasosten',
            'edad',
#             'peso',
            'indicedemasacorporal',
#             'talla',
            'fitzpatrickcolor'
           ]

spectrum_cols = [col for col in df.columns if col.startswith('spectrum')]
sample_raw_cols = [col for col in df.columns if col.startswith('sample_raw')]

In [6]:
df_clean = (df
            .dropna(subset=[target])
            .loc[lambda x: ~x['ubicacion'].str.contains('Pezon')]
            [['folio', 'mama', target] + features + spectrum_cols]
            .rename(columns={'indicedemasacorporal': 'BMI'})
            .assign(**{bin_target: lambda x: x[target] > 50,#np.quantile(x[target], 0.75),
                       s_target: lambda x: x[target].map(dxa_levels),
                       "BMI_segment": lambda x: x['BMI'].map(bmi_segment)})
            .drop(columns=[target])
           )

In [7]:
features_final = [f for f in features + ['BMI', 'BMI_segment'] if f != 'indicedemasacorporal']

categorical_feats = ['copasosten', 
                     'fitzpatrickcolor',
                     'BMI_segment'
                    ]

agg_dict = {**{f: 'first' for f in features_final + [bin_target, s_target]},
            **{s: 'mean' for s in spectrum_cols}}

df_clean_agg = (
    df_clean
        .groupby(['folio', 'mama'], as_index=False)
        .agg(agg_dict)
        .assign(**{i: (lambda y: lambda x: x[y].astype('category'))(i) for i in categorical_feats})
        .assign(spectrum_mean = lambda x: x[spectrum_cols].mean(axis=1))
        .pipe(spectrum_mean_kernel, overlap=25)
)

In [8]:
# Select features

features_final = ['copasosten',
                  'edad',
                  'fitzpatrickcolor',
                  'BMI'] + \
                [i for i in df_clean_agg.columns if i.startswith('spectrum_mean_')]

df_clean_agg = (
  df_clean_agg[features_final + [s_target]]
  .join(pd.get_dummies(df_clean_agg[[i for i in categorical_feats if i in features_final]]))
  .drop(columns=[i for i in categorical_feats if i in features_final])
)

In [9]:
X = df_clean_agg.drop(columns=[s_target])
y = df_clean_agg[s_target]

(train_X, train_y), (test_X, test_y) = iai.split_data('classification', X, y, seed=1)

In [68]:
imputer = iai.ImputationLearner(
    method='mean',
    cluster=True,
    cluster_max_size=100,
    random_seed=1,
)

train_X_imputed =  preprocessing.scale(imputer.fit_transform(train_X))
test_X_imputed =  preprocessing.scale(imputer.transform(test_X))

logReg = LogisticRegression(random_state=0).fit(train_X_imputed, train_y)

In [69]:
logReg.score(test_X_imputed, test_y)
confusion_matrix(test_y, logReg.predict(test_X_imputed))

array([[81,  9],
       [17, 15]], dtype=int64)

In [27]:
results, model, roc_curve, confusion_matrix = impute_then_predict('mean', 1, range(2, 8))

In [28]:
results

{'method': 'mean',
 'ins_acc': 0.7937062937062938,
 'oos_acc': 0.7868852459016393,
 'benchmark_oos_acc': 0.7377049180327869,
 'ins_auc': 0.8457503949447078,
 'oos_auc': 0.8555555555555555,
 'benchmark_oos_auc': 0.5}

In [29]:
confusion_matrix

array([[81,  9],
       [17, 15]], dtype=int64)

In [39]:
pd.DataFrame({'feature': list(train_X.columns), 'coefficient': model.coef_[0]})

Unnamed: 0,feature,coefficient
0,edad,0.221252
1,BMI,-1.399473
2,spectrum_mean_0:50,0.828863
3,spectrum_mean_25:75,0.48728
4,spectrum_mean_50:100,0.180586
5,spectrum_mean_75:125,-0.267701
6,spectrum_mean_100:150,-0.509057
7,spectrum_mean_125:175,-0.003733
8,spectrum_mean_150:200,0.890817
9,spectrum_mean_175:225,0.26991


In [38]:
model.coef_,

(array([[ 0.22125203, -1.39947305,  0.8288633 ,  0.4872801 ,  0.18058585,
         -0.26770089, -0.50905674, -0.00373348,  0.89081727,  0.2699102 ,
         -0.93449483, -0.97385853, -0.37980324,  0.18448561,  0.16670942,
         -0.07542483,  0.13846926,  0.25571234, -0.34223824, -0.17748842,
          0.14225494, -0.29924344,  0.21412162, -0.16327923]]),)