In [4]:
# standard
import pandas as pd
import numpy as np
import os
import sklearn
import re
# pipelining
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.compose import ColumnTransformer

# constants
# Location of the input CSV, written relative to the notebook's working directory.
data_fname = "../data/all_risk_processed.csv"
# Display the current working directory so the relative path above can be checked.
os.getcwd()

'/Users/evan/Documents/School/MSDS/msds-capstone/vm-spinal-risk/vm-spinal-risk/notebooks'

Loading the data

In [15]:
# Read the CSV at data_fname (resolved against the current working directory),
# using the file's first column as the DataFrame index.
data = pd.read_csv(os.path.join(os.getcwd(), data_fname), index_col=0)
# Display (n_rows, n_columns) of the loaded frame.
data.shape

(799, 87)

In [17]:
class DropUnbalancedFeatures(BaseEstimator, TransformerMixin):
  """
  Custom scikit-learn transformer to remove features with unbalanced distribution.
  Author: Evan Yip

  A feature is considered unbalanced when a single value accounts for more than
  ``threshold`` of its rows. Such features are identified in ``fit`` and removed
  in ``transform``.

  Parameters
  ----------
  threshold : float, optional
      The threshold for feature unbalance. Features with a dominant class percentage
      exceeding this threshold will be dropped. Default is 0.9.
  verbose : bool, optional
      If True, print progress messages during fit and the list of removed
      feature indices during transform. Default is True.

  Attributes
  ----------
  threshold : float
      The specified threshold for feature unbalance.
  verbose : bool
      Flag indicating whether to print information during fit and transform.
  columns : list or None
      List of feature names if the data passed to ``fit`` was a DataFrame,
      otherwise None.
  features_kept : list
      Positional indices of features kept after applying the unbalance threshold.
      Rebuilt from scratch on every call to ``fit``.
  features_dropped : list
      Positional indices of features dropped after applying the unbalance
      threshold. Rebuilt from scratch on every call to ``fit``.

  Methods
  -------
  fit(X, y=None)
      Fit the transformer to the input data, identifying unbalanced features.
  transform(X)
      Transform the input data by keeping only the balanced features.
  get_feature_names_out(input_features=None)
      Return the names of the output features.

  Notes
  -----
  - The transformer identifies features with a dominant class percentage exceeding
    the specified threshold and drops them.
  - Missing values are counted as their own category when computing the
    dominant class percentage.
  - If verbose is True, ``fit`` prints start/finish messages and ``transform``
    prints the indices of the removed features.

  Examples
  --------
  >>> transformer = DropUnbalancedFeatures(threshold=0.85, verbose=True)
  >>> X_balanced = transformer.fit_transform(X_unbalanced)
  """
  def __init__(self, threshold=0.9, verbose=True):
    self.threshold = threshold
    self.verbose = verbose
    self.columns = None
    self.features_kept = []
    self.features_dropped = []

  def fit(self, X, y=None):
    """
    Identify which features are unbalanced.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray of shape (n_samples, n_features)
        Data whose columns are inspected.
    y : ignored
        Present only for scikit-learn API compatibility.

    Returns
    -------
    self : DropUnbalancedFeatures
        The fitted transformer.
    """
    if self.verbose:
      print(f"Removing unbalanced features with (threshold={self.threshold})")
    is_frame = isinstance(X, pd.DataFrame)
    self.columns = list(X.columns) if is_frame else None
    # Reset BOTH index lists: without clearing features_kept, fitting the same
    # instance twice would append the kept indices again and transform() would
    # return duplicated columns.
    self.features_kept = []
    self.features_dropped = []
    for col_idx in range(X.shape[1]):
      column = X.iloc[:, col_idx] if is_frame else X[:, col_idx]
      # value_counts is hash-based, so it also works for object columns mixing
      # strings and NaN (which np.unique cannot sort); dropna=False keeps
      # missing values as a category so the shares sum to 1.
      shares = pd.Series(column).value_counts(normalize=True, dropna=False, sort=False)

      if (shares > self.threshold).any():
        self.features_dropped.append(col_idx)
      else:
        self.features_kept.append(col_idx)
    if self.verbose:
      print("Parsing complete")
    return self

  def transform(self, X):
    """
    Return X restricted to the features kept during ``fit``.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray of shape (n_samples, n_features)
        Data with the same column layout as the data passed to ``fit``.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        Same container type as X, with only the kept columns (selected by
        position).
    """
    if self.verbose:
      if len(self.features_dropped) > 0:
        print(f"Removed: {self.features_dropped}")
      else:
        print("No features removed")
    return X.iloc[:, self.features_kept] if isinstance(X, pd.DataFrame) else X[:, self.features_kept]

  def get_feature_names_out(self, input_features=None):
    """
    Return the names of the features kept during ``fit``.

    Parameters
    ----------
    input_features : sequence of str, optional
        Names of the input features. Used only when the transformer was fitted
        on data without column names (``columns`` is None).

    Returns
    -------
    list
        Names of the kept features. Column names seen during ``fit`` take
        precedence; otherwise ``input_features`` is used; if neither is
        available, positional names ``"x<i>"`` are generated.
    """
    if self.columns is not None:
      return [self.columns[i] for i in self.features_kept]
    if input_features is not None:
      return [input_features[i] for i in self.features_kept]
    # No names available at all: fall back to positional names instead of
    # failing on indexing None.
    return [f"x{i}" for i in self.features_kept]

In [18]:
# Display the column labels of the loaded frame.
data.columns

Index(['record_id', 'risk_1_timestamp', 'age', 'sex', 'height', 'weight',
       'zipcode', 'ethnicity', 'income', 'education', 'prior_surg',
       'spin_surg', 'succ_surg', 'religion', 'odi_1', 'odi_2', 'odi_3',
       'odi_4', 'odi_5', 'odi_6', 'odi_7', 'odi_8', 'odi_9', 'odi_10',
       'exer_50improv_1drop', 'exer_50improv_10drop', 'exer_50improv_50drop',
       'exer_50improv_90drop', 'att_check_1', 'exer_90improv_1drop',
       'exer_90improv_10drop', 'exer_90improv_50drop', 'exer_90improv_90drop',
       'exer_50pain_1death', 'exer_50pain_10death', 'exer_50pain_50death',
       'exer_90pain_1death', 'exer_90pain_10death', 'exer_90pain_50death',
       'work_50improv_1drop', 'work_50improv_10drop', 'work_50improv_50drop',
       'work_50improv_90drop', 'work_90improv_1drop', 'work_90improv_10drop',
       'work_90improv_50drop', 'work_50improv_1para', 'work_50improv_10para',
       'work_50improv_50para', 'work_50improv_90para', 'work_90improv_1para',
       'work_90improv_10par

In [19]:
# Subset of `data` holding the exer_* / work_* response columns together with
# spinal_risk_score. The two attention-check columns are part of the selection
# and are removed again by the chained drop.
pat_res_risk = (
    data.loc[:, [
        # exer_* columns (plus first attention check)
        'exer_50improv_1drop', 'exer_50improv_10drop',
        'exer_50improv_50drop', 'exer_50improv_90drop',
        'att_check_1',
        'exer_90improv_1drop', 'exer_90improv_10drop',
        'exer_90improv_50drop', 'exer_90improv_90drop',
        'exer_50pain_1death', 'exer_50pain_10death', 'exer_50pain_50death',
        'exer_90pain_1death', 'exer_90pain_10death', 'exer_90pain_50death',
        # work_* columns (plus second attention check)
        'work_50improv_1drop', 'work_50improv_10drop',
        'work_50improv_50drop', 'work_50improv_90drop',
        'work_90improv_1drop', 'work_90improv_10drop', 'work_90improv_50drop',
        'work_50improv_1para', 'work_50improv_10para',
        'work_50improv_50para', 'work_50improv_90para',
        'work_90improv_1para', 'work_90improv_10para',
        'att_check2',
        'work_90improv_50para',
        'work_50improv_1death', 'work_50improv_10death', 'work_50improv_50death',
        'work_90improv_1death', 'work_90improv_10death', 'work_90improv_50death',
        # target
        'spinal_risk_score',
    ]]
    .drop(columns=['att_check_1', 'att_check2'])
)

In [20]:
# Display the response/target subset built above.
pat_res_risk

Unnamed: 0,exer_50improv_1drop,exer_50improv_10drop,exer_50improv_50drop,exer_50improv_90drop,exer_90improv_1drop,exer_90improv_10drop,exer_90improv_50drop,exer_90improv_90drop,exer_50pain_1death,exer_50pain_10death,exer_50pain_50death,exer_90pain_1death,exer_90pain_10death,exer_90pain_50death,work_50improv_1drop,work_50improv_10drop,work_50improv_50drop,work_50improv_90drop,work_90improv_1drop,work_90improv_10drop,work_90improv_50drop,work_50improv_1para,work_50improv_10para,work_50improv_50para,work_50improv_90para,work_90improv_1para,work_90improv_10para,work_90improv_50para,work_50improv_1death,work_50improv_10death,work_50improv_50death,work_90improv_1death,work_90improv_10death,work_90improv_50death,spinal_risk_score
0,4,4,3,0,5,5,4,4,4,1,0,5,4,0,5,5,4,4,5,5,4,4,3,0,0,5,5,0,5,3,0,5,5,0,0.843445
1,4,4,3,2,5,4,3,3,3,2,1,5,3,0,5,5,5,4,5,5,5,4,3,1,0,5,3,2,4,2,1,5,3,1,0.725933
2,3,2,0,0,4,3,0,0,0,0,0,0,0,0,4,4,2,1,5,4,1,1,0,0,0,2,1,0,1,0,0,1,0,0,0.512651
3,3,2,1,0,5,4,2,0,1,0,0,4,1,0,3,3,1,0,5,4,2,1,0,0,0,4,3,0,3,0,0,3,2,0,0.598028
4,5,4,1,0,5,5,3,0,5,4,1,5,4,1,5,4,1,0,5,5,1,5,4,1,0,5,4,1,5,4,1,5,4,1,0.881841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,2,1,0,0,4,3,1,0,0,0,0,0,0,0,4,2,1,0,4,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.466276
795,2,1,0,0,4,3,2,1,3,2,1,4,3,1,3,3,1,0,5,4,2,2,1,0,0,5,3,2,3,2,0,4,3,0,0.603798
796,1,1,1,0,3,2,1,1,0,0,0,2,1,0,3,2,1,0,3,2,1,1,0,0,0,1,1,0,1,0,0,1,0,0,0.445468
797,4,4,0,0,5,5,3,0,4,0,0,4,1,0,4,4,0,0,5,5,3,3,0,0,0,5,3,0,3,0,0,4,1,0,0.709183


In [21]:
# Build `data_final` from `data` by removing the columns listed below; `data`
# itself is left unchanged because drop() returns a new frame.
data_final = data.drop(
    columns=[
        # odi_* item columns
        'odi_1', 'odi_2', 'odi_3', 'odi_4', 'odi_5',
        'odi_6', 'odi_7', 'odi_8', 'odi_9', 'odi_10',
        # exer_* response columns and first attention check
        'exer_50improv_1drop', 'exer_50improv_10drop',
        'exer_50improv_50drop', 'exer_50improv_90drop',
        'att_check_1',
        'exer_90improv_1drop', 'exer_90improv_10drop',
        'exer_90improv_50drop', 'exer_90improv_90drop',
        'exer_50pain_1death', 'exer_50pain_10death', 'exer_50pain_50death',
        'exer_90pain_1death', 'exer_90pain_10death', 'exer_90pain_50death',
        # work_* response columns and second attention check
        'work_50improv_1drop', 'work_50improv_10drop',
        'work_50improv_50drop', 'work_50improv_90drop',
        'work_90improv_1drop', 'work_90improv_10drop', 'work_90improv_50drop',
        'work_50improv_1para', 'work_50improv_10para',
        'work_50improv_50para', 'work_50improv_90para',
        'work_90improv_1para', 'work_90improv_10para',
        'att_check2',
        'work_90improv_50para',
        'work_50improv_1death', 'work_50improv_10death', 'work_50improv_50death',
        'work_90improv_1death', 'work_90improv_10death', 'work_90improv_50death',
        # survey bookkeeping columns
        'att_pass', 'risk_1_complete',
        # height / weight, record identifiers and timestamp
        'height', 'weight', 'record_id', 'risk_1_timestamp',
        # location-related columns
        'zipcode', 'age_range', 'postal_code', 'state_code', 'city',
        'province', 'province_code', 'latitude', 'longitude',
        'FIPS', 'fips', 'GISJOIN', 'state',
    ]
)

In [24]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Coerce both ADI rank columns to nullable integers: values that cannot be
# parsed as numbers become missing (errors='coerce') and stay missing in the
# resulting 'Int64' column.
for adi_col in ('ADI_NATRANK', 'ADI_STATERNK'):
    data_final[adi_col] = (
        pd.to_numeric(data_final[adi_col], errors='coerce')
        .astype(float)
        .astype('Int64')
    )

In [25]:
# Column groups consumed by the preprocessing ColumnTransformer.

# Columns routed through the one-hot-encoding pipeline.
ohe_cols = [
    "religion",
    "ethnicity",
]

# Columns routed through the categorical (impute + select) pipeline.
cat_cols = [
    "sex",
    "income",
    "education",
    "prior_surg",
    "spin_surg",
    "succ_surg",
]

# Columns routed through the numeric (impute + scale) pipeline.
num_cols = [
    "age",
    "odi_final",
    "bmi",
    "dospert_ethical",
    "dospert_financial",
    "dospert_health/safety",
    "dospert_recreational",
    "dospert_social",
    "height_m",
    "weight_kg",
    "ADI_NATRANK",
    "ADI_STATERNK",
]

In [26]:
# Preprocessing: one pipeline per column group, combined in `preprocessor`.

# Numeric columns: IterativeImputer (seeded) followed by standard scaling.
num_pipe = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=52)),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the constant 0, then drop
# columns where one value exceeds an 80% share.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('selector', DropUnbalancedFeatures(threshold=0.8, verbose=False)),
])

# One-hot columns: fill missing values with the most frequent value, one-hot
# encode (first level dropped, dense output), then drop indicator columns where
# one value exceeds an 80% share.
ohe_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
    ('selector', DropUnbalancedFeatures(threshold=0.8, verbose=False)),
])

# Apply each pipeline to its own column group; the output blocks are ordered
# ohe, cat, num.
preprocessor = ColumnTransformer(transformers=[
    ('ohe', ohe_pipe, ohe_cols),
    ('cat', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols),
])

In [27]:
# Fit every sub-pipeline of the ColumnTransformer on data_final.
preprocessor.fit(data_final)  # Fit the ColumnTransformer to your data
# Names of the columns the fitted preprocessor outputs, in output order.
transformed_columns = preprocessor.get_feature_names_out(input_features=data_final.columns)

In [28]:
# Display the output feature names produced by the fitted preprocessor.
transformed_columns

array(['ohe__religion_10', 'cat__sex', 'cat__income', 'cat__education',
       'cat__prior_surg', 'cat__succ_surg', 'num__age', 'num__odi_final',
       'num__bmi', 'num__dospert_ethical', 'num__dospert_financial',
       'num__dospert_health/safety', 'num__dospert_recreational',
       'num__dospert_social', 'num__height_m', 'num__weight_kg',
       'num__ADI_NATRANK', 'num__ADI_STATERNK'], dtype=object)

In [29]:
# Fit the preprocessor on data_final and return the transformed feature matrix.
# NOTE(review): the preprocessor was already fitted by the earlier
# preprocessor.fit(data_final) cell, so this fits it a second time — confirm
# whether preprocessor.transform(data_final) was intended.
processed_final = preprocessor.fit_transform(data_final)

In [30]:
# Wrap the transformed matrix in a DataFrame labelled with the preprocessor's
# output feature names. No index is passed here.
processed_final_df = pd.DataFrame(processed_final, columns=transformed_columns)

In [32]:
# Attach the target column by row position, not by index label.
# processed_final_df is constructed without an index, while data_final keeps the
# index read from the CSV; assigning the Series directly would align on labels
# and silently yield NaN or mismatched targets if those labels are not exactly
# 0..n-1 in order. The preprocessor preserves row order, so positional
# assignment pairs each feature row with its own target.
processed_final_df['spinal_risk_score'] = data_final['spinal_risk_score'].to_numpy()

In [33]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
processed_final_df.to_csv('../data/ml_data_processed_final.csv', index=False)