# preprocessing

In [1]:
from scipy.io.arff import loadarff
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
# Shared label encoder; imputation_scale re-fits it on the target column and on
# each nominal feature column to turn them into integer codes.
le = preprocessing.LabelEncoder()

In [2]:
# Read the ARFF file: `data` holds the records, `meta` the attribute metadata.
data, meta = loadarff('heart-c.arff')
# Wrap the records in a DataFrame (nominal values stay as bytes, see output).
data = pd.DataFrame(data)
# Preview the first rows.
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,b'male',b'typ_angina',145.0,233.0,b't',b'left_vent_hyper',150.0,b'no',2.3,b'down',0.0,b'fixed_defect',"b""'<50'"""
1,67.0,b'male',b'asympt',160.0,286.0,b'f',b'left_vent_hyper',108.0,b'yes',1.5,b'flat',3.0,b'normal',"b""'>50_1'"""
2,67.0,b'male',b'asympt',120.0,229.0,b'f',b'left_vent_hyper',129.0,b'yes',2.6,b'flat',2.0,b'reversable_defect',"b""'>50_1'"""
3,37.0,b'male',b'non_anginal',130.0,250.0,b'f',b'normal',187.0,b'no',3.5,b'down',0.0,b'normal',"b""'<50'"""
4,41.0,b'female',b'atyp_angina',130.0,204.0,b'f',b'left_vent_hyper',172.0,b'no',1.4,b'up',0.0,b'normal',"b""'<50'"""


# Missing values (methods: KNN, mean/median replacement, new category, MICE)

### Find the indices of the samples with missing values

In [73]:
# (row positions, column positions) of every NaN cell in the frame.
inds = np.asarray(data.isnull()).nonzero()
# Display the rows that contain a missing value.
data.iloc[inds[0]]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
166,52.0,b'male',b'non_anginal',138.0,223.0,b'f',b'normal',169.0,b'no',0.0,b'up',,b'normal',"b""'<50'"""
191,43.0,b'male',b'asympt',132.0,247.0,b't',b'left_vent_hyper',143.0,b'yes',0.1,b'flat',,b'reversable_defect',"b""'>50_1'"""
286,58.0,b'male',b'atyp_angina',125.0,220.0,b'f',b'normal',144.0,b'no',0.4,b'flat',,b'reversable_defect',"b""'<50'"""
301,38.0,b'male',b'non_anginal',138.0,175.0,b'f',b'normal',173.0,b'no',0.0,b'up',,b'normal',"b""'<50'"""
302,38.0,b'male',b'non_anginal',138.0,175.0,b'f',b'normal',173.0,b'no',0.0,b'up',,b'normal',"b""'<50'"""


In [37]:
def min_max_scale(X):
    """Linearly rescale X so that its values span [0, 1].

    For a DataFrame the minimum and maximum are taken per column; for a
    Series or NumPy array ``X.min()`` / ``X.max()`` cover all values.

    A feature with zero range (all values equal) is mapped to 0 instead of
    producing NaN from a 0/0 division.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or numpy.ndarray

    Returns
    -------
    Same type as X, rescaled.
    """
    lower = X.min()
    span = X.max() - lower
    if isinstance(span, pd.Series):
        # Per-column ranges: swap zero ranges for 1 so constant columns become 0.
        span = span.mask(span == 0, 1)
    elif np.ndim(span) == 0 and span == 0:
        span = 1
    X_scaled = (X - lower) / span
    return X_scaled
def normalisation(X):
    """Standardise X to zero mean and unit standard deviation per feature.

    Both the mean and the standard deviation are taken along axis 0, so a
    DataFrame and a 2-D NumPy array are each centred and scaled column by
    column. The standard deviation is whatever ``X.std(axis=0)`` returns for
    the given type.

    A feature with zero standard deviation (all values equal) is mapped to 0
    instead of producing NaN from a 0/0 division.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or numpy.ndarray

    Returns
    -------
    Same type as X, standardised.
    """
    centre = X.mean(axis=0)
    spread = X.std(axis=0)
    if isinstance(spread, pd.Series):
        spread = spread.mask(spread == 0, 1)
    elif np.ndim(spread) == 0:
        if spread == 0:
            spread = 1
    else:
        spread = np.where(spread == 0, 1, spread)
    X_normalised = (X - centre) / spread
    return X_normalised

In [62]:
def imputation_scale(data,imp_method,scale_method):
    """Encode, impute and scale the feature columns of `data`.

    The target column 'num' is label-encoded with the module-level encoder
    `le`; every feature column whose first value is `bytes` is label-encoded
    as well (this re-fits `le`, so its state after the call reflects the last
    nominal column). Columns whose first value is `np.float64` are kept as
    numeric. The returned frame lists the nominal columns first, then the
    numeric ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'num' column (the target).
    imp_method : str
        'as_category'  - fill missing values with the constant 4
        'replace_mean' - fill each numeric column with its own mean
        'replace_med'  - fill each numeric column with its own median
        'MICE'         - impute with MiceImputer
        'knn_<k>'      - impute with imputation_knn using k neighbours
                         (e.g. 'knn_1', 'knn_3'); this moves the imputed rows
                         to the end of both returned objects
    scale_method : str
        'min_max' or 'normalise'; any other value skips the final scaling step.

    Returns
    -------
    (data_X_complete, data_y) : the imputed/scaled features and encoded target.

    Raises
    ------
    ValueError
        If `imp_method` is not one of the methods listed above.
    """
    # Validate the method up front so a typo fails with a clear message
    # instead of an unbound local further down.
    knn_k = None
    if isinstance(imp_method, str) and imp_method.startswith('knn_'):
        suffix = imp_method[len('knn_'):]
        if not suffix.isdigit() or int(suffix) < 1:
            raise ValueError("invalid imp_method %r: expected 'knn_<k>' with k >= 1" % (imp_method,))
        knn_k = int(suffix)
    elif imp_method not in ('as_category', 'replace_mean', 'replace_med', 'MICE'):
        raise ValueError("unknown imp_method %r" % (imp_method,))

    data_y = le.fit_transform(data['num'])
    data_X = data.drop('num', axis=1)
    cols = data_X.columns

    # Classify columns by the type of their first value (positional access,
    # so it does not depend on the index containing the label 0).
    nom = [c for c in cols if isinstance(data_X[c].iloc[0], bytes)]
    num = [c for c in cols if isinstance(data_X[c].iloc[0], np.float64)]

    inds = np.asarray(data_X.isnull()).nonzero()

    X = pd.concat([data_X[nom].apply(le.fit_transform),data_X[num]],axis=1)

    if inds[1].size == 0:
        # Nothing is missing: skip imputation entirely.
        data_X_complete = X
    elif imp_method == 'as_category':
        # 4 is used as the extra "missing" category code.
        data_X_complete = X.fillna(4)
    elif imp_method == 'replace_mean':
        # A Series passed to fillna is matched to columns by name, so each
        # column is filled with its own mean.
        data_X_complete = X.fillna(X[num].mean())
    elif imp_method == 'replace_med':
        data_X_complete = X.fillna(X[num].median())
    elif imp_method == 'MICE':
        Mice = MiceImputer()
        data_X_complete = Mice.fit_transform(X)
    else:
        # KNN imputes the first column found to contain a missing value.
        var_name = cols[inds[1][0]]
        data_X_complete, data_y = imputation_knn(X,data_y,inds,var_name,knn_k,scale_method)

    if scale_method == 'min_max':
        data_X_complete = min_max_scale(data_X_complete)
    elif scale_method == 'normalise':
        data_X_complete = normalisation(data_X_complete)

    return data_X_complete, data_y

## KNN (k=1,3)

In [66]:
def imputation_knn(X,data_y,inds,var_name,k,scale_method):
    """Impute the missing values of column `var_name` with a k-NN classifier.

    The classifier is trained on the rows where `var_name` is present, using
    all other columns (scaled according to `scale_method`) as features, and
    predicts the value for the rows listed in `inds[0]`.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. NOTE(review): `inds[0]` is used both as index labels
        (`drop`) and as positions (`iloc`), so this assumes a default
        0..n-1 index - confirm for other callers.
    data_y : numpy.ndarray
        Target values aligned with the rows of X.
    inds : tuple of arrays
        Result of `np.asarray(X.isnull()).nonzero()`; `inds[0]` holds the
        rows with a missing value.
    var_name : column label
        The column to impute.
    k : int
        Number of neighbours.
    scale_method : str
        'min_max' or 'normalise' scale the features before the neighbour
        search; any other value uses them unscaled.

    Returns
    -------
    (data_X_complete, data_y)
        The unscaled features with `var_name` filled in, and the target.
        In both, the imputed rows are moved to the end and the frame's index
        is reset.
    """
    X_drop = X[X.columns.drop(var_name)]
    if scale_method == 'min_max':
        X_scaled = min_max_scale(X_drop)
    elif scale_method == 'normalise':
        X_scaled = normalisation(X_drop)
    else:
        # No recognised scaling requested: search neighbours on raw features.
        X_scaled = X_drop

    # Training set: rows where the target column is observed.
    y = X[var_name].drop(inds[0])
    X1 = X_scaled.drop(inds[0],axis=0)
    # Rows whose value has to be predicted.
    X2 = X_scaled.iloc[inds[0]]

    clf = KNeighborsClassifier(k)
    clf.fit(X1,y)
    y_missing = clf.predict(X2)

    # Observed values first, predictions last - the same order as the
    # re-assembled frame below.
    y_new = pd.DataFrame({var_name:np.append(np.asarray(y),y_missing)})

    data_X_complete = pd.concat([X.drop(inds[0],axis=0),X.iloc[inds[0]]]).reset_index(drop=True)
    data_X_complete.update(y_new)

    # Reorder the target the same way so rows and labels stay aligned.
    data_y = np.append(np.delete(data_y,inds[0]),data_y[inds[0]])

    return data_X_complete, data_y

## MICE

In [67]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import Imputer
import numpy as np
import pandas as pd

class MiceImputer:
    """Regression-based imputer for DataFrame columns that contain nulls.

    `fit` trains one model per column that has missing values:
    LinearRegression when the column has more than two distinct values,
    LogisticRegression otherwise. `transform` predicts the missing entries
    of those columns and returns a frame with the original column order.

    Parameters
    ----------
    seed_nulls : bool
        If False, only columns without any null are used as predictors.
        If True, the whole frame is first filled by an `Imputer` with
        `seed_strategy`, and the filled frame is used as predictors.
    seed_strategy : str
        Strategy passed to `Imputer` when `seed_nulls` is True.

    Attributes
    ----------
    model_dict_ : dict
        Maps each imputed column name to `[model]` or `[model, imputer]`.
        It is per-instance and rebuilt on every `fit`.
    """

    def __init__(self, seed_nulls=False, seed_strategy='mean'):
        self.seed_nulls = seed_nulls
        self.seed_strategy = seed_strategy
        # Per-instance state: a class-level dict would be shared by every
        # imputer and leak models from one fit into another.
        self.model_dict_ = {}
        # Predictor columns chosen by fit() when seed_nulls is False.
        self._feature_cols = None

    def transform(self, X):
        """Return a copy of X with the fitted columns' nulls predicted."""
        col_order = X.columns
        new_X = []
        mutate_cols = list(self.model_dict_.keys())

        for i in mutate_cols:
            y = X[i]
            null_mask = y.isnull()
            if not null_mask.any():
                # Nothing to predict for this column; pass it through.
                new_X.append(y.to_frame(name=i))
                continue

            x_null = X[null_mask]
            model = self.model_dict_.get(i)

            if self.seed_nulls:
                x_null = model[1].transform(x_null)
            elif self._feature_cols is not None:
                # Use exactly the predictor columns the model was fitted on.
                x_null = x_null[self._feature_cols]
            else:
                null_check = x_null.isnull().any()
                x_null = x_null[null_check.index[~null_check]]

            pred = pd.Series(model[0].predict(x_null), index=y.index[null_mask])
            # Predicted and observed values are re-aligned by index in the
            # column-wise concat below.
            new_X.append(pd.concat([pred, y[~null_mask]], axis=0).to_frame(name=i))

        new_X.append(X[X.columns.difference(mutate_cols)])

        final = pd.concat(new_X, axis=1)[col_order]

        return final

    def fit(self, X):
        """Fit one model per column of X that contains nulls; return self."""
        x = X.fillna(value=np.nan)

        null_check = x.isnull().any()
        null_data = x[null_check.index[null_check]]
        complete_cols = null_check.index[~null_check]

        # Build into a fresh dict so a re-fit never keeps stale models.
        models = {}
        for i in null_data:
            y = null_data[i]
            observed = y.notnull()
            y_notnull = y[observed]

            model_list = []
            if self.seed_nulls:
                imp = Imputer(strategy=self.seed_strategy)
                # Keep the original row index so the boolean mask aligns.
                non_null_data = pd.DataFrame(imp.fit_transform(x), index=x.index)
                model_list.append(imp)
            else:
                non_null_data = x[complete_cols]

            x_notnull = non_null_data[observed]

            if y.nunique() > 2:
                model = LinearRegression()
            else:
                model = LogisticRegression()
            model.fit(x_notnull, y_notnull)
            model_list.insert(0, model)
            models[i] = model_list

        self.model_dict_ = models
        self._feature_cols = None if self.seed_nulls else complete_cols

        return self

    def fit_transform(self, X):
        """Fit on X, then impute X."""
        return self.fit(X).transform(X)

# test

In [68]:
# Impute with the 'knn_1' method and apply 'min_max' scaling.
X, y = imputation_scale(data,'knn_1','min_max')

In [71]:
X.head()

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,thal,age,trestbps,chol,thalach,oldpeak,ca
0,1.0,1.0,1.0,0.0,0.0,0.0,0.333333,0.708333,0.481132,0.244292,0.603053,0.370968,0.0
1,1.0,0.0,0.0,0.0,1.0,0.5,0.666667,0.791667,0.622642,0.365297,0.282443,0.241935,1.0
2,1.0,0.0,0.0,0.0,1.0,0.5,1.0,0.791667,0.245283,0.23516,0.442748,0.419355,0.666667
3,1.0,0.666667,0.0,0.5,0.0,0.0,0.666667,0.166667,0.339623,0.283105,0.885496,0.564516,0.0
4,0.0,0.333333,0.0,0.0,0.0,1.0,0.666667,0.25,0.339623,0.178082,0.770992,0.225806,0.0


In [74]:
y

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0,