In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
class CR_data_clean_up_egnine(BaseEstimator, TransformerMixin):
    def __init__(self):
        import os
        import pickle
        #os.chdir(files_path)
        #self.cat_columns = ['Gender','Region','Smokes','Diet','Alcoholic','Complexion']
        #self.num_columns = ['Age','No of Ciggarets per day','No of Hrs Sleep per Day','No of Hrs Exercise per Day','Height']
        self.imputer_pipe = pickle.load(open('patient_imputer_July22_2021.pkl','rb'))
        self.encoder_scaler_pipe = pickle.load(open('patient_scaler_encoder_July22_2021.pkl','rb'))
        self.sgd_clf = pickle.load(open('sgd_clf_june6.pkl','rb'))
        #self.grid_rf_clf = pickle.load(open('grid_rf_june6.pkl','rb'))
    
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        df = X.copy()
        df.drop(columns=["Sl.No","Patient #"],inplace=True)
        
        try:
            df.drop(columns='Cancer Diagnosis Result',inplace=True)
        except:
            None
        
        #'Age' clean-up -- drop the records which are not between 0 & 100
        age_index = df[~df["Age"].between(0,100)].index
        print ("dropping the age_index records ... ",age_index)
        print (df.iloc[age_index]["Age"].values)
        df.drop(age_index,inplace=True)

        # 'No of Hrs Exercise per Day' clean-up -- remove all characters in a float
        #and convert the series to float type
        df['No of Hrs Exercise per Day'] = df['No of Hrs Exercise per Day'].\
                                            replace(r'\D+', '', regex=True).astype(float)

        #'Region' clean-up -- drop the records which are not standard
        region_index = df[~df["Region"].isin(['SI','NE','NI'])].index
        print ("dropping the region_index records ... ",region_index)
        df.drop(region_index,inplace=True)
        
        features_categoric = df.select_dtypes("object").columns
        
        imp_cols = self.imputer_pipe.named_steps['pimpute'].transformers_[0][2].tolist()
        ccols = self.imputer_pipe.named_steps['pimpute'].transformers_[1][2]
        imp_cols.extend(ccols)
        imputed_unseen_set = self.imputer_pipe.transform(df)
        imputed_unseen_set = pd.DataFrame(imputed_unseen_set,columns=imp_cols).infer_objects()
        
        enc_cols = self.encoder_scaler_pipe.named_steps['ptrans'].transformers_[0][2].tolist()
        ohcols = self.encoder_scaler_pipe.named_steps['ptrans'].transformers_[1][1].\
            named_steps['onehot'].get_feature_names(features_categoric).tolist()
        enc_cols.extend(ohcols)
        
        es_unseen_set = self.encoder_scaler_pipe.transform(imputed_unseen_set)
        es_unseen_set = pd.DataFrame(es_unseen_set,columns=enc_cols).infer_objects()
        
        print ("Saving clean imputed & encoded data to ... prepared_unseen_cr_data.xlsx file ",es_unseen_set.shape )
        es_unseen_set.to_excel("prepared_unseen_cr_data.xlsx",index=False)
     
        return es_unseen_set

cr_cleanup = CR_data_clean_up_egnine()
cr_cleanup.transform(pd.read_excel('cr_unseen_data.xlsx'))

dropping the age_index records ...  Int64Index([15, 25, 52], dtype='int64')
[ nan  nan 520.]
dropping the region_index records ...  Int64Index([], dtype='int64')
Saving clean imputed & encoded data to ... prepared_unseen_cr_data.xlsx file  (52, 20)


Unnamed: 0,Age,No of Ciggarets per day,No of Hrs Sleep per Day,No of Hrs Exercise per Day,Height,Region_NE,Region_NI,Region_SI,Gender_Female,Gender_Male,Smokes_NO,Smokes_YES,Diet_NonVegetarian,Diet_Vegetarian,Alcoholic_Occasional,Alcoholic_Regular,Complexion_Brown,Complexion_Dark,Complexion_Fair,Complexion_Wheatish
0,-1.201452,-0.060443,-0.014331,0.045468,1.118834,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.447637,-0.060443,-0.729919,0.055361,0.22145,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.646293,0.641444,0.701258,-0.054361,1.03099,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.95018,0.407482,1.416846,-0.018387,0.738713,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,-1.452723,1.460313,-1.445508,0.054461,-1.723191,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5,-0.615151,0.290501,-0.014331,-0.110122,0.997087,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
6,-1.368966,0.758426,-0.729919,-0.069651,0.61116,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
7,0.306178,0.524463,-0.014331,-0.183869,-0.210477,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
8,0.306178,-0.294405,-0.014331,20.671409,1.192941,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
9,0.054906,-0.294405,-1.445508,0.022084,-0.89887,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [6]:
import sklearn

In [9]:
print(sklearn.__version__)

0.23.2
