## Powerful Python Functions for Data Science

All the necessary data science functions are collected in a class called `variableTreatment` (instantiated below as `VT`).

1. drop_nan_col: Drops columns whose fraction of missing rows exceeds a given threshold
2. drop_zero_var_col (for numeric): Drop all the columns with 0 variance
3. drop_zero_car_col (for categorical): Drops categorical columns with same levels, such as a column with all 'yes' values
4. drop_high_levels (for categorical): Drops categorical columns that have more levels than the given threshold
5. replace_missing
    1. num_val: The value used to replace missing numerical values. It can be the mean, median, mode, or zero
    2. cat_val: The value used to replace missing categorical values. It can be the mode or 'unknown'
6. encode_target: Encodes the class label if the class column is categorical. If the class column is numerical, the same dataframe is returned unchanged. Note that the class label might have more than 2 levels (yes and no are two levels); target levels can be, for example, strongly agree, agree, neutral, disagree, strongly disagree (5 levels)
7. transform: Transforms numerical values in a way that it will increase model accuracy. 
              Transformations: {'asis','log','exp','sqrt','pow2'}
8. create_dummies: Creates dummy variables for categorical variables

In [93]:
class variableTreatment():
    """
    Collection of pandas DataFrame clean-up helpers for model preparation.

    Every method takes a DataFrame and returns a DataFrame. All methods except
    create_dummies modify the DataFrame that is passed in (in place) and return
    that same object; create_dummies returns a new DataFrame when it has
    something to encode.

    The methods rely on the module-level names ``pd`` (pandas) and ``np``
    (numpy) being imported before they are called.
    """

    # dtypes treated as "numerical" / "categorical" by every method below
    _NUMERIC_DTYPES = ['float64', 'float32', 'int']
    _CATEGORICAL_DTYPES = ['object']

    def drop_nan_col(self, df, threshold):
        """
        Objective: Drops columns most of whose rows are missing

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. threshold: Determines which columns will be dropped.
                      if threshold is .9, the columns with more than 90% missing
                      values will be dropped

        Outputs:
        1. Dataframe df with dropped columns (if no columns are dropped, the same
           dataframe is returned)
        """
        # mean() of the boolean mask is the missing fraction per column. On a
        # dataframe with zero rows it is NaN, which never exceeds the threshold,
        # so nothing is dropped (and no division by zero happens).
        missing_ratio = df.isnull().mean()
        nans = missing_ratio[missing_ratio > threshold].index.tolist()

        df.drop(nans, axis=1, inplace=True)

        return df

    def drop_zero_var_col(self, df):
        """
        Objective: Drops numerical columns with zero variance

        Inputs:
        1. Dataframe df: Pandas dataframe

        Outputs:
        1. Dataframe df with dropped columns (if no columns are dropped, the same
           dataframe is returned)
        """
        numeric_cols = df.select_dtypes(include=self._NUMERIC_DTYPES).columns

        # Exactly one distinct non-missing value means zero variance. Counting
        # distinct values avoids the rounding error of comparing a floating
        # point standard deviation with 0 (e.g. a column of 0.1s). Columns that
        # are entirely missing have no distinct value and are kept.
        zeros = [col for col in numeric_cols
                 if df[col].nunique(dropna=True) == 1]
        df.drop(zeros, axis=1, inplace=True)

        return df

    def drop_zero_car_col(self, df):
        """
        Objective: Drops categorical columns with same levels, such as a column
                   with all 'yes' values

        Inputs:
        1. Dataframe df: Pandas dataframe

        Outputs:
        1. Dataframe df with dropped columns (if no columns are dropped, the same
           dataframe is returned)
        """
        categorical_cols = df.select_dtypes(include=self._CATEGORICAL_DTYPES).columns

        # A missing value counts as a level of its own here, so a column holding
        # 'yes' and NaN is kept.
        car_zero = [col for col in categorical_cols
                    if df[col].nunique(dropna=False) < 2]

        df.drop(car_zero, axis=1, inplace=True)
        return df

    def drop_high_levels(self, df, threshold):
        """
        Objective: Drops categorical columns that have too many levels

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. threshold: How many levels you want at most

        Outputs:
        1. Dataframe df without the dropped columns (if no columns are dropped,
           the same dataframe is returned)
        """
        categorical_cols = df.select_dtypes(include=self._CATEGORICAL_DTYPES).columns

        # nunique() ignores missing values, so NaN is not counted as a level.
        lev_counts = [col for col in categorical_cols
                      if df[col].nunique() > threshold]
        df.drop(lev_counts, axis=1, inplace=True)

        return df

    def _column_modes(self, frame):
        """
        Returns a Series mapping each column of `frame` to its (smallest) mode.
        Columns without any non-missing value are left out.
        """
        modes = frame.mode()
        if modes.empty:
            return pd.Series(dtype=float)
        # mode() returns one row per tied mode; the first row holds one mode
        # for every column that has at least one non-missing value.
        return modes.iloc[0].dropna()

    def replace_missing(self, df, num_val, cat_val='unknown'):
        """
        Objective: Replaces missing values with given values

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. num_val: How missing numerical values are replaced.
                    'mean', 'median' or 'mode' use that statistic of each column;
                    any other value replaces them with zero
        3. cat_val: How missing categorical values are replaced.
                    'mode' uses the most frequent level of each column;
                    any other value is used as the replacement itself
                    (default 'unknown')

        Outputs:
        1. Dataframe df with imputed missing values
        """
        num_cols = df.select_dtypes(include=self._NUMERIC_DTYPES).columns.tolist()
        cat_cols = df.select_dtypes(include=self._CATEGORICAL_DTYPES).columns.tolist()

        if num_cols:
            if num_val == 'mode':
                # A per-column Series is needed: passing the DataFrame returned
                # by mode() would align on the row index and fill row 0 only.
                num_fill = self._column_modes(df[num_cols])
            elif num_val == 'mean':
                num_fill = df[num_cols].mean()
            elif num_val == 'median':
                num_fill = df[num_cols].median()
            else:
                num_fill = 0

            if not (isinstance(num_fill, pd.Series) and num_fill.empty):
                # fillna must return the filled copy (no inplace=True), because
                # the result is assigned back to the selected columns.
                df[num_cols] = df[num_cols].fillna(value=num_fill)

        if cat_cols:
            if cat_val == 'mode':
                cat_fill = self._column_modes(df[cat_cols])
            else:
                cat_fill = cat_val

            if not (isinstance(cat_fill, pd.Series) and cat_fill.empty):
                df[cat_cols] = df[cat_cols].fillna(value=cat_fill)

        return df

    def encode_target(self, df, target_name):
        """
        Objective: Encodes the class label if class column is categorical.
                   If class column is numerical the same dataframe is returned
                   without doing anything.
                   The class label may have any number of levels; the most
                   frequent level is encoded as 0.0, the next one as 1.0, ...
                   Missing labels stay missing.

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. target_name: Name of the class label column

        Outputs:
        1. Dataframe df with the class label encoded as floats
        """
        if df[target_name].dtype == 'object':
            target_levels_cat = df[target_name].value_counts().index.tolist()
            level_to_code = {level: code
                             for code, level in enumerate(target_levels_cat)}

            # Map all levels in a single pass. Rewriting the column level by
            # level would re-encode values that were already replaced whenever
            # an original level equals one of the codes (e.g. a level 0 or 1).
            df[target_name] = df[target_name].map(level_to_code).astype(float)

        return df

    def _abs_corr(self, values, label):
        """
        Returns the absolute Pearson correlation of `values` and `label`,
        or 0.0 when it is undefined (constant or missing data).
        """
        with np.errstate(divide='ignore', invalid='ignore'):
            score = abs(np.corrcoef(values, label)[1][0])
        return 0.0 if np.isnan(score) else score

    def transform(self, df, label_name):
        """
        Objective: Transforms numerical values in a way that it will increase
                   model accuracy. For every numerical column the transformation
                   out of {'asis', 'log', 'exp', 'sqrt', 'pow2'} with the highest
                   absolute correlation with the label is applied.
                   'log' and 'sqrt' are only considered for columns without
                   negative values; 'exp' and 'pow2' are applied to the
                   standardized column.

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. label_name: Name of the class label column (must be numerical)

        Outputs:
        1. Dataframe df with transformed values
        """
        label = df[label_name]
        numeric_cols = df.select_dtypes(include=self._NUMERIC_DTYPES).columns

        for col in numeric_cols:
            if col == label_name:
                continue

            values = df[col]
            non_negative = bool((values >= 0).all())
            std = values.std()
            # Standardizing is impossible for a constant column (std of 0) or
            # when the std is undefined; 'exp' and 'pow2' are skipped then.
            standardized = (values - values.mean()) / std if std > 0 else None

            # Candidates are listed as asis, log, exp, sqrt, pow2. The order
            # matters: on a tie the earliest candidate wins.
            candidates = [
                values,
                np.log(values + 0.00001) if non_negative else None,
                np.exp(standardized) if standardized is not None else None,
                np.sqrt(values + 0.00001) if non_negative else None,
                np.power(standardized, 2) if standardized is not None else None,
            ]

            best, best_score = values, -1.0
            for candidate in candidates:
                # A transformation that is not applicable scores 0.
                score = 0.0 if candidate is None else self._abs_corr(candidate, label)
                if score > best_score:
                    best, best_score = candidate, score

            # When nothing correlates with the label, 'asis' is selected and
            # the column keeps its values.
            df[col] = best

        return df

    def create_dummies(self, df, label_name):
        """
        Objective: Creates dummy variables for categorical variables. The first
                   level of every variable is dropped and the class label column
                   is never encoded.

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. label_name: Name of the class label column

        Outputs:
        1. Dataframe df with dummy variables (if there is no categorical input
           column, the same dataframe is returned)
        """
        # Keep the dataframe's own column order so the result is deterministic.
        cat_input_var = [col for col in df.select_dtypes(include=self._CATEGORICAL_DTYPES).columns
                         if col != label_name]

        if not cat_input_var:
            return df

        dummy_cat_df = pd.get_dummies(df[cat_input_var], drop_first=True)

        return pd.concat([df.drop(cat_input_var, axis=1), dummy_cat_df], axis=1)
        

In [94]:
import pandas as pd
import numpy as np
# Load the example dataset from the Excel workbook 'CKD.xlsx' (path is relative
# to the working directory) into the dataframe used by all cells below.
df = pd.read_excel('CKD.xlsx')

In [95]:
df[:1]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,appet,pe,ane,class,yls,ccm,mls,lev_a,lev_b,l_x
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,good,no,no,ckd,1.0,2.0,3,a1,b1,d


In [96]:
# Create an instance of the variableTreatment class; its methods are called
# through VT in the cells below.
VT = variableTreatment()

In [97]:
VT

<__main__.variableTreatment at 0x2862ce10550>

In [99]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,appet,pe,ane,class,yls,ccm,mls,lev_a,lev_b,l_x
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,good,no,no,ckd,1.0,2.0,3,a1,b1,d
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,118.0,...,good,no,no,ckd,5.0,,3,a2,b2,d
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,poor,no,yes,ckd,,,3,a3,b3,d
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,poor,yes,yes,ckd,,,3,a4,b4,d
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,good,no,no,ckd,,3.0,3,a5,b5,d


In [101]:
VT.drop_nan_col(df, 0.1).head()

Unnamed: 0,age,bp,pcc,ba,bgr,bu,sc,htn,dm,cad,appet,pe,ane,class,mls,lev_a,lev_b,l_x
0,48.0,80.0,notpresent,notpresent,121.0,36.0,1.2,yes,yes,no,good,no,no,ckd,3,a1,b1,d
1,7.0,50.0,notpresent,notpresent,118.0,18.0,0.8,no,no,no,good,no,no,ckd,3,a2,b2,d
2,62.0,80.0,notpresent,notpresent,423.0,53.0,1.8,no,yes,no,poor,no,yes,ckd,3,a3,b3,d
3,48.0,70.0,present,notpresent,117.0,56.0,3.8,yes,no,no,poor,yes,yes,ckd,3,a4,b4,d
4,51.0,80.0,notpresent,notpresent,106.0,26.0,1.4,no,no,no,good,no,no,ckd,3,a5,b5,d


In [103]:
df.shape

(387, 18)

In [104]:
VT.drop_zero_var_col(df).head()

Unnamed: 0,age,bp,pcc,ba,bgr,bu,sc,htn,dm,cad,appet,pe,ane,class,mls,lev_a,lev_b,l_x
0,48.0,80.0,notpresent,notpresent,121.0,36.0,1.2,yes,yes,no,good,no,no,ckd,3,a1,b1,d
1,7.0,50.0,notpresent,notpresent,118.0,18.0,0.8,no,no,no,good,no,no,ckd,3,a2,b2,d
2,62.0,80.0,notpresent,notpresent,423.0,53.0,1.8,no,yes,no,poor,no,yes,ckd,3,a3,b3,d
3,48.0,70.0,present,notpresent,117.0,56.0,3.8,yes,no,no,poor,yes,yes,ckd,3,a4,b4,d
4,51.0,80.0,notpresent,notpresent,106.0,26.0,1.4,no,no,no,good,no,no,ckd,3,a5,b5,d


In [105]:
df.shape

(387, 18)

In [106]:
VT.drop_zero_car_col(df).head()

Unnamed: 0,age,bp,pcc,ba,bgr,bu,sc,htn,dm,cad,appet,pe,ane,class,mls,lev_a,lev_b
0,48.0,80.0,notpresent,notpresent,121.0,36.0,1.2,yes,yes,no,good,no,no,ckd,3,a1,b1
1,7.0,50.0,notpresent,notpresent,118.0,18.0,0.8,no,no,no,good,no,no,ckd,3,a2,b2
2,62.0,80.0,notpresent,notpresent,423.0,53.0,1.8,no,yes,no,poor,no,yes,ckd,3,a3,b3
3,48.0,70.0,present,notpresent,117.0,56.0,3.8,yes,no,no,poor,yes,yes,ckd,3,a4,b4
4,51.0,80.0,notpresent,notpresent,106.0,26.0,1.4,no,no,no,good,no,no,ckd,3,a5,b5


In [107]:
df.shape

(387, 17)

In [108]:
VT.drop_high_levels(df, 100).head()

Unnamed: 0,age,bp,pcc,ba,bgr,bu,sc,htn,dm,cad,appet,pe,ane,class,mls
0,48.0,80.0,notpresent,notpresent,121.0,36.0,1.2,yes,yes,no,good,no,no,ckd,3
1,7.0,50.0,notpresent,notpresent,118.0,18.0,0.8,no,no,no,good,no,no,ckd,3
2,62.0,80.0,notpresent,notpresent,423.0,53.0,1.8,no,yes,no,poor,no,yes,ckd,3
3,48.0,70.0,present,notpresent,117.0,56.0,3.8,yes,no,no,poor,yes,yes,ckd,3
4,51.0,80.0,notpresent,notpresent,106.0,26.0,1.4,no,no,no,good,no,no,ckd,3


In [109]:
df.shape

(387, 15)

In [111]:
VT.replace_missing(df, 'median').head()

Unnamed: 0,age,bp,pcc,ba,bgr,bu,sc,htn,dm,cad,appet,pe,ane,class,mls
0,48.0,80.0,notpresent,notpresent,121.0,36.0,1.2,yes,yes,no,good,no,no,ckd,3
1,7.0,50.0,notpresent,notpresent,118.0,18.0,0.8,no,no,no,good,no,no,ckd,3
2,62.0,80.0,notpresent,notpresent,423.0,53.0,1.8,no,yes,no,poor,no,yes,ckd,3
3,48.0,70.0,present,notpresent,117.0,56.0,3.8,yes,no,no,poor,yes,yes,ckd,3
4,51.0,80.0,notpresent,notpresent,106.0,26.0,1.4,no,no,no,good,no,no,ckd,3


In [112]:
VT.encode_target(df, 'class').head()

Unnamed: 0,age,bp,pcc,ba,bgr,bu,sc,htn,dm,cad,appet,pe,ane,class,mls
0,48.0,80.0,notpresent,notpresent,121.0,36.0,1.2,yes,yes,no,good,no,no,0.0,3
1,7.0,50.0,notpresent,notpresent,118.0,18.0,0.8,no,no,no,good,no,no,0.0,3
2,62.0,80.0,notpresent,notpresent,423.0,53.0,1.8,no,yes,no,poor,no,yes,0.0,3
3,48.0,70.0,present,notpresent,117.0,56.0,3.8,yes,no,no,poor,yes,yes,0.0,3
4,51.0,80.0,notpresent,notpresent,106.0,26.0,1.4,no,no,no,good,no,no,0.0,3


In [67]:
VT.transform(df, 'class').head()

Unnamed: 0,age,bp,pcc,ba,bgr,bu,sc,htn,dm,cad,appet,pe,ane,class,mls
0,48.0,80.0,notpresent,notpresent,4.795791,6.000001,0.18233,yes,yes,no,good,no,no,0.0,3
1,7.0,50.0,notpresent,notpresent,4.770685,4.242642,-0.223131,no,no,no,good,no,no,0.0,3
2,62.0,80.0,notpresent,notpresent,6.047372,7.280111,0.587792,no,yes,no,poor,no,yes,0.0,3
3,48.0,70.0,present,notpresent,4.762174,7.483315,1.335004,yes,no,no,poor,yes,yes,0.0,3
4,51.0,80.0,notpresent,notpresent,4.663439,5.09902,0.336479,no,no,no,good,no,no,0.0,3


In [115]:
df.shape

(387, 15)

In [113]:
VT.create_dummies(df, 'class').head()

Unnamed: 0,age,bp,bgr,bu,sc,class,mls,dm_unknown,dm_yes,pe_no,...,htn_yes,ba_present,ba_unknown,appet_no,appet_poor,appet_unknown,cad_unknown,cad_yes,pcc_present,pcc_unknown
0,48.0,80.0,121.0,36.0,1.2,0.0,3,0,1,1,...,1,0,0,0,0,0,0,0,0,0
1,7.0,50.0,118.0,18.0,0.8,0.0,3,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,62.0,80.0,423.0,53.0,1.8,0.0,3,0,1,1,...,0,0,0,0,1,0,0,0,0,0
3,48.0,70.0,117.0,56.0,3.8,0.0,3,0,0,0,...,1,0,0,0,1,0,0,0,1,0
4,51.0,80.0,106.0,26.0,1.4,0.0,3,0,0,1,...,0,0,0,0,0,0,0,0,0,0
