In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler


In [3]:
# Load the raw training data and drop the identifier columns in one expression,
# so re-running this cell always rebuilds `dataset` from the file.
dataset = pd.read_csv('train.csv').drop(columns=["PassengerId", "Name"])


In [5]:
# Preview the first five rows of the loaded frame.
dataset.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [47]:
def data_preprocessing_pipeline(data, fillnum: str = "mean", scaler: str = "standard",
                                iqrconst: float = 1.5, iqrquantiles=(0.25, 0.75),
                                exclude=None):
    """Impute, de-outlier and scale a DataFrame, returning a processed copy.

    Steps, in order:
      1. Numeric columns: fill missing values with the column mean / median / mode.
      2. Numeric columns: replace values outside the IQR fences with the column mean.
      3. Numeric columns: scale with StandardScaler or MinMaxScaler.
      4. Object (categorical) columns: fill missing values with the column mode.

    WARNING: Columns such as ids or names, which you do not consider informative,
    should be dropped before calling this function. Columns that must be kept but
    left untouched (for example the target label) can be listed in ``exclude``.

    Args:
        data (DataFrame): Input data. It is not modified; a copy is processed.
        fillnum (str, optional): How to fill missing numeric values:
            "mean" / "median" / "mode" ("mod" is accepted as an alias).
            Defaults to "mean".
        scaler (str, optional): Scaling method: "standard" / "minmax".
            Defaults to "standard".
        iqrconst (float, optional): Multiplier applied to the IQR when computing
            the outlier fences. Defaults to 1.5.
        iqrquantiles (sequence of two floats, optional): Lower and upper quantiles
            used for the IQR. Defaults to (0.25, 0.75).
        exclude (str or iterable of str, optional): Column name(s) to leave
            completely untouched. Defaults to None (process every column).

    Returns:
        DataFrame: A new, preprocessed DataFrame with the same columns and index.

    Raises:
        ValueError: If ``fillnum`` or ``scaler`` is not a supported option, or if
            ``iqrquantiles`` is not an ordered (lower, upper) pair.
    """
    def first_mode(frame):
        # DataFrame.mode() returns one row per tied mode; only the first row is a
        # usable per-column fill value. An empty result means no mode exists.
        modes = frame.mode()
        return modes.iloc[0] if len(modes) else None

    fill_strategies = {
        "mean": lambda frame: frame.mean(),
        "median": lambda frame: frame.median(),
        "mode": first_mode,
        "mod": first_mode,  # alias kept because earlier docs advertised "mod"
    }

    # Validate every option before touching the data so bad input fails fast.
    if fillnum not in fill_strategies:
        raise ValueError(
            f"fillnum must be one of {sorted(fill_strategies)}, got {fillnum!r}"
        )
    if scaler not in ("standard", "minmax"):
        raise ValueError(f"scaler must be 'standard' or 'minmax', got {scaler!r}")
    lower_q, upper_q = iqrquantiles
    if lower_q > upper_q:
        raise ValueError(
            f"iqrquantiles must be ordered (lower, upper), got {iqrquantiles!r}"
        )

    if exclude is None:
        untouched = set()
    elif isinstance(exclude, str):
        untouched = {exclude}
    else:
        untouched = set(exclude)

    # Identify numeric and categorical features. The dtype check catches every
    # numeric width (int32/int64/float32/...) regardless of platform; booleans
    # are flags, not measurements, so they are skipped.
    numeric_features = [
        column for column, dtype in data.dtypes.items()
        if column not in untouched
        and pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
    ]
    categorical_features = [
        column for column in data.select_dtypes(include=['object']).columns
        if column not in untouched
    ]

    # Work on a copy so the caller's frame stays raw and calls are repeatable.
    result = data.copy()

    if numeric_features:
        # Every later step yields floats; casting once avoids mixed-dtype writes.
        result = result.astype({column: "float64" for column in numeric_features})

        # Handle missing values in numeric features
        fill_values = fill_strategies[fillnum](result[numeric_features])
        if fill_values is not None:
            result[numeric_features] = result[numeric_features].fillna(fill_values)

        # Detect and handle outliers in numeric features using IQR
        for feature in numeric_features:
            column = result[feature]
            q_low = column.quantile(lower_q)
            q_high = column.quantile(upper_q)
            spread = q_high - q_low
            lower_bound = q_low - (iqrconst * spread)
            upper_bound = q_high + (iqrconst * spread)
            is_outlier = (column < lower_bound) | (column > upper_bound)
            # The replacement mean is computed on the column including outliers.
            result[feature] = column.mask(is_outlier, column.mean())

        # Normalize numeric features (fit and transform in a single pass)
        scaler_cls = StandardScaler if scaler == "standard" else MinMaxScaler
        result[numeric_features] = scaler_cls().fit_transform(result[numeric_features])

    # Handle missing values in categorical features
    if categorical_features:
        category_fill = first_mode(result[categorical_features])
        if category_fill is not None:
            result[categorical_features] = (
                result[categorical_features].fillna(category_fill)
            )

    return result

In [53]:
# Hand the pipeline its own copy so `dataset` stays raw and this cell can be
# re-run without re-processing already imputed/scaled values.
processed_data = data_preprocessing_pipeline(
    dataset.copy(),
    fillnum="median",
    scaler="standard",
    iqrconst=2,
    iqrquantiles=[0.30, 0.80],
)

In [55]:
# Show the pipeline result as this cell's output.
processed_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,-0.789272,0.827377,male,-1.101969,1.302222,-0.560499,A/5 21171,-0.918474,B96 B98,S
1,1.266990,-1.566107,female,-0.390370,1.302222,-0.560499,PC 17599,0.923286,C85,C
2,1.266990,0.827377,female,0.005683,-0.625340,-0.560499,STON/O2. 3101282,-0.868656,B96 B98,S
3,1.266990,-1.566107,female,-1.101969,1.302222,-0.560499,113803,2.465512,C123,S
4,-0.789272,0.827377,male,-1.101969,-0.625340,-0.560499,373450,-0.859430,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...
886,-0.789272,-0.369365,male,-2.534843,-0.625340,-0.560499,211536,-0.494092,B96 B98,S
887,1.266990,-1.566107,female,-1.101969,-0.625340,-0.560499,112053,0.760603,B42,S
888,-0.789272,0.827377,female,1.083580,1.302222,1.784124,W./C. 6607,0.277176,B96 B98,S
889,1.266990,-1.566107,male,0.005683,-0.625340,-0.560499,111369,0.760603,C148,C
