In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
def data_cleaning(df, target='last'):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. drop fully duplicated rows (the first occurrence is kept);
      2. drop rows whose target value is missing;
      3. strip the characters ``~!@#$%^&*()_+/`` from string cells;
      4. renumber the index 0..n-1 so it has no gaps after the row drops.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data.
    target : str, default 'last'
        Name of the target column. The special value 'last' selects the
        last column of ``df``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    target_col = df.columns[-1] if target == 'last' else target
    # Non-inplace calls: each step returns a new frame, so the caller's
    # object is never half-modified.
    cleaned = df.drop_duplicates(subset=None, keep='first')
    cleaned = cleaned.dropna(subset=[target_col])
    cleaned = cleaned.replace('[~!@#$%^&*()_+/]', '', regex=True)
    # Reset last, otherwise the rows removed above leave holes in the index.
    return cleaned.reset_index(drop=True)

In [3]:
def Null_checking(df):
    """Summarise missing values per column.

    Returns a DataFrame with one column per column of ``df`` and two rows:
    'null_count' (number of nulls) and 'null_percentage' (nulls as a
    percentage of ``len(df)``).
    """
    total_rows = len(df)
    summary = {}
    for name in df.columns:
        missing = df[name].isnull().sum()
        summary[name] = [missing, missing * 100 / total_rows]
    return pd.DataFrame(summary, index=['null_count', 'null_percentage'])

In [4]:
def Excessive_Null(df,percent_limit=70):
    """Drop, in place, every column with more than ``percent_limit`` % nulls.

    The same (mutated) frame is returned for convenience.
    """
    max_nulls = percent_limit * len(df) / 100
    too_sparse = [name for name in df.columns
                  if df[name].isnull().sum() > max_nulls]
    df.drop(columns=too_sparse, inplace=True)
    return df

In [5]:
def uniqueness_checking(df):
    """Summarise cardinality per column.

    Returns a DataFrame with one column per column of ``df`` and three rows:
    'unique_count' (number of distinct values as reported by ``unique()``),
    'uniqueness_percentage' (that count as a percentage of ``len(df)``) and
    'dtype' (the column's dtype).
    """
    total_rows = len(df)
    report = {}
    for name in df.columns:
        distinct = df[name].unique().shape[0]
        report[name] = [distinct, distinct * 100 / total_rows, df[name].dtypes]
    return pd.DataFrame(report, index=['unique_count','uniqueness_percentage', 'dtype'])

In [6]:
def severe_unique_int_obj_drop(df,percent_limit=99):
    """Drop, in place, near-unique non-float columns (IDs, names, ...).

    A column is dropped when it is not of a floating-point dtype and its
    number of distinct values exceeds ``percent_limit`` % of the row count.
    Any float dtype (float16/32/64 and pandas nullable floats) is kept, not
    only 'float64'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; modified in place.
    percent_limit : float, default 99
        Uniqueness percentage above which a non-float column is dropped.

    Returns
    -------
    pandas.DataFrame
        The same frame object, after the drops.
    """
    max_distinct = percent_limit * len(df) / 100
    id_like = [
        col for col in df.columns
        if not pd.api.types.is_float_dtype(df[col])
        and df[col].unique().shape[0] > max_distinct
    ]
    df.drop(columns=id_like, inplace=True)
    return df

In [7]:
def drop_cols(df, cols=None):
    """Drop the listed columns from ``df`` in place and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    cols : iterable of column labels, optional
        Columns to remove. ``None`` (the default) or an empty iterable
        leaves the frame unchanged.

    Raises
    ------
    KeyError
        If a label in ``cols`` is not a column of ``df``; in that case no
        column is removed.
    """
    # ``None`` instead of a mutable ``[]`` default; one drop call instead of
    # one per column.
    labels = [] if cols is None else list(cols)
    if labels:
        df.drop(columns=labels, inplace=True)
    return df

In [8]:
def Null_filling_num(df,fill_by='mean'):
    """Fill missing values in the numeric columns of ``df`` (in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; its numeric columns are overwritten.
    fill_by : {'mean', 'median', 'mode', 'interpolate'}, default 'mean'
        Statistic / method used to fill each numeric column that has nulls.

    Returns
    -------
    pandas.DataFrame
        The same frame object.

    Raises
    ------
    ValueError
        If ``fill_by`` is not one of the supported strategies.
    """
    strategies = ('mean', 'median', 'mode', 'interpolate')
    if fill_by not in strategies:
        raise ValueError(
            "fill_by must be one of %s, got %r" % (strategies, fill_by))

    for col in df.columns:
        series = df[col]
        if not pd.api.types.is_numeric_dtype(series) or not series.isnull().any():
            continue

        if fill_by == 'mean':
            filled = series.fillna(series.mean())
        elif fill_by == 'median':
            filled = series.fillna(series.median())
        elif fill_by == 'mode':
            modes = series.mode()
            if modes.empty:
                # Column is entirely null: there is no mode to fill with.
                continue
            filled = series.fillna(modes.iloc[0])
        else:
            filled = series.fillna(series.interpolate())

        # Assign back explicitly: ``df[col].fillna(..., inplace=True)`` acts on
        # a temporary object and leaves ``df`` unchanged under Copy-on-Write.
        df[col] = filled
    return df

In [9]:
def Null_filling_cat(df,fill_by='mode'):
    """Fill missing values in the text/object columns of ``df`` (in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; its object/string columns are overwritten.
    fill_by : {'mode', 'ffill', 'bfill'}, default 'mode'
        'mode' fills with the most frequent value, 'ffill' / 'bfill'
        propagate the previous / next valid value.

    Returns
    -------
    pandas.DataFrame
        The same frame object.

    Raises
    ------
    ValueError
        If ``fill_by`` is not one of the supported strategies.
    """
    strategies = ('mode', 'ffill', 'bfill')
    if fill_by not in strategies:
        raise ValueError(
            "fill_by must be one of %s, got %r" % (strategies, fill_by))

    for col in df.columns:
        series = df[col]
        is_text = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))
        if not is_text or not series.isnull().any():
            continue

        if fill_by == 'mode':
            modes = series.mode()
            if modes.empty:
                # Column is entirely null: nothing to fill with.
                continue
            filled = series.fillna(modes.iloc[0])
        elif fill_by == 'ffill':
            # ``fillna(method=...)`` is deprecated; use the dedicated methods.
            filled = series.ffill()
        else:
            filled = series.bfill()

        # Assign back explicitly: an in-place fill on ``df[col]`` does not
        # reach ``df`` under Copy-on-Write.
        df[col] = filled
    return df

In [10]:
def skew_handling(df, trans= 'yeojohnson', exclude=None, threshold=1):
    """Reduce skewness of the numeric columns of ``df`` (in place).

    Every numeric column whose absolute skewness exceeds ``threshold`` is
    transformed. 'sqrt' needs non-negative data and 'log' / 'boxcox' need
    strictly positive data; when a column violates that requirement the
    Yeo-Johnson transform is used for that column instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; affected columns are overwritten.
    trans : {'yeojohnson', 'sqrt', 'log', 'boxcox'}, default 'yeojohnson'
        Preferred transformation.
    exclude : str or iterable of column labels, optional
        Columns that must never be transformed (e.g. the target column).
        By default nothing is excluded.
    threshold : float, default 1
        Absolute-skewness level above which a column is transformed.

    Returns
    -------
    pandas.DataFrame
        The same frame object.

    Raises
    ------
    ValueError
        If ``trans`` is not one of the supported transformations.

    Notes
    -----
    Assumes nulls were filled beforehand; behaviour of the scipy transforms
    on NaN input is not handled here.
    """
    transforms = ('yeojohnson', 'sqrt', 'log', 'boxcox')
    if trans not in transforms:
        raise ValueError(
            "trans must be one of %s, got %r" % (transforms, trans))

    if exclude is None:
        skipped = set()
    elif isinstance(exclude, str):
        skipped = {exclude}
    else:
        skipped = set(exclude)

    for col in df.columns:
        if col in skipped or not pd.api.types.is_numeric_dtype(df[col]):
            continue
        values = df[col]
        # ``not (... > threshold)`` also skips columns whose skewness is NaN.
        if not abs(values.skew()) > threshold:
            continue

        # One vectorised minimum replaces a Python loop over unique values.
        lowest = values.min()
        if trans == 'sqrt' and lowest >= 0:
            df[col] = np.sqrt(values)
        elif trans == 'log' and lowest > 0:
            df[col] = np.log(values)
        elif trans == 'boxcox' and lowest > 0:
            transformed, _ = stats.boxcox(values.to_numpy(dtype=float))
            df[col] = transformed
        else:
            # Requested Yeo-Johnson, or fallback because the data is outside
            # the domain of the requested transform.
            transformed, _ = stats.yeojohnson(values.to_numpy(dtype=float))
            df[col] = transformed
    return df

In [11]:
def remove_outliers_interquartile(qu_dataset, qu_field, qu_fence='inner'):
    """Remove rows whose ``qu_field`` value lies outside the IQR fences.

    Parameters
    ----------
    qu_dataset : pandas.DataFrame
        Input data; not modified.
    qu_field : column label
        Numeric column the fences are computed on.
    qu_fence : {'inner', 'outer'}, default 'inner'
        'inner' keeps values within Q1 - 1.5*IQR .. Q3 + 1.5*IQR,
        'outer' keeps values within Q1 - 3*IQR .. Q3 + 3*IQR.

    Returns
    -------
    (dict, pandas.DataFrame)
        A summary with keys 'initial_length', 'updated_length' and
        'percent_removed(%)', and the filtered frame. Rows whose
        ``qu_field`` is null fail both comparisons and are removed too.

    Raises
    ------
    ValueError
        If ``qu_fence`` is neither 'inner' nor 'outer'.
    """
    fence_multipliers = {'inner': 1.5, 'outer': 3}
    if qu_fence not in fence_multipliers:
        raise ValueError(
            "qu_fence must be 'inner' or 'outer', got %r" % (qu_fence,))

    values = qu_dataset[qu_field]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    reach = fence_multipliers[qu_fence] * (upper_quartile - lower_quartile)

    within_fences = (values <= upper_quartile + reach) & (values >= lower_quartile - reach)
    output_dataset = qu_dataset[within_fences]

    initial_length = len(qu_dataset)
    # Guard the empty-input case instead of dividing by zero.
    if initial_length:
        percent_removed = 100 * (initial_length - len(output_dataset)) / initial_length
    else:
        percent_removed = 0.0
    outlier_dict = {'initial_length': initial_length,
                    'updated_length': len(output_dataset),
                    'percent_removed(%)': percent_removed}

    return outlier_dict,output_dataset

In [12]:
# Turn every categorical column into numerical columns
# by creating dummy (one-hot) variables.

In [13]:
def create_dummies(data, coverage=0.9):
    """One-hot encode the text/object columns of ``data``.

    For each such column the most frequent categories that together cover
    at least ``coverage`` of the rows are kept; every remaining (rare)
    category is merged into the most frequent one. The column is then
    replaced by its dummy columns (first level dropped), appended at the
    end of the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data; not modified.
    coverage : float, default 0.9
        Fraction of rows the retained categories must cover.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the text columns are replaced by dummies.
    """
    data = data.copy()
    min_rows_covered = coverage * data.shape[0]

    for col in list(data.columns):
        column = data[col]
        if not (pd.api.types.is_object_dtype(column)
                or pd.api.types.is_string_dtype(column)):
            continue

        # value_counts() is sorted by frequency, so categories and their
        # cumulative counts come from the same ordering.
        counts = column.value_counts()
        cumulative = counts.cumsum().to_numpy()
        # First position whose cumulative count reaches the target. If nulls
        # keep it from ever being reached, all categories are retained.
        n_keep = int(np.searchsorted(cumulative, min_rows_covered, side='left')) + 1
        rare = counts.index[n_keep:]
        if len(rare):
            column = column.mask(column.isin(rare), counts.index[0])

        dummies = pd.get_dummies(column, prefix=col, drop_first=True)
        data = pd.concat([data.drop(columns=col), dummies], axis=1)

    return data

In [14]:
def standard_scaling(x):
    """Fit a fresh ``StandardScaler`` on ``x`` and return the transformed data."""
    return StandardScaler().fit_transform(x)

def normal_scaling(x):
    """Fit a fresh ``MinMaxScaler`` on ``x`` and return the transformed data."""
    return MinMaxScaler().fit_transform(x)

def scaling(x):
    """Scale ``x``, choosing the scaler from the columns' skewness.

    If any column has absolute skewness above 2, min-max scaling
    (``normal_scaling``) is applied; otherwise the data is standardized
    (``standard_scaling``).
    """
    skew_magnitudes = [abs(x[name].skew()) for name in x.columns]
    heavily_skewed = any(magnitude > 2 for magnitude in skew_magnitudes)
    if heavily_skewed:
        return normal_scaling(x)
    return standard_scaling(x)

In [15]:
def vif_checking(dfx):
    """Return ``{column name: variance inflation factor}`` for ``dfx``.

    The features are standardized first; the VIFs are computed on the
    standardized matrix.
    """
    standardized = standard_scaling(dfx)
    factors = {}
    for position, name in enumerate(dfx.columns):
        factors[name] = variance_inflation_factor(standardized, position)
    return factors

def vif_handling(dfx, threshold=5):
    """Iteratively drop the most collinear feature until all VIFs are acceptable.

    While the largest variance inflation factor exceeds ``threshold``, the
    column carrying it (the first one, on ties) is dropped and the VIFs are
    recomputed. At least one column is always kept.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Feature frame. It is modified IN PLACE: dropped columns disappear
        from the caller's object as well.
    threshold : float, default 5
        Largest VIF that is tolerated.

    Returns
    -------
    pandas.DataFrame
        The same frame object, after the drops.
    """
    # One VIF computation per iteration is enough; every call re-standardizes
    # the data and refits one regression per column.
    while dfx.shape[1] > 1:
        vif_dict = vif_checking(dfx)
        worst = max(vif_dict, key=vif_dict.get)
        if not vif_dict[worst] > threshold:
            break
        dfx.drop(columns=[worst], inplace=True)

    return dfx

In [16]:
def pca_dim_reduc(dfx, variance_threshold=0.87):
    """Project the standardized features onto their leading principal components.

    The smallest number of components whose cumulative explained-variance
    ratio reaches ``variance_threshold`` is kept.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Numeric feature frame; not modified.
    variance_threshold : float, default 0.87
        Minimum cumulative explained-variance ratio to retain.

    Returns
    -------
    pandas.DataFrame
        Columns 'pc1', 'pc2', ...; the rows carry the index of ``dfx`` so the
        result stays aligned with the target series.
    """
    dfx_scaled = standard_scaling(dfx)

    # A single full fit is enough: the first k columns of the full projection
    # are the scores of the k leading components.
    pca = PCA()
    scores = pca.fit_transform(dfx_scaled)

    cumulative = np.cumsum(pca.explained_variance_ratio_)
    n_keep = int(np.searchsorted(cumulative, variance_threshold, side='left')) + 1
    # Floating-point round-off can leave the final cumulative value just
    # under an extreme threshold; never ask for more components than exist.
    n_keep = min(n_keep, len(cumulative))

    cols = ['pc' + str(i + 1) for i in range(n_keep)]
    principal_Df = pd.DataFrame(data=scores[:, :n_keep], columns=cols, index=dfx.index)

    return principal_Df

##### Running the functions for EDA, multicollinearity handling, and dimensionality reduction

In [17]:
#classification problem
# Load the Titanic data; `data` keeps the raw frame and the pipeline runs on the copy `df`.
data=pd.read_csv('titanic.csv')
df = data.copy()

In [18]:
# Clean the frame with 'Survived' as the target column, then display the per-column null summary.
df=data_cleaning(df, target='Survived')
Null_checking(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
null_count,0.0,0.0,0.0,0.0,0.0,177.0,0.0,0.0,0.0,0.0,687.0,2.0
null_percentage,0.0,0.0,0.0,0.0,0.0,19.86532,0.0,0.0,0.0,0.0,77.104377,0.224467


In [19]:
# Drop columns with more than 70% nulls, then display cardinality and dtype per remaining column.
df=Excessive_Null(df,percent_limit=70)
uniqueness_checking(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
unique_count,891,2,3,891,2,89,7,7,681,248,4
uniqueness_percentage,100.0,0.224467,0.3367,100.0,0.224467,9.988777,0.785634,0.785634,76.430976,27.833895,0.448934
dtype,int64,int64,int64,object,object,float64,int64,int64,object,float64,object


In [20]:
# Drop non-float columns that are distinct in more than 99% of the rows.
df=severe_unique_int_obj_drop(df,percent_limit=99)
# drop_cols removes the column from `df` in place; the frame it returns is what this cell displays.
drop_cols(df, cols=['Ticket'])

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,0,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [22]:
# Fill numeric nulls with the column mean and text nulls with the column mode, then re-check the null summary.
df=Null_filling_num(df,fill_by='mean')
df=Null_filling_cat(df,fill_by='mode')
Null_checking(df)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
null_count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
null_percentage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df=skew_handling(df, trans= 'yeojohnson')

In [24]:
df=create_dummies(df)

In [25]:
#separate target column
# x: all feature columns; y: the 'Survived' labels.
x=df.drop(columns=['Survived'],axis=1)
y=df['Survived']

In [26]:
# x_scaled is the array returned by the chosen scaler; x itself is left unscaled.
x_scaled=scaling(x)
# vif_checking standardizes x internally before computing the VIFs it displays.
vif_checking(x)

{'Pclass': 2.085701412082225,
 'Age': 1.2410361603270403,
 'SibSp': 1.4186621180073071,
 'Parch': 1.4672596714730628,
 'Fare': 2.493769176225968,
 'Sex_male': 1.1154329031445414,
 'Embarked_S': 1.0895919916176477}

In [27]:
# vif_handling drops columns from x in place and returns that same frame, so x_updated and x are one object.
# Every VIF displayed above is below 5, so no column is removed in this run.
x_updated=vif_handling(x)
# The principal components are computed from x as left by vif_handling.
x_pca=pca_dim_reduc(x)

##### Model building and testing

In [28]:
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split

In [29]:
regression=LogisticRegression()

In [30]:
# Baseline: 75/25 split (fixed seed) of the unscaled feature frame x; the cell displays the training score.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=357)
regression.fit(x_train,y_train)
regression.score(x_train,y_train)

0.7844311377245509

In [31]:
regression.score(x_test,y_test)

0.8026905829596412

In [32]:
# Same split settings and estimator, refitted on the PCA-reduced features for comparison.
x_train,x_test,y_train,y_test=train_test_split(x_pca,y,test_size=0.25,random_state=357)
regression.fit(x_train,y_train)
regression.score(x_train,y_train)

0.7709580838323353

In [33]:
regression.score(x_test,y_test)

0.7802690582959642

In [34]:
#Regression problem
from sklearn import datasets
try:
    boston = datasets.load_boston()
    X = pd.DataFrame(boston.data,columns=boston.feature_names)
    Y = boston.target
except (ImportError, AttributeError):
    # load_boston was removed in scikit-learn 1.2. Rebuild the same table from
    # the original source: each record spans two physical lines of the file
    # (11 values, then 3), and the last value of the second line is the target.
    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    BOSTON_FEATURES = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                       'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
    raw_boston = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
    X = pd.DataFrame(
        np.hstack([raw_boston.values[::2, :], raw_boston.values[1::2, :2]]),
        columns=BOSTON_FEATURES,
    )
    Y = raw_boston.values[1::2, 2]
# Features and the target ('Price') side by side; the pipeline runs on the copy `df`.
data=pd.concat([X,pd.DataFrame(Y,columns=['Price'])],axis=1)
df = data.copy()

In [35]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [36]:
df=data_cleaning(df, target='Price')
Null_checking(df)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
null_count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
null_percentage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
df=Excessive_Null(df,percent_limit=70)
uniqueness_checking(df)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
unique_count,504,26,76,2,81,446,356,412,9,66,46,357,455,229
uniqueness_percentage,99.604743,5.13834,15.019763,0.395257,16.007905,88.142292,70.355731,81.422925,1.778656,13.043478,9.090909,70.55336,89.920949,45.256917
dtype,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64


In [38]:
df=severe_unique_int_obj_drop(df,percent_limit=99)
#drop_cols(df, cols=['Ticket'])

In [39]:
df=Null_filling_num(df,fill_by='mean')
df=Null_filling_cat(df,fill_by='mode')
Null_checking(df)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
null_count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
null_percentage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# NOTE(review): skew_handling visits every non-object column, so the target 'Price' is transformed too
# (compare Price in the df.head() outputs before and after); the scores below are on that transformed target.
df=skew_handling(df, trans= 'yeojohnson')

In [41]:
df=create_dummies(df)

In [42]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.006281,1.027774,2.31,-0.0,0.538,6.575,65.2,1.154678,0.610653,296.0,15.3,1757992000.0,4.98,4.286719
1,0.0266,-0.0,7.07,-0.0,0.469,6.421,78.9,1.228591,0.901042,242.0,17.8,1757992000.0,9.14,4.113627
2,0.026581,-0.0,7.07,-0.0,0.469,7.185,61.1,1.228591,0.901042,242.0,17.8,1691013000.0,4.03,4.922083
3,0.031378,-0.0,2.18,-0.0,0.458,6.998,45.8,1.301378,1.081963,222.0,18.7,1720399000.0,2.94,4.854119
4,0.064688,-0.0,2.18,-0.0,0.458,7.147,54.2,1.301378,1.081963,222.0,18.7,1757992000.0,5.33,4.997997


In [43]:
#separate target column
# x: all feature columns; y: the 'Price' column as it stands after the skew handling above.
x=df.drop(columns=['Price'],axis=1)
y=df['Price']

In [44]:
# x_scaled is computed here, before any VIF-based column removal, so it covers every feature column of x.
x_scaled=scaling(x)
vif_checking(x)

{'CRIM': 7.845507475750273,
 'ZN': 2.124852518779755,
 'INDUS': 3.90215080195347,
 'CHAS': 1.0700549418192398,
 'NOX': 5.31408815499211,
 'RM': 1.8561528166480379,
 'AGE': 3.139584821132584,
 'DIS': 4.9569440302339745,
 'RAD': 4.193493408701416,
 'TAX': 5.953404300463283,
 'PTRATIO': 1.849794041418046,
 'B': 1.4466237994587048,
 'LSTAT': 2.857805995710779}

In [45]:
# Some VIFs displayed above exceed 5 (CRIM is the largest), so vif_handling removes columns from x in place,
# highest VIF first, re-checking after each drop. x_updated is the same object as x.
x_updated=vif_handling(x)
# The principal components are computed from the reduced x.
x_pca=pca_dim_reduc(x)

In [46]:
regression=LinearRegression()

In [47]:
# Fit on x as left by vif_handling (unscaled, high-VIF columns removed); the cell displays the training score.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=357)
regression.fit(x_train,y_train)
regression.score(x_train,y_train)

0.7472169152846302

In [48]:
regression.score(x_test,y_test)

0.7729932706937284

In [49]:
# NOTE(review): x_scaled was built before vif_handling ran, so this model still sees all 13 original
# features; the comparison with the previous fit mixes scaling with a different feature set.
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.25,random_state=357)
regression.fit(x_train,y_train)
regression.score(x_train,y_train)

0.7561330704349186

In [50]:
regression.score(x_test,y_test)

0.7823111481929368

In [51]:
# Same split settings and estimator, refitted on the PCA-reduced features for comparison.
x_train,x_test,y_train,y_test=train_test_split(x_pca,y,test_size=0.25,random_state=357)
regression.fit(x_train,y_train)
regression.score(x_train,y_train)

0.6901812972755426

In [52]:
regression.score(x_test,y_test)

0.72261476509096