In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
!pip install missingno
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler




In [36]:
def load():
    df=pd.read_csv("datasets/titanic.csv")
    return df

df=load()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [37]:
df.columns=[col.upper() for col in df.columns]
df.head()

Unnamed: 0,PASSENGERID,SURVIVED,PCLASS,NAME,SEX,AGE,SIBSP,PARCH,TICKET,FARE,CABIN,EMBARKED
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Feature Engineering

In [38]:
df["NEW_CABIN_BOOL"]=df["CABIN"].notnull().astype("int")
df["NEW_NAME_COUNT"]=df["NAME"].str.len()
df["NEW_NAME_WORD_COUNT"]=df["NAME"].apply(lambda x: len(str(x).split(" ")))
df["NEW_NAME_DR"]=df["NAME"].apply(lambda x: len([x for x in x.split() if x.startswith("Dr")]))
df["NEW_TITLE"]=df.NAME.str.extract(" ([A-Za-z]+)/.",expand=False)
df["NEW_FAMILY_SIZE"]=df["SIBSP"]+df["PARCH"]+1
df["NEW_AGE_PCLASS"]=df["AGE"]*df["PCLASS"]
df.loc[((df["SIBSP"]+df["PARCH"])>0),"NEW_IS_ALONE"]="NO"
df.loc[((df["SIBSP"]+df["PARCH"])==0),"NEW_IS_ALONE"]="YES"
df.loc[(df["AGE"]<18),"NEW_AGE_CAT"]="young"
df.loc[(df["AGE"]>=18) & (df["AGE"]<56),"NEW_AGE_CAT"]="mature"
df.loc[(df["AGE"]>=56),"NEW_AGE_CAT"]="senior"
df.loc[(df["AGE"]<21) & (df["SEX"]=="male"),"NEW_SEX_CAT"]="youngmale"
df.loc[(df["AGE"]<21) & (df["SEX"]=="female"),"NEW_SEX_CAT"]="youngfemale"
df.loc[((df["AGE"]>=21) & (df["AGE"]<=50)) & (df["SEX"]=="male"),"NEW_SEX_CAT"]="maturemale"
df.loc[((df["AGE"]>=21) & (df["AGE"]<=50)) & (df["SEX"]=="female"),"NEW_SEX_CAT"]="maturefemale"
df.loc[(df["AGE"]>50) & (df["SEX"]=="male"),"NEW_SEX_CAT"]="seniormale"
df.loc[(df["AGE"]>50) & (df["SEX"]=="female"),"NEW_SEX_CAT"]="seniorfemale"

df.head()

Unnamed: 0,PASSENGERID,SURVIVED,PCLASS,NAME,SEX,AGE,SIBSP,PARCH,TICKET,FARE,...,NEW_CABIN_BOOL,NEW_NAME_COUNT,NEW_NAME_WORD_COUNT,NEW_NAME_DR,NEW_TITLE,NEW_FAMILY_SIZE,NEW_AGE_PCLASS,NEW_IS_ALONE,NEW_AGE_CAT,NEW_SEX_CAT
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,23,4,0,,2,66.0,NO,mature,maturemale
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,1,51,7,0,,2,38.0,NO,mature,maturefemale
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,0,22,3,0,,1,78.0,YES,mature,maturefemale
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,1,44,7,0,,2,35.0,NO,mature,maturefemale
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,0,24,4,0,,1,105.0,YES,mature,maturemale


In [39]:
def grab_col_names(dataframe,cat_th=10,car_th=20):
    cat_cols=[col for col in dataframe.columns if dataframe[col].dtypes=="O"]
    num_but_cat=[col for col in dataframe.columns if (dataframe[col].nunique()<cat_th)&(dataframe[col].dtypes!="O")]
    cat_but_car=[col for col in dataframe.columns if (dataframe[col].dtypes=="O")&(dataframe[col].nunique()>car_th)]
    cat_cols=cat_cols+num_but_cat
    cat_cols=[col for col in cat_cols if col not in cat_but_car]
    num_cols=[col for col in dataframe.columns if dataframe[col].dtypes!="O"]
    num_cols=[col for col in num_cols if col not in num_but_cat]
    return cat_cols,num_cols,cat_but_car

In [40]:
cat_cols,num_cols,cat_but_car=grab_col_names(df)
num_cols
num_cols=[col for col in num_cols if col not in "PASSENGERID"]
num_cols

['AGE', 'FARE', 'NEW_NAME_COUNT', 'NEW_AGE_PCLASS']

In [43]:
def outlier_thresholds(dataframe,col_name,q1=0.25,q3=0.75):
    quartile1=dataframe[col_name].quantile(q1)
    quartile3=dataframe[col_name].quantile(q3)
    iqr=quartile3-quartile1
    low_limit=quartile1+1.5*iqr
    up_limit=quartile3+1.5*iqr
    return low_limit,up_limit

def check_outlier(dataframe,col_name):
    low_limit,up_limit=outlier_thresholds(dataframe,col_name)
    if dataframe[(dataframe[col_name]<low_limit) | (dataframe[col_name]>up_limit)].any(axis=None):
        return True
    else:
        return False


In [44]:
for col in num_cols:
    print(col,check_outlier(df,col))

AGE True
FARE True
NEW_NAME_COUNT True
NEW_AGE_PCLASS True


In [45]:
def replace_with_thresholds(dataframe,variable):
    low_limit,up_limit=outlier_thresholds(dataframe,variable)
    dataframe.loc[(dataframe[variable]<low_limit),variable]=low_limit
    dataframe.loc[(dataframe[variable]>up_limit),variable]=up_limit


In [48]:
for col in num_cols:
    replace_with_thresholds(df,col)

In [49]:
for col in num_cols:
    print(col,check_outlier(df,col))

AGE False
FARE False
NEW_NAME_COUNT False
NEW_AGE_PCLASS False


In [50]:
def missing_values_table(dataframe,na_name=False):
    na_columns=[col for col in dataframe.columns if dataframe[col].isnull().sum()>0]
    n_miss= dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    ratio= (dataframe[na_columns].isnull().sum()/ dataframe.shape[0]*100).sort_values(ascending=False)
    missing_df= pd.concat([n_miss,np.round(ratio,2)],axis=1,keys=["n_miss","ratio"])
    print(missing_df,end="\n")

In [51]:
missing_values_table(df)

                n_miss  ratio
NEW_TITLE          890  99.89
CABIN              687  77.10
AGE                177  19.87
NEW_AGE_PCLASS     177  19.87
NEW_AGE_CAT        177  19.87
NEW_SEX_CAT        177  19.87
EMBARKED             2   0.22
