### MISSING VALUES
1. Detecting missing values


In [128]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

# Notebook-wide display settings: show every column and row, print floats
# with three decimals, and allow wide (500 character) output lines.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", None),
    ("display.float_format", lambda value: "%.3f" % value),
    ("display.width", 500),
):
    pd.set_option(option_name, option_value)

def load(path="datasets/titanic.csv"):
    """
        Read a CSV file into a new dataframe.

    Parameters
    ------
        path: str or path-like, optional
            Location of the CSV file (default -> "datasets/titanic.csv")

    Returns
    ------
        data: dataframe
            The parsed file; every call reads the file again, so the result
            is independent of frames returned by earlier calls.
    """
    data = pd.read_csv(path)
    return data


# Load the data once; the cells below inspect df for missing values.
df = load()

In [129]:
df.isna().to_numpy().any()  # True when at least one cell of df is missing

True

In [130]:
df.isnull().sum().sort_values(ascending=False)  # number of missing values per column, largest count first

Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
dtype: int64

In [131]:
df.isnull().sum().div(df.shape[0]).mul(100).sort_values(ascending=False)  # percentage of missing values per column

Cabin         77.104
Age           19.865
Embarked       0.224
PassengerId    0.000
Survived       0.000
Pclass         0.000
Name           0.000
Sex            0.000
SibSp          0.000
Parch          0.000
Ticket         0.000
Fare           0.000
dtype: float64

In [132]:
df.columns[df.isnull().any().to_numpy()].tolist()  # names of the columns that contain at least one NA

['Age', 'Cabin', 'Embarked']

In [133]:
def missing_values_tabel(dataframe, na_name=False):
    """
        Print a table with the count and percentage of missing values for
        every column that has at least one missing value.

    Parameters
    ------
        dataframe: dataframe
            Frame to inspect; it is not modified.
        na_name: bool, optional
            When True, the names of the columns with missing values are returned.

    Returns
    ------
        na_columns: list or None
            Column names with missing values (in column order) if na_name is
            True, otherwise None.
    """
    na_columns = []
    for col in dataframe.columns:
        if dataframe[col].isnull().sum() > 0:
            na_columns.append(col)

    # Both table columns derive from the same per-column NA counts.
    na_counts = dataframe[na_columns].isnull().sum()
    n_miss = na_counts.sort_values(ascending=False)
    ratio = (na_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=["n_miss", "ratio"])
    print(missing_df, end="\n")

    return na_columns if na_name else None
    
# Print the missing-value summary for df; with na_name left at False the call returns None.
missing_values_tabel(df)

          n_miss  ratio
Cabin        687 77.100
Age          177 19.870
Embarked       2  0.220


2. Handling missing values

In [134]:
## 1- Drop NA
## dropna() returns a new frame without the rows that contain at least one NA;
## df itself is unchanged because the result is not assigned.
df.dropna().shape

(183, 12)

In [135]:
## 2- Fill NA simply with mean or median
## fillna() returns a new Series, so df["Age"] is unchanged here;
## assign the result back to df["Age"] to make the fill persist.
df["Age"].fillna(df["Age"].mean()).isnull().sum()

0

In [136]:
## Applying x.fillna(x.mean()) to every column fails for non-numeric columns,
## so only numeric columns are mean-filled; all other columns pass through unchanged.
## is_numeric_dtype is used instead of dtype != "O" because category and dedicated
## string dtypes are also non-numeric and would make x.mean() raise.
## NOTE: this overwrites df, so later cells see the mean-filled numeric columns.
df = df.apply(lambda x: x.fillna(x.mean()) if pd.api.types.is_numeric_dtype(x) else x, axis=0)
missing_values_tabel(df)

          n_miss  ratio
Cabin        687 77.100
Embarked       2  0.220


In [137]:
## Fill text-like columns that have at most 10 distinct values (x.unique() also counts NaN)
## with their most frequent value. Object, string and category dtypes are all accepted,
## because dtype == "O" alone would silently skip category and dedicated string columns.
df.apply(
    lambda x: x.fillna(x.mode()[0])
    if (
        (
            pd.api.types.is_object_dtype(x)
            or pd.api.types.is_string_dtype(x)
            or isinstance(x.dtype, pd.CategoricalDtype)
        )
        and len(x.unique()) <= 10
    )
    else x,
    axis=0,
).isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [138]:
##fill age variable with respect to the mean of the person's sex
## NOTE(review): df was overwritten by the earlier df.apply(...) cell that mean-fills every
## numeric column, so df["Age"] has no NA left at this point and this fillna has nothing to fill.
## Run it on a freshly loaded df to see the group-wise fill.

df["Age"].fillna(df.groupby("Sex")["Age"].transform("mean")).isnull().sum()

0

In [139]:
## 3 - Fill NA based on prediction (the cells below encode, scale and then impute with KNNImputer)

In [140]:

def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
        Return categorical, numeric, and categorical-but-cardinal variable names from a dataframe.
        Note: Categorical includes numeric variables that have fewer than cat_th unique values
        (default -> fewer than 10).

        A column is text-like when its dtype is object, a pandas string dtype, or category;
        every other column is treated as numeric. Unique values are counted with nunique().

    Parameters
    ------
        dataframe: dataframe
            Frame whose columns are classified; it is not modified.
        cat_th: int, optional
            A numeric column with fewer than cat_th unique values is reported as categorical.
        car_th: int, optional
            A text-like column with more than car_th unique values is reported as cardinal.
    Returns
    ------
        cat_cols: list
            Text-like columns that are not cardinal, followed by the numeric-but-categorical columns.
        num_cols: list
            Numeric columns with at least cat_th unique values.
        cat_but_car: list
            Text-like columns with more than car_th unique values.

    Notes
    ------
        cat_cols + num_cols + cat_but_car = total var. number
        num_but_cat is included in cat_cols
        A summary of the counts is printed as a side effect.
    """

    def _is_text_like(series):
        # dtype == "O" alone misses category and dedicated string dtypes,
        # which would then be misreported as numeric.
        dtype = series.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    columns = list(dataframe.columns)
    text_like = {col: _is_text_like(dataframe[col]) for col in columns}
    # nunique is computed once per column and shared by all lists below.
    n_unique = {col: dataframe[col].nunique() for col in columns}

    # numeric but with fewer than cat_th unique values (counted as categorical)
    num_but_cat = [col for col in columns if not text_like[col] and n_unique[col] < cat_th]
    # text-like but with more than car_th unique values (not categorical)
    cat_but_car = [col for col in columns if text_like[col] and n_unique[col] > car_th]

    # cat_cols = text-like columns - cat_but_car + num_but_cat
    cat_cols = [col for col in columns if text_like[col] and col not in cat_but_car] + num_but_cat

    # num_cols = numeric columns - num_but_cat
    num_cols = [col for col in columns if not text_like[col] and col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car


In [141]:
# Reload the data so the fills applied to df in the cells above are discarded.
df = load()

cat_cols, num_cols, cat_but_car = grab_col_names(df)
# Exclude the PassengerId column from the numeric features. The comparison must be
# an equality test: `col not in "PassengerId"` is a substring check, which would also
# drop columns such as "Id" and raises TypeError for non-string column labels.
num_cols = [col for col in num_cols if col != "PassengerId"]


Observations: 891
Variables: 12
cat_cols: 6
num_cols: 3
cat_but_car: 3
num_but_cat: 4


In [142]:
## Encode and scale the features before imputation.
## get_dummies one-hot encodes the non-numeric columns and drops the first level of each.
## MinMaxScaler rescales every column to its default [0, 1] range (min-max normalization,
## not standardization); the frame is rebuilt to restore the column names.
## NOTE(review): get_dummies adds no NaN indicator by default, so a missing Embarked presumably
## becomes all-zero dummies and is no longer NaN for the imputer — confirm this is intended.
dff = pd.get_dummies(df[cat_cols + num_cols], drop_first=True)
scaler = MinMaxScaler()
dff = pd.DataFrame(scaler.fit_transform(dff), columns=dff.columns)
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,1.0,0.125,0.0,0.271,0.014,1.0,0.0,1.0
1,1.0,0.0,0.125,0.0,0.472,0.139,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.321,0.015,0.0,0.0,1.0
3,1.0,0.0,0.125,0.0,0.435,0.104,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.435,0.016,1.0,0.0,1.0


In [None]:
## USING KNN ##
## Impute the remaining NaN values of dff with KNNImputer using 5 neighbours (n_neighbors=5).
## The result is wrapped in a new frame to restore the column names. No inverse_transform is
## applied in this cell, so the values stay on the min-max scale produced by the previous cell.
imputer = KNNImputer(n_neighbors=5)
dff = pd.DataFrame(imputer.fit_transform(dff), columns=dff.columns)
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,1.0,0.125,0.0,0.271,0.014,1.0,0.0,1.0
1,1.0,0.0,0.125,0.0,0.472,0.139,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.321,0.015,0.0,0.0,1.0
3,1.0,0.0,0.125,0.0,0.435,0.104,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.435,0.016,1.0,0.0,1.0
