### **Tahmine Dayalı Atama İşlemi**

Bu atama işlemi makine öğrenmesine dayalı olarak gerçekleşir. Bu bölüm, yalnızca fikir sahibi olmanız için oluşturulmuştur.

KNN kullanacağız.

In [2]:
##################### GEÇMİŞ TANIMLAMALAR ##############################

# Kütüphaneler
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# pip install missingno
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler


# Pandas display settings: no cap on the number of columns or rows shown,
# floats rendered with three decimals, 500 characters per output line.
_display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.float_format': lambda x: '%.3f' % x,
    'display.width': 500,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)


# Veri setleri
def load_application_train(path='../01-outliers/application_train.csv'):
    """
    Read the application_train CSV file into a dataframe.

    Parameters
    -------
        path: str or file-like, optional
                Source handed directly to ``pd.read_csv``. Defaults to the
                location this function has always read from, so existing
                ``load_application_train()`` calls behave as before.

    Returns
    -------
        data: dataframe
                Parsed contents of the CSV source.
    """
    return pd.read_csv(path)

def load(path='../01-outliers/titanic.csv'):
    """
    Read the titanic CSV file into a dataframe.

    Parameters
    -------
        path: str or file-like, optional
                Source handed directly to ``pd.read_csv``. Defaults to the
                location this function has always read from, so existing
                ``load()`` calls behave as before.

    Returns
    -------
        data: dataframe
                Parsed contents of the CSV source.
    """
    return pd.read_csv(path)


# Fonksiyonlar
# Reports the columns that contain missing values:
def missing_values_table(dataframe, na_name=False):
    """
    Print the count and percentage of missing values per affected column.

    Parameters
    -------
        dataframe: dataframe
                Dataframe to inspect.
        na_name: bool, optional
                When truthy, the names of the columns with missing values are
                returned; otherwise the function returns None.

    Returns
    -------
        na_columns: list or None
                Columns with at least one missing value, in dataframe order
                (only when ``na_name`` is truthy).
    """
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().any()]

    # Count the nulls once and derive both table columns from the same series.
    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, ratio.round(2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df)

    return na_columns if na_name else None


def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Group the column names of a dataframe into categorical, numerical and
    categorical-but-cardinal columns, and print a short summary of the split.

    Note: numeric columns with few distinct values are reported among the
    categorical columns.

    Parameters
    -------
        dataframe: dataframe
                Dataframe whose column names are to be grouped.
        cat_th: int, optional
                A non-object column with fewer than ``cat_th`` distinct values
                (as counted by ``nunique``) is treated as categorical.
        car_th: int, optional
                An object column with more than ``car_th`` distinct values
                (as counted by ``nunique``) is treated as cardinal.

    Returns
    -------
        cat_cols: list
                Non-cardinal object columns followed by the numeric-looking
                categorical columns.
        num_cols: list
                Non-object columns with at least ``cat_th`` distinct values.
        cat_but_car: list
                Object columns with more than ``car_th`` distinct values.
    """
    object_cats, num_but_cat, num_cols, cat_but_car = [], [], [], []

    # Single pass: each column ends up in exactly one of the four buckets.
    for name in dataframe.columns:
        series = dataframe[name]
        n_distinct = series.nunique()
        if series.dtype == 'O':
            bucket = cat_but_car if n_distinct > car_th else object_cats
        else:
            bucket = num_but_cat if n_distinct < cat_th else num_cols
        bucket.append(name)

    cat_cols = object_cats + num_but_cat

    n_rows, n_vars = dataframe.shape
    report = (
        f"Observations: {n_rows}",
        f"Variables: {n_vars}",
        f"cat_cols: {len(cat_cols)}",
        f"num_cols: {len(num_cols)}",
        f"cat_but_car: {len(cat_but_car)}",
        "******************",
        f"num_but_cat: {len(num_but_cat)}",
    )
    for line in report:
        print(line)

    return cat_cols, num_cols, cat_but_car

In [3]:
df = load()

cat_cols, num_cols, cat_but_car = grab_col_names(df)
# Exclude the identifier column by exact name. A `not in "PassengerId"` test
# would be a substring check against the string, which also drops any column
# whose name is a fragment of it (e.g. "Id") and fails on non-string labels.
num_cols = [col for col in num_cols if col != "PassengerId"]

Observations: 891
Variables: 12
cat_cols: 6
num_cols: 3
cat_but_car: 3
******************
num_but_cat: 4


In [4]:
# Turn the classes of the categorical variables into numeric dummy columns
# (explained in detail in the encoding section):
selected_columns = cat_cols + num_cols
dff = pd.get_dummies(df[selected_columns], drop_first=True)

In [5]:
# Preview the first rows of the dummy-encoded frame.
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,1,0,22.0,7.25,1,0,1
1,1,1,1,0,38.0,71.283,0,0,0
2,1,3,0,0,26.0,7.925,0,0,1
3,1,1,1,0,35.0,53.1,0,0,1
4,0,3,0,0,35.0,8.05,1,0,1


In [6]:
# Preview the first rows of dff again before scaling.
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,1,0,22.0,7.25,1,0,1
1,1,1,1,0,38.0,71.283,0,0,0
2,1,3,0,0,26.0,7.925,0,0,1
3,1,1,1,0,35.0,53.1,0,0,1
4,0,3,0,0,35.0,8.05,1,0,1


In [7]:
# The variables must be brought onto a common scale before the
# prediction-based imputation can be applied:
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dff)
dff = pd.DataFrame(scaled_values, columns=dff.columns)
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,1.0,0.125,0.0,0.271,0.014,1.0,0.0,1.0
1,1.0,0.0,0.125,0.0,0.472,0.139,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.321,0.015,0.0,0.0,1.0
3,1.0,0.0,0.125,0.0,0.435,0.104,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.435,0.016,1.0,0.0,1.0


In [8]:
# Apply KNN imputation:

from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(dff)
# fit_transform returns a plain array, so the column labels are re-attached.
dff = pd.DataFrame(imputed_values, columns=dff.columns)
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,1.0,0.125,0.0,0.271,0.014,1.0,0.0,1.0
1,1.0,0.0,0.125,0.0,0.472,0.139,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.321,0.015,0.0,0.0,1.0
3,1.0,0.0,0.125,0.0,0.435,0.104,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.435,0.016,1.0,0.0,1.0


In [9]:
# Undo the scaling so the filled-in values can be inspected on their original scale.
restored_values = scaler.inverse_transform(dff)
dff = pd.DataFrame(restored_values, columns=dff.columns)
dff.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,3.0,1.0,0.0,22.0,7.25,1.0,0.0,1.0
1,1.0,1.0,1.0,0.0,38.0,71.283,0.0,0.0,0.0
2,1.0,3.0,0.0,0.0,26.0,7.925,0.0,0.0,1.0
3,1.0,1.0,1.0,0.0,35.0,53.1,0.0,0.0,1.0
4,0.0,3.0,0.0,0.0,35.0,8.05,1.0,0.0,1.0


In [10]:
# Store the imputed Age values in a new age_imputed_column on the original frame:
imputed_age = dff["Age"]
df["age_imputed_column"] = imputed_age

In [14]:

# Count missing vs. present entries in the Age column of df.
df["Age"].isnull().value_counts()

False    714
True     177
Name: Age, dtype: int64

In [12]:
# RECAP

# Reload the data set.
df = load()


def _fill_numeric_with_median(column):
    """Return non-object columns with NaNs replaced by the column median."""
    if column.dtype != "O":
        return column.fillna(column.median())
    return column


def _fill_categorical_with_mode(column):
    """Return object columns with fewer than 10 unique values filled with their mode."""
    if column.dtype == "O" and len(column.unique()) < 10:
        return column.fillna(column.mode()[0])
    return column


# NOTE: none of the expressions below is assigned, so df itself is not changed;
# each one only computes the remaining null counts after the fill.

# Fill numeric variables directly with the median
df.apply(_fill_numeric_with_median, axis=0).isnull().sum()

# Fill categorical variables with the mode
df.apply(_fill_categorical_with_mode, axis=0).isnull().sum()

# Fill a numeric variable using its mean within each class of a categorical variable
mean_age_by_sex = df.groupby("Sex")["Age"].transform("mean")
df["Age"].fillna(mean_age_by_sex).isnull().sum()

0