###########################################<br>
FEATURE ENGINEERING & DATA PRE-PROCESSING<br>
###########################################

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

!pip install missingno

In [None]:
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)

In [None]:
def load_application_train():
    """Read the application_train CSV from the datasets folder and return it as a DataFrame."""
    return pd.read_csv("datasets/Feature Engineering/application_train.csv")

In [None]:
df = load_application_train()
df.head()

In [None]:
def load():
    """Read the Titanic CSV from the datasets folder and return it as a DataFrame."""
    return pd.read_csv("datasets/Feature Engineering/titanic.csv")

In [None]:
df = load()
df.head()

###########################################<br>
1. Outliers (Aykırı Değerler)<br>
###########################################

###########################################<br>
Aykırı Değerleri Yakalama<br>
###########################################

#################<br>
Grafik Teknikle Aykırı Değerler<br>
#################

In [None]:
sns.boxplot(x=df["Age"])
plt.show()

#################<br>
Aykırı Değerler Nasıl Yakalanır?<br>
#################

In [None]:
q1 = df["Age"].quantile(0.25)
q3 = df["Age"].quantile(0.75)
iqr = q3 - q1
up = q3 + 1.5 * iqr
low = q1 - 1.5 * iqr

In [None]:
df[(df["Age"] < low) | (df["Age"] > up)] #istenilen Ã¶zellikleri saÄŸlayan satÄ±rlarÄ± getirdik

In [None]:
df[(df["Age"] < low) | (df["Age"] > up)].index #bu satÄ±rlarÄ±n indexlerini aldÄ±k

#################<br>
Aykırı Değer Var mı Yok mu?<br>
#################

In [None]:
df[(df["Age"] < low) | (df["Age"] > up)].any(axis=None) #any metodu ile burada deÄŸer var mÄ± diye soruyoruz satÄ±r ya da sÃ¼runa deÄŸil hepsine bakÄ±yoruz
df[(df["Age"] < low)].any(axis=None) # gÃ¶zlem var mÄ± yok mu diye sorguluyoruz

1. Eşik değer belirledik.<br>
2. Aykırılara eriştik.<br>
3. Hızlıca aykırı değer var mı yok mu diye sorduk.

#################<br>
İşlemleri Fonksiyonlaştırmak<br>
#################

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Compute the lower and upper outlier limits of a column with the IQR rule.

    The limits are ``Q_low - 1.5 * IQR`` and ``Q_high + 1.5 * IQR`` where
    ``Q_low`` / ``Q_high`` are the ``q1`` / ``q3`` quantiles of the column and
    ``IQR`` is their difference.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column.
    col_name : str
        Name of the column to analyse.
    q1, q3 : float, optional
        Lower and upper quantile levels (defaults 0.25 and 0.75).

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    column = dataframe[col_name]
    lower_quantile = column.quantile(q1)
    upper_quantile = column.quantile(q3)
    spread = upper_quantile - lower_quantile
    margin = 1.5 * spread
    return lower_quantile - margin, upper_quantile + margin

In [None]:
outlier_thresholds(df, "Age")
outlier_thresholds(df, "Fare")

In [None]:
low, up = outlier_thresholds(df, "Fare")

In [None]:
df[(df["Fare"] < low) | (df["Fare"] > up)].head()

In [None]:
df[(df["Fare"] < low) | (df["Fare"] > up)].index

In [None]:
def check_outlier(dataframe, col_name):
    """Return True when the column has at least one value outside its IQR limits.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column.
    col_name : str
        Name of the numeric column to check.

    Returns
    -------
    bool
        True if any value is below the lower limit or above the upper limit
        returned by ``outlier_thresholds``; False otherwise.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    # Evaluate the boolean mask itself. Calling .any(axis=None) on the selected
    # outlier *rows* asks whether any cell in those rows is truthy, which wrongly
    # returns False when the outlier rows only hold falsy values (0, NaN-free zeros, "").
    outlier_mask = (column > up_limit) | (column < low_limit)
    return bool(outlier_mask.any())

In [None]:
check_outlier(df, "Age")
check_outlier(df, "Fare")

#################<br>
grab_col_names<br>
#################

In [None]:
dff = load_application_train()
dff.head()

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Return the names of the categorical, numerical and categorical-but-cardinal
    variables of a dataframe.

    Note: numeric-looking variables with few distinct values are included in
    the categorical list.

    Parameters
    ------
        dataframe: dataframe
                Dataframe whose variable names are requested
        cat_th: int, optional
                Distinct-value threshold below which a non-object variable is
                treated as categorical
        car_th: int, optional
                Distinct-value threshold above which an object variable is
                treated as cardinal

    Returns
    ------
        cat_cols: list
                Categorical variable list
        num_cols: list
                Numerical variable list
        cat_but_car: list
                Categorical-looking cardinal variable list

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))

    Notes
    ------
        cat_cols + num_cols + cat_but_car = total number of variables
        num_but_cat is contained in cat_cols.
    """
    columns = list(dataframe.columns)
    # Compute the dtype flag and distinct count once per column instead of
    # re-evaluating them inside every comprehension.
    is_object = {col: dataframe[col].dtypes == "O" for col in columns}
    n_unique = {col: dataframe[col].nunique() for col in columns}

    # cat_cols, cat_but_car
    num_but_cat = [col for col in columns if n_unique[col] < cat_th and not is_object[col]]
    cat_but_car = [col for col in columns if n_unique[col] > car_th and is_object[col]]
    cat_cols = [col for col in columns if is_object[col]] + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in columns if not is_object[col] and col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
# Exclude the identifier column. Use equality: `col not in "PassengerId"` is a
# substring test and would also drop any column whose name is a fragment of it.
num_cols = [col for col in num_cols if col != "PassengerId"]

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(dff)

In [None]:
# Exclude the identifier column. Use equality: `col not in "SK_ID_CURR"` is a
# substring test and would also drop any column whose name is a fragment of it.
num_cols = [col for col in num_cols if col != "SK_ID_CURR"]

In [None]:
for col in num_cols:
    print(col, check_outlier(dff, col))

#################<br>
Aykırı Değerlerin Kendilerine Erişmek<br>
#################

In [None]:
def grab_outliers(dataframe, col_name, index=False):
    """Print the outlier rows of a column and optionally return their index.

    Outliers are the rows whose value lies outside the limits returned by
    ``outlier_thresholds``. When more than 10 such rows exist only the first
    five (``head()``) are printed; otherwise all of them are printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column.
    col_name : str
        Column to inspect.
    index : bool, optional
        When True the index of the outlier rows is returned.

    Returns
    -------
    pandas.Index or None
        Index of the outlier rows if ``index`` is True, otherwise None.
    """
    lower, upper = outlier_thresholds(dataframe, col_name)
    outside = (dataframe[col_name] < lower) | (dataframe[col_name] > upper)
    outlier_rows = dataframe[outside]
    preview = outlier_rows.head() if outlier_rows.shape[0] > 10 else outlier_rows
    print(preview)
    if index:
        return outlier_rows.index

In [None]:
grab_outliers(df, "Age")

In [None]:
grab_outliers(df, "Age", True)

In [None]:
age_index = grab_outliers(df, "Age", True)

In [None]:
outlier_thresholds(df, "Age")
check_outlier(df, "Age")
grab_outliers(df, "Age", True)

###########################################<br>
Aykırı Değer Problemini Çözme<br>
###########################################

#################<br>
Silme<br>
#################

In [None]:
low, up = outlier_thresholds(df, "Fare")
df.shape

In [None]:
df[~((df["Fare"] < low) | (df["Fare"] > up))].shape

In [None]:
def remove_outlier(dataframe, col_name):
    """Return the rows of ``dataframe`` whose ``col_name`` value is not an outlier.

    A row is kept unless its value is strictly below the lower limit or strictly
    above the upper limit returned by ``outlier_thresholds``. The input frame is
    not modified.
    """
    lower, upper = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    outside = (values < lower) | (values > upper)
    return dataframe[~outside]

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
# Exclude the identifier column. Use equality: `col not in "PassengerId"` is a
# substring test and would also drop any column whose name is a fragment of it.
num_cols = [col for col in num_cols if col != "PassengerId"]

In [None]:
df.shape

In [None]:
# Remove outliers cumulatively across all numeric columns. Feeding the previous
# result back in is required: calling remove_outlier(df, col) on the original
# frame each time would keep only the effect of the last column.
new_df = df
for col in num_cols:
    new_df = remove_outlier(new_df, col)

In [None]:
df.shape[0] - new_df.shape[0]

#################<br>
Baskılama Yöntemi (re-assignment with thresholds)<br>
#################

In [None]:
low, up = outlier_thresholds(df, "Fare")

In [None]:
df[((df["Fare"] < low) | (df["Fare"] > up))]["Fare"]

In [None]:
df.loc[((df["Fare"] < low) | (df["Fare"] > up)), "Fare"]

In [None]:
df.loc[(df["Fare"] > up), "Fare"] = up

In [None]:
df.loc[(df["Fare"] < low), "Fare"] = low

In [None]:
def replace_with_thresholds(dataframe, variable):
    """Cap the outliers of a column at its IQR-based limits, in place.

    Values below the lower limit are set to the lower limit and values above the
    upper limit are set to the upper limit, using the limits returned by
    ``outlier_thresholds``. Missing values are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify (mutated in place).
    variable : str
        Name of the numeric column to cap.

    Returns
    -------
    None
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    below = dataframe[variable] < low_limit
    above = dataframe[variable] > up_limit
    if not (below.any() or above.any()):
        return
    # The limits are floats. Writing them into an integer column relies on silent
    # upcasting, which recent pandas versions deprecate or reject, so widen the
    # column explicitly before assigning.
    if pd.api.types.is_integer_dtype(dataframe[variable]):
        dataframe[variable] = dataframe[variable].astype(float)
    dataframe.loc[below, variable] = low_limit
    dataframe.loc[above, variable] = up_limit

In [None]:
df = load()
cat_cols, num_cols, cat_but_car = grab_col_names(df)
# Exclude the identifier column with an equality test; `col not in "PassengerId"`
# is a substring check on the string, not a membership check on a list.
num_cols = [col for col in num_cols if col != "PassengerId"]

In [None]:
df.shape

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

In [None]:
for col in num_cols:
    replace_with_thresholds(df, col)

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

#################<br>
Recap<br>
#################

In [None]:
df = load()
outlier_thresholds(df, "Age")
check_outlier(df, "Age")
grab_outliers(df, "Age", index=True)

In [None]:
remove_outlier(df, "Age").shape
replace_with_thresholds(df, "Age")
check_outlier(df, "Age")

###########################################<br>
Çok Değişkenli Aykırı Değer Analizi: Local Outlier Factor<br>
###########################################

17, 3

In [None]:
df = sns.load_dataset('diamonds')
df = df.select_dtypes(include=['float64', 'int64'])
df = df.dropna()
df.head()
df.shape
for col in df.columns:
    print(col, check_outlier(df, col))

In [None]:
low, up = outlier_thresholds(df, "carat")

In [None]:
df[((df["carat"] < low) | (df["carat"] > up))].shape

In [None]:
low, up = outlier_thresholds(df, "depth")

In [None]:
df[((df["depth"] < low) | (df["depth"] > up))].shape

In [None]:
clf = LocalOutlierFactor(n_neighbors=20)
clf.fit_predict(df) # bu faktÃ¶rÃ¼n belirlediÄŸi skorlarÄ± getirdi

In [None]:
df_scores = clf.negative_outlier_factor_ #skorlarÄ± kaydetmiÅŸ olduk
df_scores[0:5]
# df_scores = -df_scores #skorlarÄ± pozitife Ã§evirebiliriz.
#EÅŸik deÄŸere karar vermek isteyince daha rahat olacaÄŸÄ±ndan skorlarÄ± negatif bÄ±raktÄ±k
np.sort(df_scores)[0:5]

In [None]:
scores = pd.DataFrame(np.sort(df_scores))
scores.plot(stacked=True, xlim=[0, 50], style='.-')
plt.show()
#Elbow yÃ¶ntemi: eÅŸik deÄŸerin kararÄ± iÃ§in
# Grafikten bakÄ±yoruz. En dik eÄŸim deÄŸiÅŸikliÄŸini belirledik
# En bÃ¼yÃ¼k deÄŸiÅŸimin olduÄŸu yeri eÅŸik deÄŸer olarak seÃ§tik
# bu 3. deÄŸer olduÄŸundan aÅŸaÄŸÄ±da onun deÄŸerini seÃ§tik
th = np.sort(df_scores)[3]

In [None]:
df[df_scores < th]

In [None]:
df[df_scores < th].shape

In [None]:
df.describe([0.01, 0.05, 0.75, 0.90, 0.99]).T

In [None]:
df[df_scores < th].index

In [None]:
# Drop the multivariate outliers from the full frame. Dropping them from
# df[df_scores < th] (the outlier subset itself) would always yield an empty frame.
df.drop(axis=0, labels=df[df_scores < th].index)

###########################################<br>
Missing Values (Eksik Değerler)<br>
###########################################

###########################################<br>
Eksik Değerlerin Yakalanması<br>
###########################################

In [None]:
df = load()
df.head()

eksik gözlem var mı yok mu sorgusu

In [None]:
df.isnull().values.any()

degiskenlerdeki eksik deger sayisi

In [None]:
df.isnull().sum()

degiskenlerdeki tam deger sayisi

In [None]:
df.notnull().sum()

veri setindeki toplam eksik deger sayisi

In [None]:
df.isnull().sum().sum()

en az bir tane eksik değere sahip olan gözlem birimleri

In [None]:
df[df.isnull().any(axis=1)]

tam olan gözlem birimleri

In [None]:
df[df.notnull().all(axis=1)]

Azalan şekilde sıralamak

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
(df.isnull().sum() / df.shape[0] * 100).sort_values(ascending=False)

In [None]:
na_cols = [col for col in df.columns if df[col].isnull().sum() > 0]

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print a table of missing-value counts and percentages per column.

    Only columns that contain at least one missing value are listed, sorted in
    descending order. The ``ratio`` column is the percentage of missing rows,
    rounded to two decimals.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    na_name : bool, optional
        When True the list of columns with missing values is returned.

    Returns
    -------
    list or None
        Names of the columns with missing values if ``na_name`` is True,
        otherwise None.
    """
    na_columns = [name for name in dataframe.columns if dataframe[name].isnull().sum() > 0]
    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)
    summary = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(summary, end="\n")
    if na_name:
        return na_columns

In [None]:
missing_values_table(df)

In [None]:
missing_values_table(df, True)

###########################################<br>
Eksik Değer Problemini Çözme<br>
###########################################

In [None]:
missing_values_table(df)

#################<br>
Çözüm 1: Hızlıca silmek<br>
#################

In [None]:
df.dropna().shape
#Herhangi bir satÄ±rda en az bir eksik deÄŸer varsa o satÄ±rÄ± siler.

#################<br>
Çözüm 2: Basit Atama Yöntemleri ile Doldurmak<br>
#################

In [None]:
df["Age"].fillna(df["Age"].mean()).isnull().sum()
df["Age"].fillna(df["Age"].median()).isnull().sum()
df["Age"].fillna(0).isnull().sum()

# Fill every numeric column with its own mean. Taking x.mean() of each column via
# apply() raises on text columns, so the means are computed for numeric columns only.
df.fillna(df.mean(numeric_only=True)).head()

In [None]:
df.apply(lambda x: x.fillna(x.mean()) if x.dtype != "O" else x, axis=0).head()

In [None]:
dff = df.apply(lambda x: x.fillna(x.mean()) if x.dtype != "O" else x, axis=0)

In [None]:
dff.isnull().sum().sort_values(ascending=False)

In [None]:
df["Embarked"].fillna(df["Embarked"].mode()[0]).isnull().sum()

In [None]:
df["Embarked"].fillna("missing")

In [None]:
df.apply(lambda x: x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= 10) else x, axis=0).isnull().sum()

#################<br>
Kategorik Değişken Kırılımında Değer Atama<br>
#################

In [None]:
df.groupby("Sex")["Age"].mean()

In [None]:
df["Age"].mean()

In [None]:
df["Age"].fillna(df.groupby("Sex")["Age"].transform("mean")).isnull().sum()
#transform mean e gÃ¶re deÄŸiÅŸiklik yapÄ±yor

In [None]:
df.groupby("Sex")["Age"].mean()["female"]

In [None]:
df.loc[(df["Age"].isnull()) & (df["Sex"]=="female"), "Age"] = df.groupby("Sex")["Age"].mean()["female"]

In [None]:
df.loc[(df["Age"].isnull()) & (df["Sex"]=="male"), "Age"] = df.groupby("Sex")["Age"].mean()["male"]

In [None]:
df.isnull().sum()

###########################################<br>
Çözüm 3: Tahmine Dayalı Atama ile Doldurma<br>
###########################################

In [None]:
df = load()

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)
# Exclude the identifier column with an equality test; `col not in "PassengerId"`
# is a substring check on the string, not a membership check on a list.
num_cols = [col for col in num_cols if col != "PassengerId"]
# Bring the categorical variables into the form the model expects:
# encode them with get_dummies.
dff = pd.get_dummies(df[cat_cols + num_cols], drop_first=True)
# With drop_first=True every categorical variable is kept as binary columns.
dff.head()

değişkenlerin standartlaştırılması

In [None]:
scaler = MinMaxScaler()
dff = pd.DataFrame(scaler.fit_transform(dff), columns=dff.columns)
dff.head()

knn'in uygulanması.

In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
dff = pd.DataFrame(imputer.fit_transform(dff), columns=dff.columns)
dff.head()
#uzaklÄ±k temellidir
#en yakÄ±n 5 komÅŸuluÄŸun dolu olan gÃ¶zlemlerinin ortalamasÄ±nÄ± eksik deÄŸerlere koyuyor

In [None]:
dff = pd.DataFrame(scaler.inverse_transform(dff), columns=dff.columns)
#standartlaÅŸmayÄ± geri aldÄ±k

In [None]:
df["age_imputed_knn"] = dff[["Age"]]

In [None]:
df.loc[df["Age"].isnull(), ["Age", "age_imputed_knn"]]
df.loc[df["Age"].isnull()]

#################<br>
Recap<br>
#################

In [None]:
df = load()
# missing table
missing_values_table(df)
# sayÄ±sal deÄŸiÅŸkenleri direk median ile oldurma
df.apply(lambda x: x.fillna(x.median()) if x.dtype != "O" else x, axis=0).isnull().sum()
# kategorik deÄŸiÅŸkenleri mode ile doldurma
df.apply(lambda x: x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= 10) else x, axis=0).isnull().sum()
# kategorik deÄŸiÅŸken kÄ±rÄ±lÄ±mÄ±nda sayÄ±sal deÄŸiÅŸkenleri doldurmak
df["Age"].fillna(df.groupby("Sex")["Age"].transform("mean")).isnull().sum()
# Tahmine DayalÄ± Atama ile Doldurma

###########################################<br>
Gelişmiş Analizler<br>
###########################################

#################<br>
Eksik Veri Yapısının İncelenmesi<br>
#################

In [None]:
msno.bar(df)
plt.show()

In [None]:
msno.matrix(df)
plt.show()

In [None]:
msno.heatmap(df)
plt.show()

#################<br>
Eksik Değerlerin Bağımlı Değişken ile İlişkisinin İncelenmesi<br>
#################

In [None]:
missing_values_table(df, True)
na_cols = missing_values_table(df, True)

In [None]:
def missing_vs_target(dataframe, target, na_columns):
    """Print the target mean and row count for missing vs. present values.

    For each column in ``na_columns`` a 0/1 flag column named
    ``<column>_NA_FLAG`` is created on a copy of the frame (1 = value missing),
    and the mean and count of ``target`` are printed per flag value. The input
    frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to analyse.
    target : str
        Name of the target column.
    na_columns : iterable of str
        Columns whose missingness is compared against the target.

    Returns
    -------
    None
    """
    temp_df = dataframe.copy()
    flag_columns = []
    for col in na_columns:
        flag_name = col + '_NA_FLAG'
        temp_df[flag_name] = np.where(temp_df[col].isnull(), 1, 0)
        flag_columns.append(flag_name)
    # Iterate over the flags created above rather than re-discovering them with
    # str.contains("_NA_"), which would also match unrelated existing columns.
    for flag_name in flag_columns:
        grouped_target = temp_df.groupby(flag_name)[target]
        print(pd.DataFrame({"TARGET_MEAN": grouped_target.mean(),
                            "Count": grouped_target.count()}), end="\n\n\n")

In [None]:
missing_vs_target(df, "Survived", na_cols)

#################<br>
Recap<br>
#################

In [None]:
df = load()
na_cols = missing_values_table(df, True)
# sayÄ±sal deÄŸiÅŸkenleri direk median ile oldurma
df.apply(lambda x: x.fillna(x.median()) if x.dtype != "O" else x, axis=0).isnull().sum()
# kategorik deÄŸiÅŸkenleri mode ile doldurma
df.apply(lambda x: x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= 10) else x, axis=0).isnull().sum()
# kategorik deÄŸiÅŸken kÄ±rÄ±lÄ±mÄ±nda sayÄ±sal deÄŸiÅŸkenleri doldurmak
df["Age"].fillna(df.groupby("Sex")["Age"].transform("mean")).isnull().sum()
# Tahmine DayalÄ± Atama ile Doldurma
missing_vs_target(df, "Survived", na_cols)

###########################################<br>
3. Encoding (Label Encoding, One-Hot Encoding, Rare Encoding)<br>
###########################################

###########################################<br>
Label Encoding & Binary Encoding<br>
###########################################

In [None]:
df = load()
df.head()
df["Sex"].head()

In [None]:
le = LabelEncoder() #labelencoder nesnesini getirdik
le.fit_transform(df["Sex"])[0:5] # label encoder nesnesini belirtilen deÄŸiÅŸkene uygular(fit etmek) ve sonrasÄ±nda deÄŸerleri dÃ¶nÃ¼ÅŸtÃ¼rÃ¼r (transform etmek)
#alfabetik olarak numaralandÄ±rÄ±yor
le.inverse_transform([0, 1])
#bilgiler le nesnesinde tutuluyor. tersi ile hangi deÄŸerin hangi deÄŸiÅŸken olduÄŸunu Ã¶ÄŸreniriz.

In [None]:
def label_encoder(dataframe, binary_col):
    """Label-encode one column of ``dataframe`` in place and return the frame.

    A fresh ``LabelEncoder`` is fitted on ``dataframe[binary_col]`` and the
    column is overwritten with the resulting integer codes.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

In [None]:
df = load()

In [None]:
# Select non-numeric columns with exactly two classes. The dtype kind
# ("i"/"u"/"f" = signed/unsigned integer, float) is checked instead of comparing
# against the builtin int/float types, whose width is platform dependent.
binary_cols = [col for col in df.columns if df[col].dtype.kind not in "iuf"
               and df[col].nunique() == 2]

In [None]:
for col in binary_cols:
    label_encoder(df, col)

In [None]:
df.head()

In [None]:
df = load_application_train()
df.shape
df.dtypes

In [None]:
binary_cols = [col for col in df.columns if df[col].dtype not in ["int64", "float64"] and df[col].nunique() == 2]

In [None]:
df[binary_cols].head()

In [None]:
for col in binary_cols:
    label_encoder(df, col)
#NA lere de deÄŸer atandÄ±

In [None]:
df = load()
df["Embarked"].value_counts()
df["Embarked"].nunique()
len(df["Embarked"].unique())

###########################################<br>
One-Hot Encoding<br>
###########################################

In [None]:
df = load()
df.head()
df["Embarked"].value_counts()
#sÄ±nÄ±flar arasÄ±nda bir farklÄ±lÄ±k yok- nominal
# lqabel encoding problem yaratÄ±lÄ±r
# get dummies- one hot encoding iÃ§in

In [None]:
pd.get_dummies(df, columns=["Embarked"]).head()
# DeÄŸiÅŸkenler birbiri Ã¼zerinden Ã¼retilmesin diye
pd.get_dummies(df, columns=["Embarked"], drop_first=True).head()
# ilgili deÄŸiÅŸkenlerdeki eksik deÄŸerleri de sÄ±nÄ±f kabul ettik
pd.get_dummies(df, columns=["Embarked"], dummy_na=True).head()
# label encodera gerek kalmadan bianry encoder yapar drop first true seÃ§ilirse
pd.get_dummies(df, columns=["Sex", "Embarked"], drop_first=True).head()

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    Thin wrapper around ``pd.get_dummies``; ``drop_first`` is forwarded as is.
    The input frame is left unchanged.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

In [None]:
df = load()

cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
ohe_cols = [col for col in df.columns if 10 >= df[col].nunique() > 2]

In [None]:
one_hot_encoder(df, ohe_cols).head()

In [None]:
df.head()

###########################################<br>
Rare Encoding<br>
###########################################

1. Kategorik değişkenlerin azlık çokluk durumunun analiz edilmesi.<br>
2. Rare kategoriler ile bağımlı değişken arasındaki ilişkinin analiz edilmesi.<br>
3. Rare encoder yazacağız.

#################<br>
1. Kategorik değişkenlerin azlık çokluk durumunun analiz edilmesi.<br>
#################

In [None]:
df = load_application_train()
df["NAME_EDUCATION_TYPE"].value_counts()

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the class counts and percentage ratios of a categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column.
    col_name : str
        Column to summarise.
    plot : bool, optional
        When True a seaborn count plot of the column is also shown.
    """
    counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: counts,
                            "Ratio": 100 * counts / len(dataframe)})
    print(summary)
    print("##########################################")
    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show()

In [None]:
for col in cat_cols:
    cat_summary(df, col)

#################<br>
2. Rare kategoriler ile bağımlı değişken arasındaki ilişkinin analiz edilmesi.<br>
#################

In [None]:
df["NAME_INCOME_TYPE"].value_counts()

In [None]:
df.groupby("NAME_INCOME_TYPE")["TARGET"].mean()

In [None]:
def rare_analyser(dataframe, target, cat_cols):
    """Print, for each categorical column, class counts, ratios and target means.

    For every column in ``cat_cols`` the number of classes is printed, followed
    by a table with the count of each class, its share of all rows and the mean
    of ``target`` within the class.
    """
    n_rows = len(dataframe)
    for col in cat_cols:
        class_counts = dataframe[col].value_counts()
        print(col, ":", len(class_counts))
        report = pd.DataFrame({"COUNT": class_counts,
                               "RATIO": class_counts / n_rows,
                               "TARGET_MEAN": dataframe.groupby(col)[target].mean()})
        print(report, end="\n\n\n")

In [None]:
rare_analyser(df, "TARGET", cat_cols)

###########################################<br>
3. Rare encoder'ın yazılması.<br>
###########################################

In [None]:
def rare_encoder(dataframe, rare_perc):
    """Return a copy of ``dataframe`` with infrequent classes relabelled as 'Rare'.

    For every object-dtype column, each class whose share of all rows is below
    ``rare_perc`` is replaced with the string 'Rare'. Columns without such a
    class are left as they are. The input frame is not modified.
    """
    temp_df = dataframe.copy()
    n_rows = len(temp_df)
    for var in temp_df.columns:
        if temp_df[var].dtypes != 'O':
            continue
        class_ratio = temp_df[var].value_counts() / n_rows
        rare_labels = class_ratio[class_ratio < rare_perc].index
        if len(rare_labels) == 0:
            continue
        temp_df[var] = np.where(temp_df[var].isin(rare_labels), 'Rare', temp_df[var])
    return temp_df

In [None]:
new_df = rare_encoder(df, 0.01)

In [None]:
rare_analyser(new_df, "TARGET", cat_cols)

In [None]:
df["OCCUPATION_TYPE"].value_counts()

###########################################<br>
Feature Scaling (Özellik Ölçeklendirme)<br>
###########################################

#################<br>
StandardScaler:<br>
Klasik standartlaştırma. Normalleştirme. z standartlaştırma<br>
Ortalamayı çıkar, standart sapmaya böl. z = (x - u) / s<br>
#################

In [None]:
df = load()
ss = StandardScaler()
df["Age_standard_scaler"] = ss.fit_transform(df[["Age"]])
df.head()

#################<br>
RobustScaler:<br>
Medyanı çıkar iqr'a böl.<br>
aykırı değerlerden etkilenmezler<br>
yaygın değildir.<br>
#################

In [None]:
rs = RobustScaler()
df["Age_robuts_scaler"] = rs.fit_transform(df[["Age"]])
df.describe().T

#################<br>
MinMaxScaler:<br>
Verilen 2 değer arasında değişken dönüşümü<br>
#################

X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))<br>
X_scaled = X_std * (max - min) + min

In [None]:
mms = MinMaxScaler()
df["Age_min_max_scaler"] = mms.fit_transform(df[["Age"]])
df.describe().T

In [None]:
df.head()

In [None]:
age_cols = [col for col in df.columns if "Age" in col]

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of a numeric column at a fixed set of quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column.
    numerical_col : str
        Column to describe.
    plot : bool, optional
        When True a 20-bin histogram of the column is also shown.
    """
    quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    column = dataframe[numerical_col]
    print(column.describe(quantiles).T)
    if not plot:
        return
    column.hist(bins=20)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show(block=True)

In [None]:
for col in age_cols:
    num_summary(df, col, plot=True)

#################<br>
Numeric to Categorical: Sayısal Değişkenleri Kategorik Değişkenlere Çevirme<br>
Binning<br>
#################<br>
Küçükten büyüğe sıralar ve çeyrek değerlere göre 5 parçaya böler.

In [None]:
df["Age_qcut"] = pd.qcut(df['Age'], 5)

###########################################<br>
Feature Extraction (Özellik Çıkarımı)<br>
###########################################

###########################################<br>
Binary Features: Flag, Bool, True-False<br>
###########################################

In [None]:
df = load()
df.head()

In [None]:
df["NEW_CABIN_BOOL"] = df["Cabin"].notnull().astype('int')

In [None]:
df.groupby("NEW_CABIN_BOOL").agg({"Survived": "mean"})

In [None]:
from statsmodels.stats.proportion import proportions_ztest

In [None]:
test_stat, pvalue = proportions_ztest(count=[df.loc[df["NEW_CABIN_BOOL"] == 1, "Survived"].sum(),
                                             df.loc[df["NEW_CABIN_BOOL"] == 0, "Survived"].sum()],
                                      nobs=[df.loc[df["NEW_CABIN_BOOL"] == 1, "Survived"].shape[0],
                                            df.loc[df["NEW_CABIN_BOOL"] == 0, "Survived"].shape[0]])

In [None]:
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

H0: p1 ve p2 arasında fark yoktur.<br>
p < 0.05 olduğu için H0 hipotezi reddedilir.<br>
Yani iki oran birbirinden farklıdır.

In [None]:
df.loc[((df['SibSp'] + df['Parch']) > 0), "NEW_IS_ALONE"] = "NO"
df.loc[((df['SibSp'] + df['Parch']) == 0), "NEW_IS_ALONE"] = "YES"

In [None]:
df.groupby("NEW_IS_ALONE").agg({"Survived": "mean"})

In [None]:
test_stat, pvalue = proportions_ztest(count=[df.loc[df["NEW_IS_ALONE"] == "YES", "Survived"].sum(),
                                             df.loc[df["NEW_IS_ALONE"] == "NO", "Survived"].sum()],
                                      nobs=[df.loc[df["NEW_IS_ALONE"] == "YES", "Survived"].shape[0],
                                            df.loc[df["NEW_IS_ALONE"] == "NO", "Survived"].shape[0]])

In [None]:
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

###########################################<br>
Text'ler Üzerinden Özellik Türetmek<br>
###########################################

In [None]:
df.head()

#################<br>
Letter Count<br>
#################

In [None]:
df["NEW_NAME_COUNT"] = df["Name"].str.len()

#################<br>
Word Count<br>
#################

In [None]:
df["NEW_NAME_WORD_COUNT"] = df["Name"].apply(lambda x: len(str(x).split(" ")))

#################<br>
Özel Yapıları Yakalamak<br>
#################

In [None]:
df["NEW_NAME_DR"] = df["Name"].apply(lambda x: len([x for x in x.split() if x.startswith("Dr")]))

In [None]:
df.groupby("NEW_NAME_DR").agg({"Survived": ["mean","count"]})

#################<br>
Regex ile Değişken Türetmek<br>
#################

In [None]:
df.head()

In [None]:
# Extract the title: the run of letters that follows a space and ends with a dot
# (e.g. " Mr."). A raw string is used so that `\.` reaches the regex engine
# without triggering Python's invalid-escape-sequence warning.
df['NEW_TITLE'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
df[["NEW_TITLE", "Survived", "Age"]].groupby(["NEW_TITLE"]).agg({"Survived": "mean", "Age": ["count", "mean"]})

###########################################<br>
Date Değişkenleri Üretmek<br>
###########################################

In [None]:
dff = pd.read_csv("datasets/Feature Engineering/course_reviews.csv")
dff.head()
dff.info()

In [None]:
dff['Timestamp'] = pd.to_datetime(dff["Timestamp"], format="%Y-%m-%d")

year

In [None]:
dff['year'] = dff['Timestamp'].dt.year

month

In [None]:
dff['month'] = dff['Timestamp'].dt.month

year diff

In [None]:
dff['year_diff'] = date.today().year - dff['Timestamp'].dt.year

month diff (iki tarih arasındaki ay farkı): yıl farkı + ay farkı

In [None]:
dff['month_diff'] = (date.today().year - dff['Timestamp'].dt.year) * 12 + date.today().month - dff['Timestamp'].dt.month

day name

In [None]:
dff['day_name'] = dff['Timestamp'].dt.day_name()

In [None]:
dff.head()

date

###########################################<br>
Feature Interactions (Özellik Etkileşimleri)<br>
###########################################

In [None]:
df = load()
df.head()

In [None]:
df["NEW_AGE_PCLASS"] = df["Age"] * df["Pclass"]

In [None]:
df["NEW_FAMILY_SIZE"] = df["SibSp"] + df["Parch"] + 1

In [None]:
df.loc[(df['Sex'] == 'male') & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngmale'

In [None]:
df.loc[(df['Sex'] == 'male') & (df['Age'] > 21) & (df['Age'] < 50), 'NEW_SEX_CAT'] = 'maturemale'

In [None]:
df.loc[(df['Sex'] == 'male') & (df['Age'] >= 50), 'NEW_SEX_CAT'] = 'seniormale'

In [None]:
df.loc[(df['Sex'] == 'female') & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngfemale'

In [None]:
df.loc[(df['Sex'] == 'female') & (df['Age'] > 21) & (df['Age'] < 50), 'NEW_SEX_CAT'] = 'maturefemale'

In [None]:
df.loc[(df['Sex'] == 'female') & (df['Age'] >= 50), 'NEW_SEX_CAT'] = 'seniorfemale'

In [None]:
df.head()

In [None]:
df.groupby("NEW_SEX_CAT")["Survived"].mean()

###########################################<br>
Titanic Uçtan Uca Feature Engineering & Data Preprocessing<br>
###########################################

In [None]:
df = load()
df.shape
df.head()

In [None]:
df.columns = [col.upper() for col in df.columns]

###########################################<br>
1. Feature Engineering (Değişken Mühendisliği)<br>
###########################################

Cabin bool

In [None]:
# cabin bool: 1 when a cabin value is present, 0 when it is missing
df["NEW_CABIN_BOOL"] = df["CABIN"].notnull().astype('int')
# name count: number of characters in the name
df["NEW_NAME_COUNT"] = df["NAME"].str.len()
# name word count: number of space-separated tokens in the name
df["NEW_NAME_WORD_COUNT"] = df["NAME"].apply(lambda x: len(str(x).split(" ")))
# name dr: number of tokens in the name that start with "Dr"
df["NEW_NAME_DR"] = df["NAME"].apply(lambda x: len([x for x in x.split() if x.startswith("Dr")]))
# name title: letters between a space and a dot; raw string avoids the
# invalid-escape-sequence warning for `\.`
df['NEW_TITLE'] = df.NAME.str.extract(r' ([A-Za-z]+)\.', expand=False)
# family size
df["NEW_FAMILY_SIZE"] = df["SIBSP"] + df["PARCH"] + 1
# age_pclass
df["NEW_AGE_PCLASS"] = df["AGE"] * df["PCLASS"]
# is alone
df.loc[((df['SIBSP'] + df['PARCH']) > 0), "NEW_IS_ALONE"] = "NO"
df.loc[((df['SIBSP'] + df['PARCH']) == 0), "NEW_IS_ALONE"] = "YES"
# age level
df.loc[(df['AGE'] < 18), 'NEW_AGE_CAT'] = 'young'
df.loc[(df['AGE'] >= 18) & (df['AGE'] < 56), 'NEW_AGE_CAT'] = 'mature'
df.loc[(df['AGE'] >= 56), 'NEW_AGE_CAT'] = 'senior'
# sex x age
df.loc[(df['SEX'] == 'male') & (df['AGE'] <= 21), 'NEW_SEX_CAT'] = 'youngmale'
df.loc[(df['SEX'] == 'male') & (df['AGE'] > 21) & (df['AGE'] < 50), 'NEW_SEX_CAT'] = 'maturemale'
df.loc[(df['SEX'] == 'male') & (df['AGE'] >= 50), 'NEW_SEX_CAT'] = 'seniormale'
df.loc[(df['SEX'] == 'female') & (df['AGE'] <= 21), 'NEW_SEX_CAT'] = 'youngfemale'
df.loc[(df['SEX'] == 'female') & (df['AGE'] > 21) & (df['AGE'] < 50), 'NEW_SEX_CAT'] = 'maturefemale'
df.loc[(df['SEX'] == 'female') & (df['AGE'] >= 50), 'NEW_SEX_CAT'] = 'seniorfemale'

In [None]:
df.head()
df.shape

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
num_cols = [col for col in num_cols if "PASSENGERID" not in col]

###########################################<br>
2. Outliers (Aykırı Değerler)<br>
###########################################

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

In [None]:
for col in num_cols:
    replace_with_thresholds(df, col)

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

###########################################<br>
3. Missing Values (Eksik Değerler)<br>
###########################################

In [None]:
missing_values_table(df)

In [None]:
df.drop("CABIN", inplace=True, axis=1)

In [None]:
remove_cols = ["TICKET", "NAME"]
df.drop(remove_cols, inplace=True, axis=1)

In [None]:
df["AGE"] = df["AGE"].fillna(df.groupby("NEW_TITLE")["AGE"].transform("median"))

In [None]:
df["NEW_AGE_PCLASS"] = df["AGE"] * df["PCLASS"]

age level

In [None]:
df.loc[(df['AGE'] < 18), 'NEW_AGE_CAT'] = 'young'
df.loc[(df['AGE'] >= 18) & (df['AGE'] < 56), 'NEW_AGE_CAT'] = 'mature'
df.loc[(df['AGE'] >= 56), 'NEW_AGE_CAT'] = 'senior'
# sex x age
df.loc[(df['SEX'] == 'male') & (df['AGE'] <= 21), 'NEW_SEX_CAT'] = 'youngmale'
df.loc[(df['SEX'] == 'male') & (df['AGE'] > 21) & (df['AGE'] < 50), 'NEW_SEX_CAT'] = 'maturemale'
df.loc[(df['SEX'] == 'male') & (df['AGE'] >= 50), 'NEW_SEX_CAT'] = 'seniormale'
df.loc[(df['SEX'] == 'female') & (df['AGE'] <= 21), 'NEW_SEX_CAT'] = 'youngfemale'
df.loc[(df['SEX'] == 'female') & (df['AGE'] > 21) & (df['AGE'] < 50), 'NEW_SEX_CAT'] = 'maturefemale'
df.loc[(df['SEX'] == 'female') & (df['AGE'] >= 50), 'NEW_SEX_CAT'] = 'seniorfemale'

In [None]:
df = df.apply(lambda x: x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= 10) else x, axis=0)

###########################################<br>
4. Label Encoding<br>
###########################################

In [None]:
# Select non-numeric columns with exactly two classes. The dtype kind
# ("i"/"u"/"f" = signed/unsigned integer, float) is checked instead of comparing
# against the builtin int/float types, whose width is platform dependent.
binary_cols = [col for col in df.columns if df[col].dtype.kind not in "iuf"
               and df[col].nunique() == 2]

In [None]:
for col in binary_cols:
    df = label_encoder(df, col)

###########################################<br>
5. Rare Encoding<br>
###########################################

In [None]:
rare_analyser(df, "SURVIVED", cat_cols)

In [None]:
df = rare_encoder(df, 0.01)

In [None]:
df["NEW_TITLE"].value_counts()

###########################################<br>
6. One-Hot Encoding<br>
###########################################

In [None]:
ohe_cols = [col for col in df.columns if 10 >= df[col].nunique() > 2]

In [None]:
df = one_hot_encoder(df, ohe_cols)

In [None]:
df.head()
df.shape

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
num_cols = [col for col in num_cols if "PASSENGERID" not in col]

In [None]:
rare_analyser(df, "SURVIVED", cat_cols)

In [None]:
useless_cols = [col for col in df.columns if df[col].nunique() == 2 and
                (df[col].value_counts() / len(df) < 0.01).any(axis=None)]

df.drop(useless_cols, axis=1, inplace=True)

###########################################<br>
7. Standart Scaler<br>
###########################################

In [None]:
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
df[num_cols].head()

In [None]:
df.head()
df.shape

###########################################<br>
8. Model<br>
###########################################

In [None]:
y = df["SURVIVED"]
X = df.drop(["PASSENGERID", "SURVIVED"], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_pred, y_test)

###########################################<br>
Hiç bir işlem yapılmadan elde edilecek skor?<br>
###########################################

In [None]:
dff = load()
dff.dropna(inplace=True)
dff = pd.get_dummies(dff, columns=["Sex", "Embarked"], drop_first=True)
y = dff["Survived"]
X = dff.drop(["PassengerId", "Survived", "Name", "Ticket", "Cabin"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_pred, y_test)

Yeni ürettiğimiz değişkenler ne alemde?

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Plot the feature importances of a fitted model as a horizontal bar chart.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose columns are the features the model was trained on; only
        ``features.columns`` is read.
    num : int or None, optional
        Number of top features to show. ``None`` (default) shows all of them.
        The default no longer reads a global ``X`` at definition time.
    save : bool, optional
        When True the figure is written to 'importances.png'.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    ranked = feature_imp.sort_values(by="Value", ascending=False)
    if num is not None:
        ranked = ranked[0:num]
    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=ranked)
    plt.title('Features')
    plt.tight_layout()
    # Save before showing: after plt.show() the current figure may already be
    # released, in which case savefig would write an empty image.
    if save:
        plt.savefig('importances.png')
    plt.show()

In [None]:
# plot_importance has no `block` parameter; passing block=True raises TypeError.
plot_importance(rf_model, X_train)