#### Veri Setini hazırlama

In [2]:
import pandas as pd

# Load the census table; the `"education-num"` column is dropped right away
# because it is a duplicated column (as stated in the first notebook).
adult_census = pd.read_csv("dataset/adult-census.csv").drop(
    columns="education-num"
)

# Separate the label column from the feature columns.
target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=target_name)

#### Selector ile kategorik ve nümerik kolonları seçme

In [3]:
from sklearn.compose import make_column_selector as selector

# Split the feature columns by dtype: object-dtype columns are treated as
# categorical, every other dtype as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# Resolve each selector against `data` to get the concrete column names.
categorical_columns = categorical_columns_selector(data)
numerical_columns = numerical_columns_selector(data)

#### Preprocessorleri tanımlama

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numerical features are standardized for the linear model.
numerical_preprocessor = StandardScaler()
# Categorical features are one-hot encoded; `handle_unknown="ignore"` keeps
# `transform` from raising on categories that were not seen during `fit`.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

#### Transformerları aktarmak için ColumnTransformer tanımlama

In [5]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its matching preprocessor.
preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ]
)

#### ColumnTransformer ile modeli bir pipeline'da birleştirme

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Chain the column-wise preprocessing with the classifier so both are fitted
# and applied together as a single estimator.
model = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=500),
)
# Bare last expression: display the pipeline as the cell output.
model

#### Veri setini test-train olarak ayırma

In [7]:
from sklearn.model_selection import train_test_split

# Hold out a test set; the fixed seed makes the split reproducible.
(
    data_train,
    data_test,
    target_train,
    target_test,
) = train_test_split(data, target, random_state=42)

#### Modeli fit etme

In [8]:
# Fit the whole pipeline on the training split; binding the return value to
# `_` keeps the fitted estimator from being echoed as the cell output.
_ = model.fit(X=data_train, y=target_train)

#### Fit sonrası test datasına bakış

In [9]:
# Preview the first five held-out rows.
data_test.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
7762,56,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,0,0,40,United-States
23881,25,Private,HS-grad,Married-civ-spouse,Transport-moving,Own-child,Other,Male,0,0,40,United-States
30507,43,Private,Bachelors,Divorced,Prof-specialty,Not-in-family,White,Female,14344,0,40,United-States
28911,32,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States
19484,39,Private,Bachelors,Married-civ-spouse,Sales,Wife,White,Female,0,0,30,United-States


#### Predictionlara bakma

In [10]:
# Predicted labels for the first five held-out rows.
model.predict(X=data_test)[:5]

array([' <=50K', ' <=50K', ' >50K', ' <=50K', ' >50K'], dtype=object)

#### Test inceleme

In [13]:
# True labels for the first five held-out rows, to compare with the
# predictions. `target_test` keeps the shuffled integer index from the
# train/test split, so the slice is made explicitly positional with `.iloc`:
# a bare `[:5]` on an integer-indexed Series is ambiguous between labels and
# positions, and some pandas releases emit a FutureWarning for it.
target_test.iloc[:5]

  target_test[:5]


7762      <=50K
23881     <=50K
30507      >50K
28911     <=50K
19484     <=50K
Name: class, dtype: object

#### Model Skoru

In [12]:
# Score the fitted pipeline on the held-out split.
model.score(X=data_test, y=target_test)

0.8575874211776268

#### cross_validate tanımlama

In [14]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation on the full dataset; the pipeline (preprocessing
# included) is passed as the estimator to evaluate.
cv_results = cross_validate(estimator=model, X=data, y=target, cv=5)
# Bare last expression: show the fit/score timings and per-fold test scores.
cv_results

{'fit_time': array([0.94145107, 1.03205895, 1.01401353, 1.02163792, 1.04094672]),
 'score_time': array([0.03847623, 0.03403664, 0.03621364, 0.03948236, 0.03357983]),
 'test_score': array([0.8512642 , 0.8498311 , 0.84756347, 0.85247748, 0.85524161])}

#### Ortalama skor (arraylerin ortalaması)

In [15]:
# Summarize the per-fold test scores as mean ± standard deviation.
scores = cv_results["test_score"]
print(
    "The mean cross-validation accuracy is: {:.3f} ± {:.3f}".format(
        scores.mean(), scores.std()
    )
)

The mean cross-validation accuracy is: 0.851 ± 0.003


#### Her şeyi pipeline'da birleştirme (HistGradientBoostingClassifier ile)

Tree-based bir model kullandığımız için StandardScaler'e gerek yok.

In [16]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

# Categorical columns are integer-coded; categories not seen during `fit`
# are mapped to -1 instead of raising at transform time.
categorical_preprocessor = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Only the categorical columns are transformed; `remainder="passthrough"`
# forwards every other column unchanged (no scaling step here).
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, categorical_columns),
    ],
    remainder="passthrough",
)

# `random_state` is fixed (same seed as the train/test split) so the score is
# reproducible on a fresh run: when early stopping kicks in (automatic for
# training sets above 10,000 samples) the classifier holds out a random
# validation subset, which otherwise changes from run to run.
model = make_pipeline(
    preprocessor,
    HistGradientBoostingClassifier(random_state=42),
)

#### Fitting

In [17]:
# Fit the gradient-boosting pipeline on the training split; binding the
# return value to `_` keeps the estimator from being echoed as cell output.
_ = model.fit(X=data_train, y=target_train)

#### Model başarısı

In [19]:
# Score the gradient-boosting pipeline on the held-out split.
model.score(X=data_test, y=target_test)

0.8802718860044222