# 2022-12-21
## XGBoost
https://xgboost.readthedocs.io/en/stable/python/python_intro.html

In [147]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.datasets import load_diabetes
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, truncnorm, randint
from xgboost import XGBRegressor, XGBClassifier



from sklearn.preprocessing import StandardScaler

from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression

from typing import List, Dict, Any

import pandas as pd

In [3]:
# Load data
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
print(X.shape)

(442, 10)


In [4]:
# splitting data

x_train, x_test, y_train, y_test = train_test_split(X, y)

In [5]:
# training Decision Tree with default params

clf = DecisionTreeRegressor()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(mean_squared_error(y_test, y_pred))

6616.7657657657655


In [7]:
# training GB with default params

clf = GradientBoostingRegressor()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(mean_squared_error(y_test, y_pred))

3697.3184875870847


In [6]:
# training XGBoost with default params

clf = XGBRegressor()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(mean_squared_error(y_test, y_pred))

4261.337344412924


In [11]:
params = {
    "max_depth": randint(1, 100),
    "max_leaves": randint(1, 100),
    "n_estimators": randint(5, 500),
    "gamma": uniform(1, 9),
    'reg_lambda' : uniform(0,1),
}


forrest = XGBRegressor()

search = RandomizedSearchCV(forrest, params, n_iter=100, cv=5, random_state=42, scoring="neg_mean_squared_error", n_jobs=-1)

search.fit(x_train, y_train)

In [12]:
clf = XGBRegressor(**search.best_estimator_.get_params())
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(mean_squared_error(y_test, y_pred))

3412.06776192705


## Compare Tree Models on Titanic Data

Define preprocessing functions

In [16]:
def extract_name_info(full_name: str) -> str:
    sencod_half = full_name.split(",")[1]
    return sencod_half.split(".")[0].strip()

def engineer_features(data: pd.DataFrame) -> pd.DataFrame:
    df = data.copy()
    df["ticket_is_expensive"] = df["Fare"].apply(lambda x: int(x > 50))
    df["is_alone"] = (df["SibSp"] + df["Parch"]).apply(lambda x: int(x == 0))

    df["is_child"] = 0
    df["is_adult"] = 0
    df["is_elder"] = 0
    df.loc[df["Age"] <= 14, "is_child"] = 1
    df.loc[(df["Age"] > 14) & (df["Age"] <= 55), "is_adult"] = 1
    df.loc[(df["Age"] > 55), "is_elder"] = 1
    return df

def process_age(data: pd.DataFrame) -> pd.DataFrame:
    df = data.copy()
    df["name_info"] = df["Name"].apply(extract_name_info)

    df["Age"].fillna(df["Age"].mean())
    df.loc[df["name_info"] == "Miss"]["Age"] = df.loc[df["name_info"] == "Miss"]["Age"].mean()
    df.loc[df["name_info"] == "Mrs"]["Age"] = df.loc[df["name_info"] == "Mrs"]["Age"].mean()

    return df

def process_dataset(data: pd.DataFrame, label: str, categorical_features: List[str], continues_features: List[str]) -> pd.DataFrame:
    df = data.copy()
    df.drop(columns=list(set(df.columns) - set(continues_features + categorical_features + [label])), axis=1, inplace=True)
    for feature in categorical_features:
        dummies = pd.get_dummies(df[feature])
        df = pd.concat([df, dummies], axis=1)
        df.drop(columns=[feature], axis=1, inplace=True)
    return df

Load, preprocess and split data

In [17]:
data_df = pd.read_csv("./data/titanic_train.csv")

categorical_features=["Pclass", "Embarked", "Sex"]
continues_features=["ticket_is_expensive", "is_alone", "is_child", "is_adult", "is_elder"]
label = "Survived"

data_df = engineer_features(data_df)
data_df = process_age(data_df)
data_df = process_dataset(data_df, label=label, categorical_features=categorical_features, continues_features=continues_features)
data_df.dropna(inplace=True)

train_df, test_df = train_test_split(data_df, stratify=data_df[label].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df["name_info"] == "Miss"]["Age"] = df.loc[df["name_info"] == "Miss"]["Age"].mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df["name_info"] == "Mrs"]["Age"] = df.loc[df["name_info"] == "Mrs"]["Age"].mean()


Extracting features and labels

In [23]:
features = list(data_df.columns)
features.remove(label)

x_train = train_df[features].values
x_test = test_df[features].values

y_train = train_df[label].values
y_test = test_df[label].values

In [24]:
# Let's try to model with LR

scaler = StandardScaler().fit(data_df[features].values)

lg = LogisticRegression()
lg.fit(scaler.transform(x_train), y_train)

predictions = lg.predict(scaler.transform(x_test))
print(f1_score(y_test, predictions))

0.6913580246913581


In [80]:
# Let's try to model with DecisionTree

model = DecisionTreeClassifier()
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print(f1_score(y_test, predictions))

0.7361963190184049


In [76]:
# Let's try to model with RandomForrest

model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print(f1_score(y_test, predictions))

0.7439024390243903


In [78]:
# Let's try to model with AdaBoost

model = AdaBoostClassifier()
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print(f1_score(y_test, predictions))

0.7108433734939759


In [91]:
# Let's try to model with GBoost

model = GradientBoostingClassifier()
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print(f1_score(y_test, predictions))

0.7361963190184049


In [98]:
# Let's try to model with XGBoost

model = XGBClassifier()
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print(f1_score(y_test, predictions))

0.7439024390243903


Let's fine-tune some of the models

In [103]:
params = {
    "min_samples_leaf": randint(1, 500),
    "max_depth": randint(10, 600),
    "min_samples_split": uniform(0.01, 0.199)
}

forrest = DecisionTreeClassifier()

search = RandomizedSearchCV(forrest, params, n_iter=100, cv=7, random_state=42)
search.fit(x_train, y_train)

model = DecisionTreeClassifier(**search.best_estimator_.get_params())
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print(f1_score(y_test, predictions))

0.7295597484276729


In [105]:
# Searching for the best random forrest

params = {
    "n_estimators": randint(5, 500),
    "min_samples_leaf": randint(1, 500),
    "max_depth": randint(10, 300),
    "min_samples_split": uniform(0.01, 0.199)
}

forrest = RandomForestClassifier()

search = RandomizedSearchCV(forrest, params, n_iter=100, cv=7, random_state=42, n_jobs=-1)
search.fit(x_train, y_train)

model = RandomForestClassifier(**search.best_estimator_.get_params())
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print(f1_score(y_test, predictions))

0.7295597484276729


In [136]:
# Searching for the best random forrest

params = {
    'n_estimators': range(8, 20),
    'max_depth': range(6, 10),
    'learning_rate': [.4, .45, .5, .55, .6],
    'colsample_bytree': [.6, .7, .8, .9, 1]
}

forrest = XGBClassifier()

search = RandomizedSearchCV(forrest, params, n_iter=100, cv=7, random_state=42, n_jobs=-1)
search.fit(x_train, y_train)

model = XGBClassifier(**search.best_estimator_.get_params())
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print(f1_score(y_test, predictions))

0.7361963190184049


In [137]:
search.best_estimator_.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 0.7,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.4,
 'max_bin': 256,
 'max_cat_threshold': 64,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 6,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 11,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

## House price dataset

In [108]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split

from typing import List, Optional

import eli5

import warnings

import sklearn

from sklearn.metrics import mean_absolute_error, mean_squared_error

import numpy as np

In [110]:
class ToBinaryFeaturesTransfromer(BaseEstimator, TransformerMixin):
    """ Transformer that transforms continous or categorical data into binary features if needed

    Args:
        BaseEstimator (_type_): _description_
        TransformerMixin (_type_): _description_
    """
    def __init__(self, features: List[str]) -> None:
        self.features = features

    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        data = X.copy()
        for feature in self.features:
            data[f"has_{feature.lower()}"] = data[feature].apply(lambda x: int(x > 0))
        data.drop(self.features, axis=1)
        return data

In [150]:
def extract_best_model_params(params: Dict[str, Any]):
    filtered_dict = {}
    for key, value in params.items():
        if "model__" in key:
            filtered_dict[key.split("model__")[-1]] = value
    return filtered_dict

In [155]:
data = pd.read_csv("./data/housing_price.csv")
train_df, test_df = train_test_split(data, random_state=42)

In [156]:
cont_features = ["GrLivArea", "YearBuilt", "OverallQual", "OverallCond", "YearRemodAdd", "has_totalbsmtsf", "has_garagearea", "has_poolarea"]
cat_features = ["Neighborhood", "HouseStyle"]
label = "SalePrice"

In [165]:
cont_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encode", OneHotEncoder(handle_unknown="ignore"))
])

In [158]:
pipeline = Pipeline([
    ("to_binary_features_transformer", ToBinaryFeaturesTransfromer(features=["GarageArea", "TotalBsmtSF", "PoolArea"])),
    ("ColumnTransformer", ColumnTransformer([
        ("cont_transformer", cont_pipeline, cont_features),
        ("cat_transformer", cat_pipeline, cat_features)
    ])),
    ("model", LinearRegression())
])

In [159]:
pipeline.fit(train_df, train_df["SalePrice"].values)
predictions = pipeline.predict(test_df)

print("MAE: ", mean_absolute_error(test_df[label], predictions))
print("MSE: ", mean_squared_error(test_df[label], predictions))

MAE:  20847.35548426374
MSE:  1146446451.2206194


In [160]:
cont_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encode", OneHotEncoder())
])

In [161]:
pipeline = Pipeline([
    ("to_binary_features_transformer", ToBinaryFeaturesTransfromer(features=["GarageArea", "TotalBsmtSF", "PoolArea"])),
    ("ColumnTransformer", ColumnTransformer([
        ("cont_transformer", cont_pipeline, cont_features),
        ("cat_transformer", cat_pipeline, cat_features)
    ])),
    ("model", DecisionTreeRegressor())
])

pipeline.fit(train_df, train_df["SalePrice"].values)
predictions = pipeline.predict(test_df)

print("MAE: ", mean_absolute_error(test_df[label], predictions))
print("MSE: ", mean_squared_error(test_df[label], predictions))

MAE:  26134.64383561644
MSE:  1467194498.9981735


In [162]:
pipeline = Pipeline([
    ("to_binary_features_transformer", ToBinaryFeaturesTransfromer(features=["GarageArea", "TotalBsmtSF", "PoolArea"])),
    ("ColumnTransformer", ColumnTransformer([
        ("cont_transformer", cont_pipeline, cont_features),
        ("cat_transformer", cat_pipeline, cat_features)
    ])),
    ("model", RandomForestRegressor())
])

pipeline.fit(train_df, train_df["SalePrice"].values)
predictions = pipeline.predict(test_df)

print("MAE: ", mean_absolute_error(test_df[label], predictions))
print("MSE: ", mean_squared_error(test_df[label], predictions))

MAE:  19526.429833855185
MSE:  905137971.4287437


In [163]:
pipeline = Pipeline([
    ("to_binary_features_transformer", ToBinaryFeaturesTransfromer(features=["GarageArea", "TotalBsmtSF", "PoolArea"])),
    ("ColumnTransformer", ColumnTransformer([
        ("cont_transformer", cont_pipeline, cont_features),
        ("cat_transformer", cat_pipeline, cat_features)
    ])),
    ("model", XGBRegressor())
])

pipeline.fit(train_df, train_df["SalePrice"].values)
predictions = pipeline.predict(test_df)

print("MAE: ", mean_absolute_error(test_df[label], predictions))
print("MSE: ", mean_squared_error(test_df[label], predictions))

MAE:  19453.482887414382
MSE:  895996821.589289


Searching for best hyperparams

In [166]:
params = {
    "model__min_samples_leaf": randint(1, 500),
    "model__max_depth": randint(10, 600),
    "model__min_samples_split": uniform(0.01, 0.199)
}

forrest = Pipeline([
    ("to_binary_features_transformer", ToBinaryFeaturesTransfromer(features=["GarageArea", "TotalBsmtSF", "PoolArea"])),
    ("ColumnTransformer", ColumnTransformer([
        ("cont_transformer", cont_pipeline, cont_features),
        ("cat_transformer", cat_pipeline, cat_features)
    ])),
    ("model", DecisionTreeRegressor())
])

search = RandomizedSearchCV(forrest, params, n_iter=100, cv=7, random_state=42)
search.fit(train_df, train_df["SalePrice"].values)

pipeline = Pipeline([
    ("to_binary_features_transformer", ToBinaryFeaturesTransfromer(features=["GarageArea", "TotalBsmtSF", "PoolArea"])),
    ("ColumnTransformer", ColumnTransformer([
        ("cont_transformer", cont_pipeline, cont_features),
        ("cat_transformer", cat_pipeline, cat_features)
    ])),
    ("model", DecisionTreeRegressor(**extract_best_model_params(search.best_estimator_.get_params())))
])
pipeline.fit(train_df, train_df["SalePrice"].values)

predictions = pipeline.predict(test_df)

print("MAE: ", mean_absolute_error(test_df[label], predictions))
print("MSE: ", mean_squared_error(test_df[label], predictions))


MAE:  25516.887432574957
MSE:  1689124942.3577824


In [169]:
params = {
    'model__max_depth': [6,10],
    'model__learning_rate': [0.01, 0.08],
    'model__colsample_bylevel': [0.3, 0.4],
}

forrest = Pipeline([
    ("to_binary_features_transformer", ToBinaryFeaturesTransfromer(features=["GarageArea", "TotalBsmtSF", "PoolArea"])),
    ("ColumnTransformer", ColumnTransformer([
        ("cont_transformer", cont_pipeline, cont_features),
        ("cat_transformer", cat_pipeline, cat_features)
    ])),
    ("model", XGBRegressor())
])

search = RandomizedSearchCV(forrest, params, n_iter=100, cv=7, random_state=42, n_jobs=-1)
search.fit(train_df, train_df["SalePrice"].values)

pipeline = Pipeline([
    ("to_binary_features_transformer", ToBinaryFeaturesTransfromer(features=["GarageArea", "TotalBsmtSF", "PoolArea"])),
    ("ColumnTransformer", ColumnTransformer([
        ("cont_transformer", cont_pipeline, cont_features),
        ("cat_transformer", cat_pipeline, cat_features)
    ])),
    ("model", XGBRegressor(**extract_best_model_params(search.best_estimator_.get_params())))
])
pipeline.fit(train_df, train_df["SalePrice"].values)

predictions = pipeline.predict(test_df)

print("MAE: ", mean_absolute_error(test_df[label], predictions))
print("MSE: ", mean_squared_error(test_df[label], predictions))




MAE:  18047.84691780822
MSE:  758156982.429825
