# 0. setup

In [1]:
import data
import config
import pandas as pd
from copy import deepcopy
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import os
np.set_printoptions(precision=4)
import catboost
print(catboost.__version__)

1.2.5


In [2]:
data.make_prepped_csv(config.get_base_config())

Making prepped data csvs
Data prep pipe:
 (0) RowRemoverByFeatureValue[discharge_disposition_id] -> (1) RowRemoverByFeatureValue[gender]
 -> (2) RowRemoverByDuplicates[patient_nbr] -> (3) AddFeatureByNormalizing -> (4) AddFeatureBySumming
 -> (5) AddFeatureByCounting -> (6) AddFeatureBySumming[nnz] -> (7) AddFeatureAverageAge
 -> (8) AddFeatureEncounter -> (9) CategoryReducer[readmitted] -> (10) CategoryReducer[age]
 -> (11) CategoryReducer[admission_type_id] -> (12) CategoryReducer[discharge_disposition_id]
 -> (13) CategoryGroupOthers -> (14) ICDConverter -> (15) PropertySetter -> (16) TargetSeparator

Done. Saved to: C:\Users\dekel\Desktop\ml_project\240418\ml-2024\data\prepped


In [3]:
prossesed_data = pd.read_csv('C://Users/dekel/Desktop/ml_project/240418/ml-2024/data/prepped/prepped_data 0c84a8 NonStandardized.csv', index_col=0)

In [4]:
prossesed_data.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,num_medications_perDay,num_visits,num_nonEm_visits,biguanides_and_related,sulfonylureas_and_meglitinides,thiazolidinediones_and_miscellaneous,num_med_groups,age_avg,encounter,readmitted
0,Caucasian,Female,<30,Other,Other,1,1.0,Other,41.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,,NO
1,Caucasian,Female,<30,HighP,Home,7,3.0,Other,59.0,0.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,,NO
2,AfricanAmerican,Female,<30,HighP,Home,7,2.0,Other,11.0,5.0,...,6.5,3.0,3.0,0.0,1.0,0.0,1.0,25.0,,NO
3,Caucasian,Male,30-60,HighP,Home,7,2.0,Other,44.0,1.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,,NO
4,Caucasian,Male,30-60,HighP,Home,7,1.0,Other,51.0,0.0,...,8.0,0.0,0.0,0.0,1.0,0.0,1.0,45.0,,NO


In [5]:
prossesed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69987 entries, 0 to 69986
Data columns (total 33 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   race                                  69987 non-null  object 
 1   gender                                69987 non-null  object 
 2   age                                   69987 non-null  object 
 3   admission_type_id                     69987 non-null  object 
 4   discharge_disposition_id              69987 non-null  object 
 5   admission_source_id                   69987 non-null  int64  
 6   time_in_hospital                      69987 non-null  float64
 7   medical_specialty                     69987 non-null  object 
 8   num_lab_procedures                    69987 non-null  float64
 9   num_procedures                        69987 non-null  float64
 10  num_medications                       69987 non-null  float64
 11  number_outpatient   

In [6]:
prossesed_data = prossesed_data.fillna('missing')
non_integer_columns = prossesed_data.drop(columns=['readmitted']).select_dtypes(exclude=['int', 'float']).columns
id_diag_columns = [col for col in prossesed_data.columns if (('id' in col) or ('diag_' in col))]
# Combine both sets of columns
categorical_columns = list(set().union(non_integer_columns, id_diag_columns))
categorical_columns

['diag_1',
 'race',
 'diag_2',
 'thiazolidinediones_and_miscellaneous',
 'admission_source_id',
 'diag_3',
 'age',
 'diabetesMed',
 'sulfonylureas_and_meglitinides',
 'medical_specialty',
 'gender',
 'admission_type_id',
 'max_glu_serum',
 'encounter',
 'biguanides_and_related',
 'discharge_disposition_id',
 'insulin']

In [7]:
prossesed_data[categorical_columns] = prossesed_data[categorical_columns].astype('str')
prossesed_data[categorical_columns] = prossesed_data[categorical_columns].astype('category')

split to train and test:

In [8]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(prossesed_data, test_size=0.2, random_state=42, stratify=prossesed_data['readmitted'])
train_set_label = deepcopy(train_set["readmitted"])
test_set_label = deepcopy(test_set["readmitted"])
train_set.drop(columns=['readmitted'], inplace=True)
test_set.drop(columns=['readmitted'], inplace=True)

In [9]:
train_set_label.value_counts() / len(train_set)

readmitted
NO     0.910197
YES    0.089803
Name: count, dtype: float64

In [10]:
test_set_label.value_counts() / len(test_set)

readmitted
NO     0.910201
YES    0.089799
Name: count, dtype: float64

In [11]:
from catboost import Pool
train_pool = Pool(data=train_set, label=train_set_label, cat_features=categorical_columns)

cross validation to find best par

In [12]:
from catboost import CatBoost


In [14]:

params = {
    'loss_function': 'Logloss',
    'iterations': 1000,
    'custom_loss': 'AUC',
    'early_stopping_rounds': 50,
}


model = CatBoost(params)

grid = {'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.3],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
    'iterations': [1000],
    'custom_loss': ['AUC'],
    'early_stopping_rounds': [10,30,90,120,200]
       }



grid_cv =grid_search_result = model.grid_search(grid,
                                       X=train_pool,
                                       cv=6,
                                       partition_random_seed=41,
                                       plot=True,
                                  )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6834791	test: 0.6834357	best: 0.6834357 (0)	total: 317ms	remaining: 5m 16s
1:	learn: 0.6742185	test: 0.6741535	best: 0.6741535 (1)	total: 434ms	remaining: 3m 36s
2:	learn: 0.6651222	test: 0.6650451	best: 0.6650451 (2)	total: 580ms	remaining: 3m 12s
3:	learn: 0.6561382	test: 0.6560178	best: 0.6560178 (3)	total: 690ms	remaining: 2m 51s
4:	learn: 0.6475890	test: 0.6474341	best: 0.6474341 (4)	total: 795ms	remaining: 2m 38s
5:	learn: 0.6392373	test: 0.6390463	best: 0.6390463 (5)	total: 923ms	remaining: 2m 32s
6:	learn: 0.6308858	test: 0.6306522	best: 0.6306522 (6)	total: 1.03s	remaining: 2m 26s
7:	learn: 0.6226993	test: 0.6224204	best: 0.6224204 (7)	total: 1.13s	remaining: 2m 20s
8:	learn: 0.6149237	test: 0.6146139	best: 0.6146139 (8)	total: 1.22s	remaining: 2m 14s
9:	learn: 0.6071747	test: 0.6068241	best: 0.6068241 (9)	total: 1.32s	remaining: 2m 11s
10:	learn: 0.5996106	test: 0.5992573	best: 0.5992573 (10)	total: 1.42s	remaining: 2m 7s
11:	learn: 0.5923741	test: 0.5919847	best:

KeyboardInterrupt: 

In [None]:

train_data = np.random.randint(1, 100, size=(100, 10))
train_labels = np.random.randint(2, size=(100))

params = {
    'loss_function': 'Logloss',
    'iterations': 1000,
    'custom_loss': 'AUC',
    'early_stopping_rounds': 50
    'snapshot_interval': 50
}


model = CatBoost(params)

grid = {'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.3],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
       
    'iterations': [1000],
    'custom_loss': ['AUC'],
    'early_stopping_rounds': [10,30,90,120,200]
       }



grid_cv =grid_search_result = model.grid_search(grid,
                                       X=train_pool,
                                       cv=6,
                                       partition_random_seed=41,
                                       plot=True,
                                  )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6834791	test: 0.6834357	best: 0.6834357 (0)	total: 115ms	remaining: 57.4s
1:	learn: 0.6742185	test: 0.6741535	best: 0.6741535 (1)	total: 229ms	remaining: 57s
2:	learn: 0.6651222	test: 0.6650451	best: 0.6650451 (2)	total: 328ms	remaining: 54.4s
3:	learn: 0.6561382	test: 0.6560178	best: 0.6560178 (3)	total: 430ms	remaining: 53.3s
4:	learn: 0.6475890	test: 0.6474341	best: 0.6474341 (4)	total: 522ms	remaining: 51.7s
5:	learn: 0.6392373	test: 0.6390463	best: 0.6390463 (5)	total: 623ms	remaining: 51.3s
6:	learn: 0.6308858	test: 0.6306522	best: 0.6306522 (6)	total: 728ms	remaining: 51.3s
7:	learn: 0.6226993	test: 0.6224204	best: 0.6224204 (7)	total: 835ms	remaining: 51.3s
8:	learn: 0.6149237	test: 0.6146139	best: 0.6146139 (8)	total: 957ms	remaining: 52.2s
9:	learn: 0.6071747	test: 0.6068241	best: 0.6068241 (9)	total: 1.05s	remaining: 51.4s
10:	learn: 0.5996106	test: 0.5992573	best: 0.5992573 (10)	total: 1.15s	remaining: 50.9s
11:	learn: 0.5923741	test: 0.5919847	best: 0.5919847 (

In [13]:

params = {
    'loss_function': 'Logloss',
    'iterations': 1000,
    'custom_loss': 'AUC',
    'logging_level': 'Silent'
}


model = CatBoost(params)


param_grid = {

    
    'learning_rate': [0.04, 0.05, 0.06, 0.07],
    'n_estimators': [100, 200, 300],
    'depth': [3, 5, 7],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.7, 0.8, 0.9, 1.0],
    'bagging_temperature': [0.0, 0.5, 1.0],
    'random_strength': [0.0, 0.5, 1.0],
    'min_child_samples': [1, 5, 10],
    'early_stopping_rounds': [30, 120, 200],
    'leaf_estimation_method': ['Newton', 'Gradient', 'Exact'],
    'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS', 'Poisson'],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
    'gradient_estimation_method': ['Newton', 'Exact', 'SteepDesc', 'Armijo'],
    'random_seed': [42]  # Fixed random seed for reproducibility
}


grid = {'learning_rate': [0.04, 0.05, 0.06, 0.07],
        'depth': [3, 5, 7],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'iterations': [1000],
        'custom_loss': ['AUC'],
        'early_stopping_rounds': [30, 120, 200],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.7, 0.8, 0.9, 1.0],
        'bagging_temperature': [0.0, 0.5, 1.0],
        'random_strength': [0.0, 0.5, 1.0],
        'min_child_samples': [1, 5, 10],
         'leaf_estimation_method': ['Newton', 'Gradient', 'Exact'],
        'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
        'random_seed': [42],
       }


grid_cv =grid_search_result = model.grid_search(grid,
                                       X=train_pool,
                                       cv=6,
                                       partition_random_seed=41,
                                       plot=True,
                                       verbose=False
                                  )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate


KeyboardInterrupt



In [None]:

params = {
    'loss_function': 'Logloss',
    'iterations': 1000,
    'custom_loss': 'AUC',
    'logging_level': 'Silent'
}


model = CatBoost(params)


param_grid = {

    
    'learning_rate': [0.04, 0.05, 0.06, 0.07],
    'n_estimators': [100, 200, 300],
    'depth': [3, 5, 7],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.7, 0.8, 0.9, 1.0],
    'bagging_temperature': [0.0, 0.5, 1.0],
    'random_strength': [0.0, 0.5, 1.0],
    'min_child_samples': [1, 5, 10],
    'early_stopping_rounds': [30, 120, 200],
    'leaf_estimation_method': ['Newton', 'Gradient', 'Exact'],
    'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS', 'Poisson'],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
    'gradient_estimation_method': ['Newton', 'Exact', 'SteepDesc', 'Armijo'],
    'random_seed': [42]  # Fixed random seed for reproducibility
}


grid = {'learning_rate': [0.04, 0.05, 0.06, 0.07],
        'depth': [3, 5, 7],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'iterations': [1000],
        'custom_loss': ['AUC'],
        'early_stopping_rounds': [30, 120, 200],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.7, 0.8, 0.9, 1.0],
        'bagging_temperature': [0.0, 0.5, 1.0],
        'random_strength': [0.0, 0.5, 1.0],
        'min_child_samples': [1, 5, 10],
         
        'random_seed': [42],
       }


grid_cv =grid_search_result = model.grid_search(grid,
                                       X=train_pool,
                                       cv=6,
                                       partition_random_seed=41,
                                       plot=True,
                                       verbose=False
                                  )

In [14]:
grid_cv


NameError: name 'grid_cv' is not defined

In [None]:
model.

In [None]:
need to add class_weights

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

In [23]:
# parameter yuning using cross validation:

In [None]:
from catboost import cv

params = {
    'loss_function': 'Logloss',
    'iterations': 80,
    'custom_loss': 'AUC',
    'learning_rate': 0.5,
}

cv_data = cv(
    params = params,
    pool = train_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    verbose=False
)

In [9]:
from catboost import CatBoostClassifier

In [14]:
prossesed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69987 entries, 0 to 69986
Data columns (total 33 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   race                                  69987 non-null  object 
 1   gender                                69987 non-null  object 
 2   age                                   69987 non-null  object 
 3   admission_type_id                     69987 non-null  object 
 4   discharge_disposition_id              69987 non-null  object 
 5   admission_source_id                   69987 non-null  int64  
 6   time_in_hospital                      69987 non-null  float64
 7   medical_specialty                     69987 non-null  object 
 8   num_lab_procedures                    69987 non-null  float64
 9   num_procedures                        69987 non-null  float64
 10  num_medications                       69987 non-null  float64
 11  number_outpatient   

In [20]:
non_integer_columns = train_set.select_dtypes(exclude=['int', 'float']).columns
id_columns = [col for col in train_set.columns if (('id' in col) or ('diag' in col))]

# Combine both sets of columns
categorical_columns = list(set().union(non_integer_columns, id_columns))


In [41]:
categorical_columns

['admission_type_id',
 'medical_specialty',
 'encounter',
 'diag_2',
 'thiazolidinediones_and_miscellaneous',
 'max_glu_serum',
 'diag_1',
 'age',
 'gender',
 'diag_3',
 'admission_source_id',
 'number_diagnoses',
 'discharge_disposition_id',
 'biguanides_and_related',
 'sulfonylureas_and_meglitinides',
 'insulin',
 'diabetesMed',
 'race']

In [42]:
train_set[categorical_columns] = train_set[categorical_columns].astype('category')

In [33]:
encoder = OneHotEncoder()
one_hot_encoded  = encoder.fit_transform(train_set[categorical_columns])


# one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded DataFrame with the original DataFrame
# df_encoded = pd.concat([train_set, one_hot_df], axis=1)

# Drop the original categorical columns
# df_encoded.drop(categorical_columns, axis=1, inplace=True)



In [40]:
one_hot_encoded.()

TypeError: spmatrix.getcol() missing 1 required positional argument: 'j'

In [32]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Sample DataFrame
df = train_set

# Identify non-integer columns and columns with "id" in their names
non_integer_columns = df.select_dtypes(exclude=['int', 'float']).columns
id_columns = [col for col in df.columns if 'id' in col]

# Combine both sets of columns
categorical_columns = list(set().union(non_integer_columns, id_columns))

# Convert identified columns to categorical data type
df[categorical_columns] = df[categorical_columns].astype('category')

# Apply one-hot encoding to all categorical columns
encoder = OneHotEncoder()
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

# Get the feature names after one-hot encoding
feature_names = encoder.get_feature_names_out(categorical_columns)

# Create DataFrame from one-hot encoded data
one_hot_df = pd.DataFrame(one_hot_encoded, columns=feature_names)

# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([df, one_hot_df], axis=1)

# Drop the original categorical columns
df_encoded.drop(categorical_columns, axis=1, inplace=True)

print(df_encoded)


ValueError: Shape of passed values is (55989, 1), indices imply (55989, 136)

In [45]:
categorical_columns

['admission_type_id',
 'medical_specialty',
 'encounter',
 'diag_2',
 'thiazolidinediones_and_miscellaneous',
 'max_glu_serum',
 'diag_1',
 'age',
 'gender',
 'diag_3',
 'admission_source_id',
 'number_diagnoses',
 'discharge_disposition_id',
 'biguanides_and_related',
 'sulfonylureas_and_meglitinides',
 'insulin',
 'diabetesMed',
 'race']

In [47]:
# Check data types of categorical columns
print(train_set[categorical_columns].dtypes)

# Convert non-integer categorical columns to string
non_integer_cols = train_set[categorical_columns].select_dtypes(exclude=['int']).columns
train_set[non_integer_cols] = train_set[non_integer_cols].astype(str)

# Convert non-string categorical columns to integer
non_string_cols = train_set[categorical_columns].select_dtypes(exclude=['object']).columns
train_set[non_string_cols] = train_set[non_string_cols].astype(int)


admission_type_id                       category
medical_specialty                       category
encounter                               category
diag_2                                  category
thiazolidinediones_and_miscellaneous    category
max_glu_serum                           category
diag_1                                  category
age                                     category
gender                                  category
diag_3                                  category
admission_source_id                     category
number_diagnoses                        category
discharge_disposition_id                category
biguanides_and_related                  category
sulfonylureas_and_meglitinides          category
insulin                                 category
diabetesMed                             category
race                                    category
dtype: object


In [None]:
from catboost import cv

params = {
    'loss_function': 'Logloss',
    'iterations': 80,
    'custom_loss': 'AUC',
    'learning_rate': 0.5,
}

cv_data = cv(
    params = params,
    pool = train_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    verbose=False
)

In [48]:
train_set[categorical_columns] = train_set[categorical_columns].astype('category')
model = CatBoostClassifier()
model.fit(train_set, train_set_label, cat_features=categorical_columns)

Learning rate set to 0.057461
0:	learn: 0.6404137	total: 164ms	remaining: 2m 43s
1:	learn: 0.5946235	total: 347ms	remaining: 2m 52s
2:	learn: 0.5541384	total: 511ms	remaining: 2m 49s
3:	learn: 0.5193406	total: 697ms	remaining: 2m 53s
4:	learn: 0.4899367	total: 895ms	remaining: 2m 58s
5:	learn: 0.4641250	total: 1.07s	remaining: 2m 57s
6:	learn: 0.4417115	total: 1.24s	remaining: 2m 56s
7:	learn: 0.4231139	total: 1.42s	remaining: 2m 55s
8:	learn: 0.4069527	total: 1.57s	remaining: 2m 53s
9:	learn: 0.3925132	total: 1.73s	remaining: 2m 51s
10:	learn: 0.3805383	total: 1.8s	remaining: 2m 42s
11:	learn: 0.3692692	total: 1.96s	remaining: 2m 41s
12:	learn: 0.3597659	total: 2.12s	remaining: 2m 40s
13:	learn: 0.3515603	total: 2.28s	remaining: 2m 40s
14:	learn: 0.3443924	total: 2.43s	remaining: 2m 39s
15:	learn: 0.3383615	total: 2.59s	remaining: 2m 39s
16:	learn: 0.3329127	total: 2.73s	remaining: 2m 37s
17:	learn: 0.3282390	total: 2.83s	remaining: 2m 34s
18:	learn: 0.3242977	total: 2.98s	remaining: 

<catboost.core.CatBoostClassifier at 0x24f9ec2a440>

In [50]:
y = model.predict(train_set)

In [51]:
train_set_label

22170     NO
49921     NO
17294     NO
49453     NO
43780     NO
        ... 
17748     NO
18143     NO
37369    YES
8028      NO
13449     NO
Name: readmitted, Length: 55989, dtype: object

In [52]:
y

array(['NO', 'NO', 'NO', ..., 'NO', 'NO', 'NO'], dtype=object)