In [1]:
from utils import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

# Helper objects provided by the project-local `utils` module (star-imported above).
# They are created in the same order as before: feature engineering, WoE,
# correlation helpers, phik-matrix plotting.
fe, woe = Feature_engineering(), WoE()
corr_methods, plot_phik = Correlation_methods(), Phik_matrix()

In [2]:
# --- Training frame: load -> feature engineering -> WoE features ------------
# NOTE(review): what `fe.feature_engineering` / `woe.features_with_woe` do is
# defined in `utils`, not here. The call order (train first, then test) is kept
# in case those helpers carry state between calls.
df = woe.features_with_woe(
    df=fe.feature_engineering(
        df=pd.read_csv('data/train.csv')
    )
)

# --- Test frame: same two steps; keep the ids aside for the submission ------
test = pd.read_csv('data/test.csv')
id_col = test['id']
test = woe.features_with_woe(
    df=fe.feature_engineering(df=test)
)

# Columns selected by the helper on the training frame are removed from both
# frames so that train and test keep the same feature set.
cols_to_drop = fe.features_to_drop(df=df, target='smoking', threshold=0.15)
df = df.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

# Missing `lfs_cat` values are filled with 1 on the test frame only.
# NOTE(review): the training frame is not filled here — confirm it has no gaps.
test = test.assign(lfs_cat=test['lfs_cat'].fillna(1))


print("train size:", df.shape)
print("test size:", test.shape)


# Target / feature split.
y = df['smoking']
X = df.drop(columns='smoking')

# One seed drives the CV splitter, the hold-out split and the models below.
seed = 1
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Stratified 70/30 hold-out split (not referenced by the cells visible here).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=seed,
)

train size: (15000, 29)
test size: (10000, 28)


In [6]:
# Standardise only the first 15 columns (selected by position) so that the
# categorical columns are not scaled; all remaining columns pass through as-is.
# NOTE(review): the training frame printed above has 28 feature columns, which
# means 13 columns pass through, not 3 as an earlier comment said — confirm
# that 15 is the intended cut-off.
scaled_positions = np.arange(15)
preprocessor = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), scaled_positions)],
    remainder='passthrough',
)


# Base learners. The linear model, the single tree and the SVC get the partial
# scaling step in front; the two tree ensembles consume the features unchanged.

# Logistic regression with library-default settings.
logistic_clf = LogisticRegression()
pipeline_logis = Pipeline(steps=[
    ('scaler', preprocessor),
    ('logistic', logistic_clf),
])

# Single depth-limited decision tree.
tree_clf = DecisionTreeClassifier(
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=2,
    random_state=seed,
)
pipeline_dt = Pipeline(steps=[
    ('scaler', preprocessor),
    ('decisiontree', tree_clf),
])

# Bagging of 199 depth-10 trees, each fitted on 85% of the samples.
bagged_trees = BaggingClassifier(
    DecisionTreeClassifier(random_state=seed, max_depth=10),
    n_estimators=199,
    max_samples=.85,
    random_state=seed,
)
pipeline_bc = Pipeline(steps=[('bagging', bagged_trees)])

# Random forest with 199 trees.
forest_clf = RandomForestClassifier(
    n_estimators=199,
    max_depth=20,
    max_features='sqrt',
    min_samples_leaf=5,
    min_samples_split=20,
    bootstrap=True,
    random_state=seed,
)
pipeline_rf = Pipeline(steps=[('randomForest', forest_clf)])

# SVC with probability estimates enabled (defined here, but currently not part
# of the stacked estimator list).
svc_clf = SVC(probability=True, random_state=seed)
pipeline_svc = Pipeline(steps=[
    ('scaler', preprocessor),
    ('svc', svc_clf),
])

# (name, estimator) pairs handed to StackingClassifier, in insertion order.
# The SVC pipeline is currently disabled.
estimators = list({
    'logistic': pipeline_logis,
    'decisionTree': pipeline_dt,
    'bagging': pipeline_bc,
    'randomForest': pipeline_rf,
    # 'svc': pipeline_svc,
}.items())

In [5]:
# Meta-learner: a 400-tree random forest stacked on top of the base learners.
meta_params = dict(
    n_estimators=400,
    max_depth=20,
    max_features='sqrt',
    min_samples_leaf=5,
    min_samples_split=20,
    bootstrap=True,
    random_state=seed,
)

# passthrough=True feeds the original features to the meta-learner alongside
# the base-learner predictions; `skf` is used as the stacking-internal CV.
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(**meta_params),
    cv=skf,
    passthrough=True,
    n_jobs=-1,
)

# Outer evaluation: ROC-AUC over the same stratified 5-fold splitter.
cv_scores = cross_val_score(stacking, X, y, scoring='roc_auc', cv=skf)

print(cv_scores)
print('STD:', cv_scores.std())
print('Mean ROC-AUC:', cv_scores.mean())

[0.87852932 0.88756322 0.87994279 0.88658259 0.8890092 ]
STD: 0.004250133887023619
Mean ROC-AUC: 0.8843254266244402


# Prediction for the Kaggle competition

In [7]:
# Rebuild the stacking configuration used for cross-validation and fit it on
# the full training set before scoring the Kaggle test frame.
stacking = StackingClassifier(estimators=estimators,
                              final_estimator=RandomForestClassifier(n_estimators= 400,
                                                                    min_samples_split= 20,
                                                                    min_samples_leaf= 5,
                                                                    max_features= 'sqrt',
                                                                    max_depth= 20,
                                                                    bootstrap=True,
                                                                    random_state=seed),
                              cv=skf,
                              passthrough=True,
                              n_jobs=-1
)
stacking.fit(X, y)

# The scaling step picks its columns by position (np.arange(15)), so the test
# frame must expose exactly the training features in the training order.
# Indexing with X.columns makes that alignment explicit: it is a no-op when the
# frames already match, drops any extra test-only column, and raises a KeyError
# if a training feature is missing from `test` instead of scoring misaligned data.
# Column 1 of predict_proba is kept as the submitted score
# (assumes a binary 0/1 target where label 1 is the positive class — TODO confirm).
y_pred = stacking.predict_proba(test[X.columns])[:, 1]

# Uncomment to write the submission file:
# pd.DataFrame({'id':id_col, 'smoking':y_pred}).to_csv('predictions.csv', index=False)