In [198]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.metrics import accuracy_score,root_mean_squared_error,r2_score,confusion_matrix,classification_report
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer,OneHotEncoder,OrdinalEncoder,LabelEncoder,StandardScaler,MinMaxScaler,RobustScaler
from torch.distributed.pipelining import pipeline
from xgboost import XGBRegressor,XGBClassifier
from sklearn.cluster import KMeans,DBSCAN
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.linear_model import ElasticNet, LogisticRegression


In [41]:
df=sns.load_dataset('titanic')
df.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [43]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [55]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Transformations

In [104]:
X,y=df.drop(['survived','alive','embark_town'],axis=1),df['survived']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,stratify=y,random_state=42)
X.columns

Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class',
       'who', 'adult_male', 'deck', 'alone'],
      dtype='object')

In [106]:
X['who'].unique()

array(['man', 'woman', 'child'], dtype=object)

In [137]:
categorical_col=['sex','embarked','who','class','deck','alone','adult_male']
nominal_col=['sex','deck','embarked','alone','adult_male']
ordinal_col=['class','who']
class_order=['Third','Second',"First"]
who_order=['man','woman','child']

ordinal_order=[class_order,who_order]

In [138]:
def imputer(X):
    imputer_df=X.copy()
    imputer_df['age']=imputer_df.groupby(['pclass','who'])['age'].transform(lambda x:x.fillna(x.mean()) )
    return imputer_df

In [200]:
trns1=ColumnTransformer(transformers=[
    ('mean_imputer',FunctionTransformer(func=imputer,validate=False),['age','pclass','who']),
    ('most_frequent',SimpleImputer(strategy='most_frequent'),['deck','embarked'])
],remainder='passthrough',verbose_feature_names_out=False).set_output(transform='pandas')

trns2=ColumnTransformer(transformers=[
    ("ohe",OneHotEncoder(handle_unknown='ignore',sparse_output=False),nominal_col),
    ('ODE',OrdinalEncoder(categories=ordinal_order),ordinal_col),
],remainder='passthrough',verbose_feature_names_out=False).set_output(transform='pandas')


pipe=Pipeline(steps=[
    ('trns1',trns1),
    ('trns2',trns2),
    ("select K best features",SelectKBest(score_func=chi2,k=10)),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear'))
]).set_output(transform='pandas')

# pipe.fit(X,y)


0,1,2
,steps,"[('trns1', ...), ('trns2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('mean_imputer', ...), ('most_frequent', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function imp...0020E29EB8C10>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ohe', ...), ('ODE', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Third', 'Second', ...], ['man', 'woman', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,score_func,<function chi...0020E29EF2E60>
,k,10

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,100


## Grid Search CV

In [201]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier # Assuming this is available and you meant XGBClassifier


param_grid_list = [
    # 1. LogisticRegression
    {
        'classifier': [LogisticRegression(random_state=42)],
        # LBFGS only supports l2 or None penalty
        "classifier__penalty":['l2', None],
        'classifier__C':[0.01, 0.1, 1],
        "classifier__solver":['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
    },
    {
        'classifier': [LogisticRegression(random_state=42)],
        # 'saga' solver supports l1, l2, and elasticnet
        "classifier__penalty":['l1', 'l2', 'elasticnet'],
        'classifier__C':[0.01, 0.1, 1],
        "classifier__solver":['saga']
    },
    # 2. SVC
    {
        'classifier': [SVC(random_state=42)],
        'classifier__kernel':['rbf','sigmoid'],
        'classifier__gamma':['auto',"scale"],
        "classifier__C":[0.1, 1.0]
    },

    # 3. DecisionTreeClassifier
    {
        'classifier': [DecisionTreeClassifier(random_state=42)],
        "classifier__max_depth":[None, 5, 10],
        "classifier__min_samples_split":[2, 5],
        "classifier__min_samples_leaf":[1, 2, 4]
    },

    # 4. AdaBoostClassifier
    {
        'classifier': [AdaBoostClassifier(random_state=42)],
        "classifier__n_estimators":[50, 100],
        'classifier__learning_rate':[0.1, 1.0],
    },

    # 5. RandomForestClassifier
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        "classifier__n_estimators":[100, 200],
        "classifier__max_depth":[None, 10],
    },

    # 6. GradientBoostingClassifier
    {
        'classifier': [GradientBoostingClassifier(random_state=42)],
        "classifier__n_estimators":[100, 200],
        "classifier__learning_rate":[0.1, 1.0],
    },

    # 7. XGBClassifier (Assuming you meant Classifier, not Regressor)
    {
        'classifier': [XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')], # Added common params
        "classifier__learning_rate":[0.1, 0.5],
        "classifier__n_estimators":[100, 200]
    },

    # 8. LGBMClassifier
    {
        'classifier':[LGBMClassifier(random_state=42)],
        'classifier__n_estimators':[100,200,300,500],
        'classifier__learning_rate':[0.01,0.1], # Removed very large learning rates (1, 1.2, 3, 5) which are rarely effective
    }
]

In [202]:
score=[]
for model_params in param_grid_list:
    model_name = model_params['classifier'][0]
    grid=GridSearchCV(estimator=pipe,param_grid=model_params,cv=5,verbose=0,n_jobs=-1,scoring='accuracy')
    grid.fit(X,y)
    score.append({'model':model_name,
                  'best_params':grid.best_params_,
                  "best_score":grid.best_score_})

result_df=pd.DataFrame(data=score)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
15 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "D:\DAIAugust2025\MYCondaEnvs\dl_env\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 224
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


In [203]:
result_df.sort_values(by='best_score',ascending=False)

Unnamed: 0,model,best_params,best_score
8,LGBMClassifier(random_state=42),{'classifier': LGBMClassifier(random_state=42)...,0.833928
6,GradientBoostingClassifier(random_state=42),{'classifier': GradientBoostingClassifier(rand...,0.830538
7,"XGBClassifier(base_score=None, booster=None, c...","{'classifier': XGBClassifier(base_score=None, ...",0.828291
5,RandomForestClassifier(random_state=42),{'classifier': RandomForestClassifier(random_s...,0.815994
3,DecisionTreeClassifier(random_state=42),{'classifier': DecisionTreeClassifier(random_s...,0.813728
4,AdaBoostClassifier(random_state=42),{'classifier': AdaBoostClassifier(random_state...,0.80585
0,LogisticRegression(random_state=42),{'classifier': LogisticRegression(random_state...,0.794595
2,SVC(random_state=42),"{'classifier': SVC(random_state=42), 'classifi...",0.759789
1,LogisticRegression(random_state=42),{'classifier': LogisticRegression(random_state...,0.68808


In [1]:
import pandas

In [None]:
df=pd.read_csv(r"", index_col=0)