In [101]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [102]:
# Load the training data and discard the identifier-like / free-text columns
# that are not used as model features.
# `drop` is chained rather than called with `inplace=True`, so `df` is bound exactly
# once and never refers to a half-processed frame.
df = pd.read_csv("train.csv").drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

In [103]:
# Preview the first five rows to confirm which columns remain after the drop.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [104]:
# Count missing values per column to see which features need imputation.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [105]:
# Separate the target from the predictors, then hold out 20% of the rows for testing.
# The fixed random_state keeps the split reproducible across runs.
features = df.drop(columns=["Survived"])
target = df["Survived"]
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

In [106]:
# Peek at the first five rows of the training predictors.
x_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
140,3,female,,0,2,15.2458,C
439,2,male,31.0,0,0,10.5,S
817,2,male,31.0,1,1,37.0042,C
378,3,male,20.0,0,0,4.0125,C
491,3,male,21.0,0,0,7.25,S


In [107]:
# Size check of the hold-out split: (number of rows, number of feature columns).
x_test.shape

(179, 7)

In [108]:
# Missing values per column in the training split only (the data the pipeline is fitted on).
x_train.isna().sum()

Pclass        0
Sex           0
Age         141
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [124]:
# Text-valued columns: fill gaps with the most common value, then one-hot encode.
# NOTE(review): with drop="first" together with handle_unknown="ignore", a category that
# was not seen during fit is presumably encoded as all zeros, i.e. the same as the
# dropped first category — confirm against the scikit-learn OneHotEncoder docs.
categorical = ["Sex", "Embarked"]
categorical_transform = Pipeline(
    steps=[
        ("impute1", SimpleImputer(strategy="most_frequent")),
        (
            "encode",
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
        ),
    ]
)

# Continuous columns: mean-impute (the SimpleImputer default, spelled out here),
# then standardise.
numerical = ["Age", "Fare"]
numerical_transform = Pipeline(
    steps=[
        ("impute2", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
)

In [125]:
# Route each column group through its own preprocessing pipeline.
# Columns that belong to neither group are forwarded unchanged (remainder="passthrough").
preprocessing_steps = [
    ("categorical", categorical_transform, categorical),
    ("numerical", numerical_transform, numerical),
]
column = ColumnTransformer(transformers=preprocessing_steps, remainder="passthrough")

In [126]:
# End-to-end model: column preprocessing followed by a logistic-regression classifier.
# The step names ("column", "model") are the prefixes used to address nested parameters.
pipe = Pipeline(
    steps=[
        ("column", column),
        ("model", LogisticRegression()),
    ]
)

In [127]:
# Fit the preprocessing and the classifier together, using the training split only.
pipe.fit(X=x_train, y=y_train)

0,1,2
,steps,"[('column', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [135]:
# Nested parameters are addressed by walking into the pipeline one level at a time and
# joining the name of each level with a double underscore, for example:
#   step "column" -> transformer "categorical" -> step "impute1" -> argument "strategy"
#   is written as "column__categorical__impute1__strategy".
param_grid = {
    "column__categorical__impute1__strategy": [
        "most_frequent",
        "constant",
    ],
    "column__numerical__impute2__strategy": [
        "mean",
        "median",
    ],
    "model__C": [0.1, 1, 10, 100],
}

# Try every combination in the grid, scoring each one with 10-fold cross-validation.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)

In [136]:
# Run the cross-validated search on the training split only; the test split stays unseen.
grid.fit(X=x_train, y=y_train)

0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'column__categorical__impute1__strategy': ['most_frequent', 'constant'], 'column__numerical__impute2__strategy': ['mean', 'median'], 'model__C': [0.1, 1, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [137]:
# Predict on the held-out split with the tuned search object and report the accuracy,
# so the cross-validation score can be compared against data the search never saw.
# Previously the predictions were computed but never scored.
y_pred = grid.predict(x_test)
accuracy_score(y_test, y_pred)

In [138]:
# Show the winning hyper-parameter combination found by the search.
print(f"best params {grid.best_params_}")

best params {'column__categorical__impute1__strategy': 'most_frequent', 'column__numerical__impute2__strategy': 'mean', 'model__C': 0.1}


In [139]:
# Mean cross-validated score achieved by the best hyper-parameter combination.
print("internal_cv_score", grid.best_score_)

internal_cv_score 0.8034233176838811


In [140]:
# Rank every tried combination by its mean cross-validated score, best first,
# and show only the tuned parameters next to that score.
shown_columns = [
    "param_model__C",
    "param_column__categorical__impute1__strategy",
    "param_column__numerical__impute2__strategy",
    "mean_test_score",
]
cv_results = pd.DataFrame(grid.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)
cv_results[shown_columns]

Unnamed: 0,param_model__C,param_column__categorical__impute1__strategy,param_column__numerical__impute2__strategy,mean_test_score
0,0.1,most_frequent,mean,0.803423
4,0.1,most_frequent,median,0.803423
8,0.1,constant,mean,0.803423
12,0.1,constant,median,0.802015
1,1.0,most_frequent,mean,0.785192
2,10.0,most_frequent,mean,0.785192
3,100.0,most_frequent,mean,0.785192
5,1.0,most_frequent,median,0.785192
6,10.0,most_frequent,median,0.785192
7,100.0,most_frequent,median,0.785192
