In [44]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [45]:
# Load the kidney-stone dataset from a path relative to the notebook's
# working directory, then display it (pandas truncates the rendered frame).
df = pd.read_csv('dataset/kidney_stone_data.csv')
df

Unnamed: 0,treatment,stone_size,success
0,B,large,1
1,A,large,1
2,A,large,0
3,A,large,1
4,A,large,1
...,...,...,...
695,B,small,0
696,B,small,1
697,B,small,1
698,A,large,1


In [46]:
# Column dtypes and non-null counts: two object columns and one int64 target.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   treatment   700 non-null    object
 1   stone_size  700 non-null    object
 2   success     700 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 16.5+ KB


In [47]:
# Summary statistics for the numeric columns; only `success` is numeric here,
# so its mean is the overall success rate.
df.describe()

Unnamed: 0,success
count,700.0
mean,0.802857
std,0.398126
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [48]:
# Per-column count of missing values.
df.isna().sum()

treatment     0
stone_size    0
success       0
dtype: int64

In [49]:
# Feature matrix. Columns are selected by name, in a fixed order, because the
# ColumnTransformers further down address them by position
# (0 = treatment, 1 = stone_size). A positional slice would silently pick the
# wrong columns if the CSV layout ever changed.
x = df[['treatment', 'stone_size']]

In [50]:
# Binary target column (1 = success, 0 = failure).
y = df.loc[:, 'success']

In [51]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [52]:
# Step 1: one-hot encode `treatment` (input column 0), dropping the first
# category so a single indicator column remains. The untouched column is
# passed through and placed after the encoded output.
#
# Dense output is requested on the ColumnTransformer (`sparse_threshold=0`)
# rather than on the encoder. OneHotEncoder's `sparse=` keyword was renamed to
# `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so either spelling
# fails on some version, while `sparse_threshold=0` always yields a dense array.
trf1 = ColumnTransformer(
    transformers=[
        ('treatment_ohe', OneHotEncoder(drop='first', handle_unknown='ignore'), [0]),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

In [53]:
# Step 2: ordinal-encode `stone_size` with the explicit order small < large.
# Index 1 refers to the column position in the array handed to this step,
# not to the original DataFrame column.
trf2 = ColumnTransformer(
    [('stone_size_oe', OrdinalEncoder(categories=[['small','large']]), [1])],
    remainder='passthrough',
)

In [54]:
# Step 3: keep the k best features by chi-squared score.
# NOTE(review): the preceding steps appear to produce exactly two columns, so
# k=2 would keep everything -- confirm whether feature selection is intended.
trf3 = SelectKBest(chi2, k=2)

In [55]:
# Step 4: final estimator -- logistic regression with default hyperparameters.
trf4 = LogisticRegression()

In [58]:
# Chain preprocessing, feature selection and the classifier into a single
# estimator so the same transformations run at fit and predict time.
pipe = Pipeline(steps=[
    ('trf1', trf1),  # one-hot encode treatment
    ('trf2', trf2),  # ordinal-encode stone size
    ('trf3', trf3),  # chi-squared feature selection
    ('trf4', trf4),  # logistic regression
])

In [63]:
# Render estimators as HTML diagrams when they are displayed in the notebook.
# `set_config` is imported in the notebook's top import cell.
set_config(display='diagram')

In [64]:
# Fit every pipeline step on the training split only, so no information from
# the held-out rows reaches the encoders or the model.
pipe.fit(x_train,y_train)

In [65]:
# Predict class labels for the held-out rows using the fitted pipeline.
y_pred = pipe.predict(x_test)

In [75]:
# Display the predictions as a table for a quick visual check.
pd.DataFrame(y_pred)

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
135,1
136,1
137,1
138,1


In [76]:
# Accuracy on the held-out test split.
# `accuracy_score` is imported in the notebook's top import cell.
# NOTE(review): the predictions displayed earlier all appear to be 1; compare
# this figure with the majority-class rate of y_test before reading it as
# evidence that the model has learned anything.
accuracy_score(y_test, y_pred)

0.7571428571428571

**Cross-validation**

In [79]:
# Mean accuracy over 5-fold cross-validation on the training split.
# `cross_val_score` is imported in the notebook's top import cell.
# Drop `.mean()` to inspect the individual fold scores.
cross_val_score(pipe, x_train, y_train, cv=5, scoring='accuracy').mean()

0.8142857142857143

In [83]:
# Persist the fitted pipeline to disk. The context manager guarantees the file
# is flushed and closed even if serialisation raises.
# `pickle` is imported in the notebook's top import cell.
# NOTE: only unpickle this file from a trusted source -- loading a pickle can
# execute arbitrary code.
with open('ksd.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)