In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [4]:
df = pd.read_csv('train.csv',usecols = ['Age','Fare','Survived'])


In [5]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [6]:
df.dropna(inplace = True)

In [7]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [8]:
df.shape

(714, 3)

In [9]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [10]:
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state=42)

In [11]:
X_train.head()

Unnamed: 0,Age,Fare
328,31.0,20.525
73,26.0,14.4542
253,30.0,16.1
719,33.0,7.775
666,25.0,13.0


In [14]:
clf= DecisionTreeClassifier()
clf.fit(X_train,y_train)

In [15]:
y_pred = clf.predict(X_test)

In [16]:
accuracy_score(y_test, y_pred)

0.6363636363636364

In [17]:
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))


0.6303012519561815

In [18]:
kbin_age = KBinsDiscretizer(n_bins=15,encode='ordinal',strategy='quantile')
kbin_fare = KBinsDiscretizer(n_bins=15,encode='ordinal',strategy='quantile')

In [19]:
trf = ColumnTransformer([
    ('first',kbin_age,[0]),
    ('second',kbin_fare,[1])
])

In [21]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.fit_transform(X_test)

In [22]:
trf.named_transformers_['first'].bin_edges_


array([array([ 1.        ,  9.        , 15.9       , 17.        , 20.        ,
              21.        , 24.        , 26.        , 28.        , 30.1       ,
              33.66666667, 36.        , 38.6       , 44.06666667, 52.        ,
              62.        ])                                                   ],
      dtype=object)

In [23]:
output = pd.DataFrame({
    'age':X_train['Age'],
    'age_trf':X_train_trf[:,0],
    'fare':X_train['Fare'],
    'fare_trf':X_train_trf[:,1]
})

In [24]:
output['age_labels'] = pd.cut(x=X_train['Age'],
                                    bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x=X_train['Fare'],
                                    bins=trf.named_transformers_['second'].bin_edges_[0].tolist())

In [25]:
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
153,40.5,11.0,14.5,7.0,"(38.6, 44.067]","(13.0, 17.693]"
15,55.0,14.0,16.0,7.0,"(52.0, 62.0]","(13.0, 17.693]"
155,51.0,13.0,61.3792,12.0,"(44.067, 52.0]","(53.227, 79.277]"
291,19.0,3.0,91.0792,13.0,"(17.0, 20.0]","(79.277, 512.329]"
857,51.0,13.0,26.55,10.0,"(44.067, 52.0]","(26.353, 31.275]"


In [26]:
clf = DecisionTreeClassifier()
clf.fit(X_train_trf,y_train)
y_pred2 = clf.predict(X_test_trf)

In [27]:

accuracy_score(y_test,y_pred2)

0.6503496503496503