In [2]:
!git clone https://github.com/denisvivdenko/titanic_practice_dataset_V2.git

Cloning into 'titanic_practice_dataset_V2'...
remote: Enumerating objects: 88, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 88 (delta 37), reused 65 (delta 19), pack-reused 0[K
Unpacking objects: 100% (88/88), done.


In [3]:
!cd titanic_practice_dataset_V2/

/content/titanic_practice_dataset_V2


In [97]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [4]:
df = pd.read_csv("src/datasets/train.csv")

In [39]:
df.shape

(891, 12)

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [40]:
target = df.Survived

In [41]:
age_sex = df[["Sex", "Age"]]
age_sex.head()

Unnamed: 0,Sex,Age
0,male,22.0
1,female,38.0
2,female,26.0
3,female,35.0
4,male,35.0


In [42]:
age_sex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Sex     891 non-null    object 
 1   Age     714 non-null    float64
dtypes: float64(1), object(1)
memory usage: 14.0+ KB


In [43]:
@FunctionTransformer
def log_transform(feature):
  min_value = np.min(feature)
  if min_value < 1:
    feature += abs(min_value + 1)
  return np.log(feature)

log_transform.fit_transform(age_sex["Age"])

0      3.153590
1      3.674273
2      3.311273
3      3.595118
4      3.595118
         ...   
886    3.347093
887    3.016515
888         NaN
889    3.311273
890    3.509155
Name: Age, Length: 891, dtype: float64

In [76]:
age_pipeline = Pipeline([
  ("missing_values", SimpleImputer()),
  ("log_transform", log_transform)
])

In [77]:
age_pipeline.fit_transform([age_sex["Age"]])[0][:10]

array([3.15359036, 3.6742733 , 3.31127267, 3.59511807, 3.59511807,
       4.01494054, 1.22964055, 3.34709312, 2.73566537, 1.69009582])

In [78]:
pipeline = ColumnTransformer([
  ("sex", OneHotEncoder(drop="first"), ["Sex"]),
  ("age", age_pipeline, ["Age"])
])

In [79]:
X_train = pipeline.fit_transform(age_sex)
X_train

array([[1.        , 3.15359036],
       [0.        , 3.6742733 ],
       [0.        , 3.31127267],
       ...,
       [0.        , 3.43782235],
       [1.        , 3.31127267],
       [1.        , 3.50915452]])

In [92]:
clf_dt = DecisionTreeClassifier()
cross_val_score(clf_dt, X_train, target, scoring="precision").mean()

0.7404661042459925

In [93]:
clf_logreg = LogisticRegression()
cross_val_score(clf_logreg, X_train, target, scoring="precision").mean()

0.7413210457575212

In [98]:
clf_svc = SVC()
cross_val_score(clf_svc, X_train, target, scoring="precision").mean()

0.7401196286335807