In [None]:
import pandas as pd

# Issues with mixed-type data

In [None]:
# Download the CSV from OpenML and treat '?' cells as missing values (NaN).
csv_url = "https://www.openml.org/data/get_csv/16826755/phpMYEkMl.csv"
df = pd.read_csv(csv_url, na_values='?')

# Preview the first rows.
df.head()

In [None]:
# Separate the prediction target from the remaining feature columns.
y = df['survived']
X_df = df.drop(columns='survived')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation; the fixed seed keeps the split
# identical from one run to the next.
split_parts = train_test_split(X_df, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so every later fit/score of this forest is reproducible.
model = RandomForestClassifier(random_state=42)

# Fitting on the raw feature frame is the failure this section sets out to
# show (the frame is expected to hold non-numeric columns). Catch the error
# and display it so that "Restart & Run All" can continue past this cell.
try:
    model.fit(X_train, y_train)
except (TypeError, ValueError) as exc:
    print("Fitting on the raw mixed-type data failed:", exc)

# Working only with numerical data

## Pandas preprocessing

In [None]:
# Columns handled as plain numbers in this section.
num_cols = [
    'age',
    'pclass',
    'parch',
    'fare',
]

# Keep only those columns of the training features.
X_train_num = X_train[num_cols]

In [None]:
# The selected columns may still contain missing values, and depending on the
# installed scikit-learn version the forest can reject them. Report such a
# failure instead of stopping the notebook; the following cells inspect and
# impute the missing entries.
try:
    model.fit(X_train_num, y_train)
except ValueError as exc:
    print("Fitting on the un-imputed numerical columns failed:", exc)

In [None]:
# Show dtypes and non-null counts to spot columns with missing values.
X_train_num.info()

In [None]:
# Replace each missing entry with the mean of its column, computed on the
# training data, then re-check the non-null counts.
train_col_means = X_train_num.mean()
X_train_num_imputed = X_train_num.fillna(train_col_means)
X_train_num_imputed.info()

In [None]:
# Refit the forest on the imputed numerical training columns.
model.fit(X_train_num_imputed, y_train)

In [None]:
# Apply the same preprocessing to the test features. Missing values are filled
# with the TRAINING column means, so no statistic is computed from the test set.
fill_values = X_train_num.mean()
X_test_num = X_test[num_cols]
X_test_num_imputed = X_test_num.fillna(fill_values)
X_test_num_imputed.info()

In [None]:
# Mean accuracy of the refitted forest on the imputed test columns.
model.score(X_test_num_imputed, y_test)

## Make it less error-prone using scikit-learn

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

# Mean-impute the numerical columns. Columns not listed here are dropped by
# the column transformer's default `remainder='drop'`.
numerical_preprocessing = make_column_transformer(
    (SimpleImputer(strategy='mean'), num_cols)
)

# Bundling preprocessing and classifier means the imputation statistics are
# learned from the data passed to `fit` and reused at predict/score time.
# The seed makes the forest, and therefore the reported score, reproducible.
model = make_pipeline(
    numerical_preprocessing,
    RandomForestClassifier(random_state=42),
)
model.fit(X_train, y_train)

In [None]:
# Score on the raw test features; the pipeline applies its own preprocessing.
model.score(X_test, y_test)

# Working only with categorical data

In [None]:
# Preview the training features before picking the categorical columns.
X_train.head()

In [None]:
# Columns treated as categories in this section.
# Note that 'pclass' is also listed in `num_cols`.
cat_col = ['sex', 'embarked', 'pclass']

In [None]:
# Restrict the training features to the categorical columns.
X_train_cat = X_train[cat_col]

In [None]:
# Show dtypes and non-null counts of the categorical columns.
X_train_cat.info()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Per categorical column: fill missing entries with the literal 'missing'
# category, then map each category to an integer code.
# NOTE(review): with its default settings OrdinalEncoder raises at transform
# time on a category that was not seen during fit; confirm that is acceptable
# for this train/test split.
categorical_preprocessing = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='constant', fill_value='missing'),
                   OrdinalEncoder()),
     cat_col)
)

# Seed the forest so the reported score is reproducible across runs.
model = make_pipeline(
    categorical_preprocessing,
    RandomForestClassifier(random_state=42),
)
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the categorical-only pipeline on the raw test features.
model.score(X_test, y_test)

# Combining both categorical and numerical data in the pipeline

In [None]:
# 'pclass' appears in both `cat_col` and `num_cols`. Route it through the
# categorical branch only, so the forest does not receive the same feature
# twice (once ordinal-encoded and once as a raw number).
num_only_cols = [col for col in num_cols if col not in cat_col]

preprocessing = make_column_transformer(
    # Categorical branch: constant-fill missing entries, then integer-encode.
    (make_pipeline(SimpleImputer(strategy='constant', fill_value='missing'),
                   OrdinalEncoder()),
     cat_col),
    # Numerical branch: fill missing entries with the column mean.
    (SimpleImputer(strategy='mean'), num_only_cols)
)

# Seed the forest so the reported score is reproducible across runs.
model = make_pipeline(preprocessing, RandomForestClassifier(random_state=42))
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the combined categorical + numerical pipeline on the test set.
model.score(X_test, y_test)