In [1]:
import pandas as pd

# Issues with mixed type data

In [21]:
df = pd.read_csv(
    "https://www.openml.org/data/get_csv/16826755/phpMYEkMl.csv",
    na_values='?'
)
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [22]:
X_df = df.drop(columns='survived')
y = df['survived']

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_df, y, random_state=42
)

In [24]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'Rekic, Mr. Tido'

# Working only with numerical data

## Pandas preprocessing

In [25]:
num_cols = ['age', 'pclass', 'parch', 'fare']

X_train_num = X_train[num_cols]

In [26]:
model.fit(X_train_num, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [27]:
X_train_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 1139 to 1126
Data columns (total 4 columns):
age       784 non-null float64
pclass    981 non-null int64
parch     981 non-null int64
fare      980 non-null float64
dtypes: float64(2), int64(2)
memory usage: 38.3 KB


In [28]:
X_train_num_imputed = X_train_num.fillna(X_train_num.mean())
X_train_num_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 1139 to 1126
Data columns (total 4 columns):
age       981 non-null float64
pclass    981 non-null int64
parch     981 non-null int64
fare      981 non-null float64
dtypes: float64(2), int64(2)
memory usage: 38.3 KB


In [29]:
model.fit(X_train_num_imputed, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [30]:
X_test_num = X_test[num_cols]
X_test_num_imputed = X_test_num.fillna(X_train_num.mean())
X_test_num_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 328 entries, 1148 to 533
Data columns (total 4 columns):
age       328 non-null float64
pclass    328 non-null int64
parch     328 non-null int64
fare      328 non-null float64
dtypes: float64(2), int64(2)
memory usage: 12.8 KB


In [32]:
model.score(X_test_num_imputed, y_test)

0.6676829268292683

## Make it less error prone using scikit-learn

In [34]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

numerical_preprocessing = make_column_transformer(
    (SimpleImputer(strategy='mean'), num_cols)
)
model = make_pipeline(numerical_preprocessing, RandomForestClassifier())
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['age', 'pclass', 'parch',
                                                   'fare'])],
                                   verbose=False)),
                

In [35]:
model.score(X_test, y_test)

0.6737804878048781

# Working only with categorical data

In [36]:
X_train.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1139,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S,,,
678,3,"Boulos, Master. Akar",male,6.0,1,1,2678,15.2458,,C,,,"Syria Kent, ON"
290,1,"Taussig, Mr. Emil",male,52.0,1,1,110413,79.65,E67,S,,,"New York, NY"
285,1,"Straus, Mr. Isidor",male,67.0,1,0,PC 17483,221.7792,C55 C57,S,,96.0,"New York, NY"
1157,3,"Rosblom, Mr. Viktor Richard",male,18.0,1,1,370129,20.2125,,S,,,


In [43]:
cat_col = ['sex', 'embarked', 'pclass']

In [44]:
X_train_cat = X_train[cat_col]

In [45]:
X_train_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 1139 to 1126
Data columns (total 3 columns):
sex         981 non-null object
embarked    980 non-null object
pclass      981 non-null int64
dtypes: int64(1), object(2)
memory usage: 30.7+ KB


In [46]:
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessing = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='constant', fill_value='missing'),
                   OrdinalEncoder()),
     cat_col)
)
model = make_pipeline(categorical_preprocessing, RandomForestClassifier())
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='missing',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                      

In [47]:
model.score(X_test, y_test)

0.7713414634146342

# Combining both categorical and numerical data in the pipeline

In [48]:
preprocessing = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='constant', fill_value='missing'),
                   OrdinalEncoder()),
     cat_col),
    (SimpleImputer(strategy='mean'), num_cols)
)

model = make_pipeline(preprocessing, RandomForestClassifier())
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='missing',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                      

In [50]:
model.score(X_test, y_test)

0.7957317073170732