### Feature engineering - CategoryEncoding - Indexers
CategoryEncoding with sklearn on the Titanic dataset 

*** 
#### Environment
`conda activate sklearn-env`

***
#### Goals
- Replace categorical and discrete columns with ordinal codes corresponding to each distinct value from the dataset

***
#### References
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html

#### Basic python imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns

# Make numpy printouts easier to read: 3 digits after the decimal point and
# no scientific notation for small values.
np.set_printoptions(precision=3, suppress=True)

#### Dataset load using sklearn API from https://www.openml.org site

https://www.openml.org/d/40945

In [None]:
from sklearn.datasets import fetch_openml

# Load data from https://www.openml.org/d/40945
# NOTE(review): the name `raw_dataset` is rebound to a function further down in
# this file, so the downloaded frame is no longer reachable under this name
# once that cell has run.
raw_dataset = fetch_openml("titanic", version=1, as_frame=True).frame
# Work on a copy so the downloaded frame itself is never modified.
dataset = raw_dataset.copy()
# Preview the first 10 rows.
dataset.head(10)

In [None]:
# Discard the columns this notebook does not use, then keep only the rows
# that have a value in every remaining column.
# NOTE(review): dropna() also looks at `cabin`, which prepared_dataset()
# throws away later; if `cabin` is sparsely populated this discards many
# otherwise usable rows - confirm this is intended.
dataset = dataset.drop(columns=['boat', 'body', 'home.dest'])
dataset = dataset.dropna().copy()

### Verify categorical/discrete fields in dataset

Notice:
- `pclass` is a discrete field with 3 distinct values.
- `sex` and `embarked` are categorical fields.

In [None]:
# Summarize column dtypes and non-null counts to spot the categorical/discrete fields.
dataset.info()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Learn the distinct values of each categorical/discrete column; a value's
# position within its learned category list becomes its ordinal code.
encoder = OrdinalEncoder()
encoder.fit(dataset[['pclass', 'sex', 'embarked']])

# Show the learned categories, one array per encoded column.
encoder.categories_

In [None]:
# Encode on a copy so that `dataset` keeps its original category values.
new_dataset = dataset.copy()
# Overwrite the three columns with the ordinal codes produced by the fitted encoder.
new_dataset[['pclass', 'sex', 'embarked']] = encoder.transform(new_dataset[['pclass', 'sex', 'embarked']])

### Notice changed column type for 'pclass', 'sex', 'embarked'

In [None]:
# Preview the first 10 rows of the encoded frame.
new_dataset.head(10)

#### Predict and compare results

In [None]:
from sklearn.preprocessing import MinMaxScaler

def raw_dataset(df):
    """Return an independent copy of *df* limited to the baseline columns.

    The result contains ``age``, ``sibsp``, ``parch``, ``fare`` and
    ``survived``, in that order. The input frame is not modified.
    """
    selected_columns = ['age', 'sibsp', 'parch', 'fare', 'survived']
    return df.loc[:, selected_columns].copy()
    
def prepared_dataset(df):
    """Return a model-ready copy of *df*.

    The ``name``, ``fare``, ``ticket`` and ``cabin`` columns are removed and
    ``age`` is rescaled with a freshly fitted ``MinMaxScaler``. The input
    frame is left untouched.
    """
    unused_columns = ['name', 'fare', 'ticket', 'cabin']
    # Copy first so the column assignment below can never reach the caller's frame.
    prepared = df.copy().drop(columns=unused_columns)
    scaled_age = MinMaxScaler().fit_transform(prepared[['age']])
    prepared[['age']] = scaled_age
    return prepared

# Use the prepared data set (encoded columns, scaled age) ...
dataset = prepared_dataset(new_dataset)

# ... or use the raw dataset
#dataset = raw_dataset(dataset)

# NOTE(review): prepared_dataset() fits its scaler on all rows before the
# split below, so test rows influence the scaling of the training data.

# Reproducible 80/20 split: sample the training rows with a fixed seed and
# use every remaining row for testing.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)

# Separate the target column from the model inputs.
train_labels = train_dataset['survived'].copy()
test_labels = test_dataset['survived'].copy()
train_features = train_dataset.drop(columns=['survived'])
test_features = test_dataset.drop(columns=['survived'])

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split; max_iter is
# set to 1000 to give the solver more room to converge.
logistic_regressor = LogisticRegression(max_iter=1000).fit(train_features, train_labels)

# Hard class predictions and per-class probabilities for the held-out rows.
scored_test = logistic_regressor.predict(test_features)
scored_test_proba = logistic_regressor.predict_proba(test_features)
# Keep the predictions next to the true labels for the comparison cells below.
test_dataset['predicted'] = scored_test

# n_iter_ is an array, not a scalar. Formatting the whole array with %d relies
# on implicit array-to-int conversion, which NumPy has deprecated, so report
# the largest iteration count explicitly.
print("Training converged in %d iterations" % logistic_regressor.n_iter_.max())

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label.
acc = accuracy_score(
    y_true=test_dataset['survived'],
    y_pred=test_dataset['predicted'],
)
print("Accuracy of the model is %f" % acc)

In [None]:
# Show true and predicted labels side by side for the first 20 test rows.
test_dataset[['survived', 'predicted']].head(20)