### Feature engineering - CategoryEncoding - OneHotEncoder
OneHotEncoder with sklearn on the Titanic dataset 

***
#### Environment
`conda activate sklearn-env`

***
#### Goals
- Replace categorical and discrete columns with one-hot (indicator) columns corresponding to each distinct value in the dataset
- Note that `name` is not treated as a categorical value

***
#### References
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html


#### Basic python imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns

# Compact numpy output: 3 decimal places, fixed-point instead of scientific notation.
np.set_printoptions(suppress=True, precision=3)

#### Dataset load using sklearn API from https://www.openml.org site

https://www.openml.org/d/40945

In [None]:
from sklearn.datasets import fetch_openml

# Download the Titanic passenger table from OpenML (https://www.openml.org/d/40945).
# `raw_dataset` keeps the frame exactly as fetched; `dataset` is an independent
# copy that the following cells are free to modify.
titanic = fetch_openml(name="titanic", version=1, as_frame=True)
raw_dataset = titanic.frame
dataset = raw_dataset.copy()
dataset.head(10)

In [None]:
# Remove the three columns this notebook does not use, discard every row that
# still contains a missing value, and renumber the remaining rows from 0 so the
# index is contiguous.
dataset = (
    dataset
    .drop(columns=['boat', 'body', 'home.dest'])
    .dropna()
    .reset_index(drop=True)
)

### Verify categorical / discrete fields in dataset

Notice:
- `pclass` is a discrete field with 3 distinct values.
- `sex` and `embarked` fields - categorical

In [None]:
# Print a per-column summary (non-null counts and dtypes) to spot the
# categorical / discrete fields.
dataset.info()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the distinct values of each column that will be one-hot encoded, then
# show them: `categories_` holds one array of values per input column.
encoder = OneHotEncoder()
encoder.fit(dataset[['pclass', 'sex', 'embarked']])
encoder.categories_

In [None]:
# Encode the three columns with the fitted encoder. The result is a sparse
# matrix (it is converted to a dense table in the next cell).
ohe_f = encoder.transform(dataset[['pclass', 'sex', 'embarked']])

In [None]:
# Turn the sparse one-hot output into a labelled DataFrame.
# - `toarray()` yields a plain ndarray (`todense()` returns the legacy
#   `np.matrix` type).
# - Reusing `dataset.index` guarantees the encoded rows line up with `dataset`
#   in the concat below, whatever index `dataset` currently has.
# - The labels assume the category order pclass 1/2/3, sex female/male,
#   embarked C/Q/S -- check them against `encoder.categories_`.
ohe_dataset = pd.DataFrame(
    ohe_f.toarray(),
    index=dataset.index,
    columns=['pclass_1', 'pclass_2', 'pclass_3', 'sex_F', 'sex_M',
             'embarked_C', 'embarked_Q', 'embarked_S'],
)

# Append the indicator columns to the original ones. concat aligns rows on the
# index, so frames with different indexes would produce NaNs here.
new_dataset = pd.concat([dataset, ohe_dataset], axis=1)

new_dataset.sample(7)

#### Predict and compare results

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE: this function reuses the name `raw_dataset`, which the data-loading cell
# bound to the fetched DataFrame; after this cell runs the name refers to the
# function.
def raw_dataset(df):
    """Return a copy of ``df`` restricted to the un-encoded model columns.

    The result contains ``age``, ``sibsp``, ``parch``, ``fare`` and the
    ``survived`` label, in that order. ``df`` itself is not modified.
    """
    selected = ['age', 'sibsp', 'parch', 'fare', 'survived']
    return df.loc[:, selected].copy()
    
def prepared_dataset(df):
    """Return the fully prepared variant of the one-hot encoded frame.

    Works on a copy of ``df`` and:

    * removes ``name``, ``fare``, ``ticket``, ``cabin`` and the original
      ``pclass``, ``sex`` and ``embarked`` columns;
    * removes one indicator column per encoded field (``pclass_3``,
      ``sex_M``, ``embarked_S``) -- presumably because each is implied by
      the remaining indicators of its field;
    * rescales ``age`` with a freshly fitted ``MinMaxScaler``.

    NOTE(review): the scaler is fitted on every row of ``df``; the notebook
    splits into train and test only after this function has run.
    """
    removed = [
        # source columns that are not fed to the model
        'name', 'fare', 'ticket', 'cabin', 'pclass', 'sex', 'embarked',
        # one indicator per encoded field
        'pclass_3', 'sex_M', 'embarked_S',
    ]
    prepared = df.copy().drop(columns=removed)
    prepared[['age']] = MinMaxScaler().fit_transform(prepared[['age']])
    return prepared

def converge_error_dataset(df):
    """Return a copy of ``df`` without ``name``, ``ticket``, ``sex``, ``cabin`` and ``embarked``.

    All other columns are kept exactly as they are (nothing is rescaled).
    The notebook uses this variant to show the convergence errors reported
    when the model is fitted on semi-prepared data.
    """
    excluded = ['name', 'ticket', 'sex', 'cabin', 'embarked']
    return df.copy().drop(columns=excluded)
    
# Pick the variant of the data that feeds the model: keep exactly one of the
# three assignments below active.

# 1) the fully prepared data set (default)
dataset = prepared_dataset(new_dataset)

# 2) the semi-prepared data set; fitting on it will report convergence errors
#dataset = converge_error_dataset(new_dataset)

# 3) the raw columns only
# NOTE: `dataset` is reassigned by this cell, so once option 1 has run it no
# longer contains `fare`; passing `new_dataset` instead keeps this option re-runnable.
#dataset = raw_dataset(dataset)

# Reproducible 80/20 split: sample the training rows, the remainder is the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)

# Separate the `survived` label from the features. The splits themselves are
# left intact because `pop` is applied to copies.
train_features = train_dataset.copy()
train_labels = train_features.pop('survived')

test_features = test_dataset.copy()
test_labels = test_features.pop('survived')

dataset.sample(5)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the training split.
logistic_regressor = LogisticRegression()
logistic_regressor.fit(train_features, train_labels)

# Class predictions and class probabilities for the held-out rows.
scored_test = logistic_regressor.predict(test_features)
scored_test_proba = logistic_regressor.predict_proba(test_features)

# Store the predictions next to the true labels for the comparison cells below.
test_dataset['predicted'] = scored_test

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label equals the true `survived` label.
acc = accuracy_score(
    y_true=test_dataset['survived'],
    y_pred=test_dataset['predicted'],
)
print("Accuracy of the model is %f" % acc)

In [None]:
# Show true and predicted labels side by side for the first 20 test rows.
test_dataset[['survived', 'predicted']].head(20)