# Chapter 05
## Handling categorical data

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

### 5.1 Encoding nominal categorical features  

In [None]:
# Nominal (unordered) feature: one single-label row per observation.
# Every row uses the same container type (a list). The original mixed a
# set literal into the list of lists, which only worked because the
# binarizer accepts any iterable per row.
feature = [
    ['Texas'],
    ['California'],
    ['Texas'],
    ['Delaware'],
    ['Texas'],
]

# Encoder that produces one indicator column per distinct label.
one_hot = MultiLabelBinarizer()

# Learn the label set and encode the feature; as the last expression of the
# cell, the encoded matrix is what the notebook displays.
one_hot.fit_transform(feature)

In [None]:
# Inspect the labels stored on the binarizer that was fitted on `feature`.
one_hot.classes_

In [None]:
# Round trip: encode `feature` with the fitted binarizer, then decode the
# encoded matrix back into labels.
one_hot.inverse_transform(one_hot.transform(feature))

In [None]:
# One-hot encoding with pandas: wrap `feature` in a single-column DataFrame
# named 'State' and let get_dummies create the indicator columns.
pd.get_dummies(pd.DataFrame(feature, columns=['State']))

In [None]:
# Observations that each carry two labels at once.
multiclass_feature = [
    ('Texas', 'Florida'),
    ('California', 'Alabama'),
    ('Texas', 'Florida'),
    ('Delaware', 'Florida'),
    ('Texas', 'Alabama'),
]

# Learn the label set first, keeping the fitted encoder for later cells ...
one_hot_multiclass = MultiLabelBinarizer().fit(multiclass_feature)

# ... then encode the same observations (displayed as the cell output).
one_hot_multiclass.transform(multiclass_feature)

In [None]:
# Inspect the labels stored on the binarizer fitted on `multiclass_feature`.
one_hot_multiclass.classes_

### 5.2 Encoding ordinal categorical features

In [None]:
# Feature whose categories have a natural order: Low < Medium < High.
dataframe = pd.DataFrame({'Score': ['Low', 'Low', 'Medium', 'Medium', 'High']})

# Rank assigned to each category label.
scale_mapper = {
    'Low': 1,
    'Medium': 2,
    'High': 3,
}

# Series.map performs an explicit per-value lookup and yields an integer
# Series directly. Series.replace with a str -> int mapping on an object
# column depends on implicit downcasting, which pandas 2.2 deprecates.
# NOTE: with map, a label missing from scale_mapper becomes NaN instead of
# being left unchanged; every label in this column is covered.
ordinal_scores = dataframe['Score'].map(scale_mapper)

# Display the encoded column as the cell output.
ordinal_scores

### 5.3 Encoding dictionaries of features  

In [None]:
# One dictionary per observation; a key that is absent from a dictionary
# simply has no entry for that observation.
data_dict = [
    {'Red': 2, 'Blue': 4},
    {'Red': 4, 'Blue': 3},
    {'Red': 1, 'Yellow': 2},
    {'Red': 2, 'Yellow': 2},
]

# sparse=False requests a dense array instead of a sparse matrix.
dict_vectorizer = DictVectorizer(sparse=False)

# Learn the feature names, then encode the same dictionaries.
features = dict_vectorizer.fit(data_dict).transform(data_dict)

# Display the feature matrix.
features

In [None]:
# Ask the fitted vectorizer for the feature names it produced, keep them for
# the next cell, and display them.
features_names = dict_vectorizer.get_feature_names_out()
features_names

In [None]:
# Present the encoded matrix as a DataFrame whose columns are labelled with
# the vectorizer's feature names.
pd.DataFrame(features, columns=features_names)

In [None]:
# Word counts per document, collected in document order.
doc_word_counts = [
    {"Red": 2, "Blue": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 2, "Yellow": 2},
]

# Individual names for each document's counts (the same dict objects that
# are held in the list above).
(
    doc_1_word_count,
    doc_2_word_count,
    doc_3_word_count,
    doc_4_word_count,
) = doc_word_counts

# Re-fit the existing vectorizer on the word counts and display the result.
dict_vectorizer.fit_transform(doc_word_counts)


### 5.4 Imputing missing class values

In [None]:
# Fully observed rows: column 0 holds the class, the remaining columns the
# numeric features.
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Rows whose class (column 0) is missing.
X_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Fit a 3-nearest-neighbours classifier with distance weighting:
# features (columns 1 onward) -> class (column 0).
model = KNeighborsClassifier(n_neighbors=3, weights='distance')
trained_model = model.fit(X[:, 1:], X[:, 0])

# Predict the class of the rows where it is missing.
imputed_values = trained_model.predict(X_with_nan[:, 1:])

# Put the predicted class back in front of the corresponding features.
X_imputed = np.column_stack((imputed_values, X_with_nan[:, 1:]))

# Stack the imputed rows on top of the fully observed rows.
np.concatenate((X_imputed, X), axis=0)


In [None]:
# Alternative: fill the missing entries with each column's most frequent
# value.
# Rows with the missing class go first, followed by the complete rows.
X_complete = np.concatenate((X_with_nan, X), axis=0)

imputer = SimpleImputer(strategy='most_frequent')

# Fit on the stacked matrix and display the imputed result.
imputer.fit_transform(X_complete)

### 5.5 Handling imbalanced classes

In [None]:
# Load the iris dataset.
iris = load_iris()

# Discard the first 40 observations from both the features and the target.
features = iris.data[40:, :]
target = iris.target[40:]

# Collapse the target to two classes: 0 stays 0, every other class becomes 1.
target = np.where(target == 0, 0, 1)

# Display the resulting target vector.
target

In [None]:
# Explicit per-class weights: class 0 -> 0.9, class 1 -> 0.1.
weights = {
    0: 0.9,
    1: 0.1,
}

# Random forest configured with those class weights (constructed, not fitted).
RandomForestClassifier(class_weight=weights)

In [None]:
# Same classifier, but with class_weight='balanced' instead of an explicit
# weight dictionary (constructed only, not fitted).
RandomForestClassifier(class_weight='balanced')

In [None]:
# Downsampling: shrink class 1 to the size of class 0.

# Indices of each class' observations
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

# Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# For every observation of class 0, randomly sample from class 1 without
# replacement. replace=False requires n_class0 <= n_class1.
i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)

# Keep the joined target vector and feature matrix in named variables.
# Previously the target vector was computed and immediately discarded, since
# only the last expression of a cell is displayed.
target_downsampled = np.hstack((target[i_class0], target[i_class1_downsampled]))
features_downsampled = np.vstack(
    (features[i_class0, :], features[i_class1_downsampled, :])
)

# Display the first five rows of the downsampled feature matrix.
features_downsampled[0:5]


In [None]:
# Upsampling: grow class 0 to the size of class 1.
# Uses i_class0, i_class1 and n_class1 from the downsampling cell.

# For every observation in class 1, randomly sample from class 0 with replacement
i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)

# Join together class 0's upsampled target vector with class 1's target vector.
# Kept in a variable: previously this result was computed and discarded,
# since only the last expression of a cell is displayed.
target_upsampled = np.concatenate((target[i_class0_upsampled], target[i_class1]))

# Join together class 0's upsampled feature matrix with class 1's feature matrix
features_upsampled = np.vstack((features[i_class0_upsampled, :], features[i_class1, :]))

# Display the first five rows of the upsampled feature matrix.
features_upsampled[0:5]
