In [None]:
# Encoding Nominal Categorical Features
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

feature = np.array([["Texas"],
                    ["California"],
                    ["Texas"],
                    ["Delware"],
                    ["Texas"]])

one_hot = LabelBinarizer()

one_hot.fit_transform(feature)

one_hot.classes_ # to view the classes

one_hot.inverse_transform(one_hot.transform(feature)) # to reverse the encoded classes


import pandas as pd

pd.get_dummies(feature[:,0])

Unnamed: 0,California,Delware,Texas
0,False,False,True
1,True,False,False
2,False,False,True
3,False,True,False
4,False,False,True


In [8]:
multiclass_feature = [("Texas", "Florida"),
                      ("California", "Alabama"),
                      ("Texas", "Florida"),
                      ("Delware", "Florida"),
                      ("Texas", "Alabama")]

one_hot_multiclass = MultiLabelBinarizer()

one_hot_multiclass.fit_transform(multiclass_feature)

one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

In [10]:
# Encoding Ordinal Categories Features
import pandas as pd

dataframe= pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

scale_mapper = {"Low": 1,
                "Medium": 2,
                "High": 3}

dataframe["Score"].replace(scale_mapper)

  dataframe["Score"].replace(scale_mapper)


0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [12]:
dataframe = pd.DataFrame({"Score": ["Low", 
                                    "Low",
                                    "Medium", 
                                    "Medium", 
                                    "High",
                                    "Barely More Than Medium"]})

scale_mapper = {"Low": 1,
                "Medium": 2,
                "Barely More Than Medium": 3,
                "High": 4}

dataframe["Score"].replace(scale_mapper)

  dataframe["Score"].replace(scale_mapper)


0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [14]:
scale_mapper = {"Low": 1,
                "Medium": 2,
                "Barely More Than Medium": 2.1, # Best Approach
                "High": 3}

dataframe["Score"].replace(scale_mapper)

  dataframe["Score"].replace(scale_mapper)


0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

In [23]:
# Encoding Dictionaries of Features
from sklearn.feature_extraction import DictVectorizer

data_dict = [{"Red":2, "Blue":4},
             {"Red":4, "Blue":3},
             {"Red":1, "Yellow":2},
             {"Red":2, "Yellow":2}]

dictvectorizer = DictVectorizer(sparse=False)

features = dictvectorizer.fit_transform(data_dict)

features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [25]:
# Get feature names
feature_names = dictvectorizer.get_feature_names_out()

feature_names

array(['Blue', 'Red', 'Yellow'], dtype=object)

In [27]:
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [28]:
doc_1_word_count = {"Red":2, "Blue":4}
doc_2_word_count = {"Red":4, "Blue":3}
doc_3_word_count = {"Red":1, "Yellow":2}
doc_4_word_count = {"Red":2, "Yellow":2}

doc_word_counts = [doc_1_word_count,
                   doc_2_word_count,
                   doc_3_word_count,
                   doc_4_word_count]

dictvectorizer.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [32]:
# Imputing Missing Class Values
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])

X_with_nan = np.array([[np.nan, 0.87, 1.21],
                       [np.nan, -0.67, -0.22]])

# Train KNN Learner
clf = KNeighborsClassifier(3, weights='distance')
trained_model = clf.fit(X[:,1:], X[:,0])

# Predict class of missing values
imputed_values = trained_model.predict(X_with_nan[:,1:])
imputed_values

# Join predicted values with their other features
X_with_imputed = np.hstack((imputed_values.reshape(-1,1), X_with_nan[:,1:]))

#Join two matrices
np.vstack((X_with_imputed,X))

array([[ 0.  ,  0.87,  1.21],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [34]:
# alternative solution, fill in missing values with most frequent value
from sklearn.impute import SimpleImputer

X_complete = np.vstack((X_with_nan, X))

imputer = SimpleImputer(strategy="most_frequent")

imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.21],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [40]:
# Handling Imbalanced Classes
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

iris = load_iris()

features = iris.data
target = iris.target

# removing first 40 observations (setosa), to make imbalanced classes
features = features[40:,:]
target = target[40:]

target = np.where((target==0), 0, 1)

target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [43]:
weights = {0: 0.9, 1: 0.1}

RandomForestClassifier(class_weight=weights)

RandomForestClassifier(class_weight='balanced')

In [47]:
# Downsampling

# indices of each observation
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

n_class0 = len(i_class0)
n_class1 = len(i_class1)

i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)

# Join indices
np.hstack((target[i_class0], target[i_class1_downsampled]))

# Join matrices
np.vstack((features[i_class0,:], features[i_class1_downsampled]))

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [5.6, 2.7, 4.2, 1.3],
       [6.7, 3. , 5.2, 2.3],
       [6.7, 2.5, 5.8, 1.8],
       [6. , 2.7, 5.1, 1.6],
       [6.7, 3.3, 5.7, 2.5],
       [7.7, 3.8, 6.7, 2.2],
       [6.9, 3.1, 4.9, 1.5],
       [6.5, 3. , 5.8, 2.2],
       [6.9, 3.2, 5.7, 2.3],
       [5.7, 3. , 4.2, 1.2]])

In [53]:
# Upsampling

i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)

np.concatenate((target[i_class0_upsampled], target[i_class1]))

np.vstack((features[i_class0_upsampled,:], features[i_class1,:]))

array([[5. , 3.5, 1.6, 0.6],
       [5. , 3.3, 1.4, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [4.6, 3.2, 1.4, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [4.6, 3.2, 1.4, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 3.5, 1.6, 0.6],
       [4.5, 2.3, 1.3, 0.3],
       [4.6, 3.2, 1.4, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [4.5, 2.3, 1.3, 0.3],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [4.5, 2.3, 1.3, 0.3],
       [5.1, 3