<a href="https://colab.research.google.com/github/chandan-nagaraju/python/blob/main/handling_categorical_data_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sets of categories with no intrinsic ordering are called *nominal*.


Categorical information is often represented in data as a vector or column
of strings (e.g., "Maine", "Texas", "Delaware"). The problem is that most
machine learning algorithms require inputs to be numerical values.


## 1. Encoding Nominal Categorical Features

In [1]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [2]:
# Toy nominal feature: a single column of state names, one row per observation.
feature = np.array(
    ["Texas", "California", "Texas", "Delaware", "Texas"]
).reshape(-1, 1)

In [3]:
# Binarizer used below to one-hot encode the nominal feature.
one_hot = LabelBinarizer()

In [4]:
# Fit the binarizer on `feature` and display the resulting one-hot matrix.
one_hot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [5]:
# View the feature classes (one per one-hot column)
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [6]:
# Reverse the one-hot encoding to recover the original labels
one_hot.inverse_transform(one_hot.transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

We can even use pandas to one-hot encode the feature

In [7]:
import pandas as pd

In [9]:
# One-hot encode the single feature column with pandas.
# The dtype is pinned because pandas >= 2.0 returns boolean indicator columns
# by default, while numeric 0/1 indicators are what the rest of this notebook
# (and most estimators) expect.
pd.get_dummies(feature[:, 0], dtype="uint8")

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


## For multiclass features

In [10]:
# Each observation carries several classes at once, so every row is a tuple
# of labels rather than a single label.
multiclass_feature = [
    ("Texas", "Florida"),
    ("California", "Alabama"),
    ("Texas", "Florida"),
    ("Delaware", "Florida"),
    ("Texas", "Alabama"),
]

In [11]:
# Binarizer for observations that belong to several classes at once.
one_hot_multiclass = MultiLabelBinarizer()

In [12]:
# Fit on the multi-label rows and display the binary indicator matrix.
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [14]:
# View the classes, in the same order as the indicator-matrix columns.
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

## one-hot encoding or dummying
The proper strategy is to create a binary feature for each class in the original feature.


In contrast, when a set of categories has some natural ordering, we refer to it as *ordinal*.

## 2. Encoding Ordinal Categorical Features

In [17]:
# Small frame with a single categorical column, "Score".
df = pd.DataFrame({"Score": ["Low"] * 2 + ["Medium"] * 2 + ["High"]})

In [18]:
# Map each level to an integer that preserves its rank.
scale_mapper = dict(Low=1, Medium=2, High=3)

In [19]:
# Encode the ordinal levels as integers.
# `Series.map` is used rather than `Series.replace`: it returns an integer
# column directly and does not rely on the object-dtype downcasting that
# pandas has deprecated in `replace`. Levels absent from `scale_mapper`
# become NaN, which makes unmapped values easy to spot.
df["Score"].map(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [23]:
# Example 2: the same kind of feature with one additional level.
df1 = pd.DataFrame(
    {
        "Score": [
            "Low",
            "Low",
            "Medium",
            "Medium",
            "High",
            "Barely More Than Medium",
        ]
    }
)

In [21]:
# Rank the levels from lowest to highest; ranks start at 1.
scale_mapper = {
    level: rank
    for rank, level in enumerate(
        ["Low", "Medium", "Barely More Than Medium", "High"], start=1
    )
}

In [22]:
# Encode the ordinal levels as integers.
# `Series.map` is used rather than `Series.replace`: it returns an integer
# column directly and does not rely on the object-dtype downcasting that
# pandas has deprecated in `replace`. Levels absent from `scale_mapper`
# become NaN, which makes unmapped values easy to spot.
df1["Score"].map(scale_mapper)

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

## 3. Encoding Dictionaries of Features

In [24]:
from sklearn.feature_extraction import DictVectorizer

In [25]:
# One dictionary per observation; keys are feature names, values are counts.
# A key missing from a dictionary simply means the feature is absent there.
data_dict = [
    dict(Red=2, Blue=4),
    dict(Red=4, Blue=3),
    dict(Red=1, Yellow=2),
    dict(Red=2, Yellow=2),
]

In [26]:
# sparse=False so the vectorizer returns a dense array that can be displayed.
dv = DictVectorizer(sparse=False)

In [27]:
# Turn the list of dictionaries into a feature matrix.
features = dv.fit_transform(data_dict)

In [28]:
# Display the dense feature matrix produced by the DictVectorizer.
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [29]:
# Names of the columns of `features`.
# `get_feature_names()` no longer exists in recent scikit-learn releases, where
# `get_feature_names_out()` replaces it; fall back to the old method on older
# installs. `.tolist()` keeps the result a plain list of Python strings.
if hasattr(dv, "get_feature_names_out"):
    feature_names = dv.get_feature_names_out().tolist()
else:
    feature_names = dv.get_feature_names()

In [30]:
# Display the feature names extracted from the DictVectorizer.
feature_names

['Blue', 'Red', 'Yellow']

In [31]:
# Show the vectorized features as a labelled table.
pd.DataFrame(data=features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


## 4. Handling Imbalanced Classes

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris


In [33]:
# Load the iris data set bundled with scikit-learn.
iris = load_iris()

In [34]:
# Feature matrix of the iris data set. Note that this rebinds `features`,
# which earlier in the notebook held the DictVectorizer output.
features = iris.data

In [35]:
# Class labels of the iris data set.
target = iris.target

In [37]:
# Make the classes imbalanced by dropping the first 40 observations.
# Both arrays are sliced straight from `iris` (not from `feature`, the
# five-row state array of the one-hot example) so that the features stay
# aligned with `target` and re-running this cell always gives the same result.
features = iris.data[40:, :]
target = iris.target[40:]

In [38]:
# Collapse to a binary problem: class 0 stays 0, every other class becomes 1.
target = np.where(target == 0, 0, 1)

In [39]:
# Display the binary target vector.
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [42]:
# Build a random forest with class_weight='balanced'. The estimator is only
# displayed here; it is not assigned to a name or fitted.
RandomForestClassifier(class_weight='balanced')

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [44]:
# Positions of the observations belonging to each class.
i_class0 = np.flatnonzero(target == 0)
i_class1 = np.flatnonzero(target == 1)

In [45]:
# Number of observations in each class.
n_class0, n_class1 = len(i_class0), len(i_class1)

In [50]:
# Display the indices of the class-0 observations.
i_class0



array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [51]:
# Display the indices of the class-1 observations.
i_class1

array([ 10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,
        23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
        36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
        49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
        62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
        75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,
        88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100,
       101, 102, 103, 104, 105, 106, 107, 108, 109])

In [48]:
# Display the number of class-0 observations.
n_class0


10

In [49]:
# Display the number of class-1 observations.
n_class1

100

In [55]:
# Downsample class 1: draw, without replacement, as many class-1 indices as
# there are class-0 observations. A seeded generator keeps the sample
# identical across kernel restarts.
i_class1_downsampled = np.random.default_rng(0).choice(
    i_class1, size=n_class0, replace=False
)


In [56]:
# Display the class-1 indices that were kept after downsampling.
i_class1_downsampled

array([ 14,  72,  28,  10,  15,  64,  62,  27, 101,  70])

In [57]:
# Balanced target vector: every class-0 label followed by the downsampled
# class-1 labels.
np.concatenate([target[i_class0], target[i_class1_downsampled]])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])