## Categorical features - encoding and distances

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder 
from sklearn.metrics import pairwise_distances

from scipy.spatial.distance import hamming, euclidean, pdist, squareform

## Soyabeen small dataset

A dataset of soybean plant observations, including information on plants infested by one of four diseases.

- Dataset contains 47 instances.
- Each instance represents a single plant.
- Characterized by 35 attributes.
- Attributes are categorical.
- Attributes mostly capture various symptoms like leaf spots, root rot, mold growth, seed damage, etc.

Citation:

- Michalski,R.. (1987). Soybean (Small). UCI Machine Learning Repository. https://doi.org/10.24432/C5DS3P.

First 2 features:

1. **Date**: The time when the soybean sample was collected. May be represented as a date or as the day of the year.
  
2. **Hail**: Indicates whether the plants have been affected by hail, generally a binary "yes" or "no."

3. **Germination**: Describes the rate of germination.

In [3]:
soybean_path = 'data/soybean_data_use.csv'

soy_df = pd.read_csv(soybean_path)

soy_df = soy_df.loc[:10, ['date', 'hail', 'germination']]

soy_df

Unnamed: 0,date,hail,germination
0,august,no,lt-80%
1,september,yes,lt-80%
2,july,yes,80-89%
3,october,yes,90-100%
4,august,yes,lt-80%
5,september,yes,90-100%
6,july,yes,80-89%
7,july,yes,lt-80%
8,october,yes,80-89%
9,october,yes,lt-80%


In [4]:
soy_df['date'].unique()

array(['august', 'september', 'july', 'october'], dtype=object)

In [5]:
soy_df['hail'].unique()

array(['no', ' yes'], dtype=object)

In [6]:
soy_df['germination'].unique()

array(['lt-80%', '80-89%', ' 90-100%'], dtype=object)

### Hamming distance

In [7]:
soy_df.loc[:1, :]

Unnamed: 0,date,hail,germination
0,august,no,lt-80%
1,september,yes,lt-80%


In [11]:
hamming(soy_df.loc[0].to_numpy(), soy_df.loc[1].to_numpy())

np.float64(0.6666666666666666)

In [None]:
# dst = pdist(soy_df.to_numpy(), metric='hamming')
# dst_matrix = squareform(dst)
# pd.DataFrame(dst_matrix)

In [13]:
soy_df

Unnamed: 0,date,hail,germination
0,august,no,lt-80%
1,september,yes,lt-80%
2,july,yes,80-89%
3,october,yes,90-100%
4,august,yes,lt-80%
5,september,yes,90-100%
6,july,yes,80-89%
7,july,yes,lt-80%
8,october,yes,80-89%
9,october,yes,lt-80%


In [14]:
or_encoder = OrdinalEncoder() 
soy_df_enc = or_encoder.fit_transform(soy_df)
soy_df_enc

array([[0., 1., 2.],
       [3., 0., 2.],
       [1., 0., 1.],
       [2., 0., 0.],
       [0., 0., 2.],
       [3., 0., 0.],
       [1., 0., 1.],
       [1., 0., 2.],
       [2., 0., 1.],
       [2., 0., 2.],
       [2., 1., 0.]])

In [15]:
dst = pdist(soy_df_enc, metric='hamming')
dst

array([0.66666667, 1.        , 1.        , 0.33333333, 1.        ,
       1.        , 0.66666667, 1.        , 0.66666667, 0.66666667,
       0.66666667, 0.66666667, 0.33333333, 0.33333333, 0.66666667,
       0.33333333, 0.66666667, 0.33333333, 1.        , 0.66666667,
       0.66666667, 0.66666667, 0.        , 0.33333333, 0.33333333,
       0.66666667, 1.        , 0.66666667, 0.33333333, 0.66666667,
       0.66666667, 0.33333333, 0.33333333, 0.33333333, 0.66666667,
       0.66666667, 0.33333333, 0.66666667, 0.33333333, 1.        ,
       0.66666667, 0.66666667, 0.66666667, 0.66666667, 0.66666667,
       0.33333333, 0.33333333, 0.66666667, 1.        , 0.66666667,
       0.33333333, 1.        , 0.33333333, 0.66666667, 0.66666667])

In [16]:
dst_matrix = squareform(dst)
pd.DataFrame(dst_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.666667,1.0,1.0,0.333333,1.0,1.0,0.666667,1.0,0.666667,0.666667
1,0.666667,0.0,0.666667,0.666667,0.333333,0.333333,0.666667,0.333333,0.666667,0.333333,1.0
2,1.0,0.666667,0.0,0.666667,0.666667,0.666667,0.0,0.333333,0.333333,0.666667,1.0
3,1.0,0.666667,0.666667,0.0,0.666667,0.333333,0.666667,0.666667,0.333333,0.333333,0.333333
4,0.333333,0.333333,0.666667,0.666667,0.0,0.666667,0.666667,0.333333,0.666667,0.333333,1.0
5,1.0,0.333333,0.666667,0.333333,0.666667,0.0,0.666667,0.666667,0.666667,0.666667,0.666667
6,1.0,0.666667,0.0,0.666667,0.666667,0.666667,0.0,0.333333,0.333333,0.666667,1.0
7,0.666667,0.333333,0.333333,0.666667,0.333333,0.666667,0.333333,0.0,0.666667,0.333333,1.0
8,1.0,0.666667,0.333333,0.333333,0.666667,0.666667,0.333333,0.666667,0.0,0.333333,0.666667
9,0.666667,0.333333,0.666667,0.333333,0.333333,0.666667,0.666667,0.333333,0.333333,0.0,0.666667


In [None]:
# Same using sklearn
dst_matrix1 = pairwise_distances(soy_df_enc, metric='hamming')
pd.DataFrame(dst_matrix)

In [None]:
np.array_equal(dst_matrix, dst_matrix1)

### Euclidean distance

In [17]:
oh_encoder = OneHotEncoder(sparse_output=False) 
soy_df_oh_enc = oh_encoder.fit_transform(soy_df)

In [18]:
soy_df.nunique()

date           4
hail           2
germination    3
dtype: int64

In [19]:
soy_df_oh_enc

array([[1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 1., 0., 0.]])

In [20]:
soy_df_oh_enc.shape

(11, 9)

In [21]:
euclidean(soy_df_oh_enc[0,:], soy_df_oh_enc[1,:])

2.0

In [22]:
dst = pdist(soy_df_oh_enc, metric='euclidean')
dst_matrix = squareform(dst)
pd.DataFrame(dst_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,2.0,2.44949,2.44949,1.414214,2.44949,2.44949,2.0,2.44949,2.0,2.0
1,2.0,0.0,2.0,2.0,1.414214,1.414214,2.0,1.414214,2.0,1.414214,2.44949
2,2.44949,2.0,0.0,2.0,2.0,2.0,0.0,1.414214,1.414214,2.0,2.44949
3,2.44949,2.0,2.0,0.0,2.0,1.414214,2.0,2.0,1.414214,1.414214,1.414214
4,1.414214,1.414214,2.0,2.0,0.0,2.0,2.0,1.414214,2.0,1.414214,2.44949
5,2.44949,1.414214,2.0,1.414214,2.0,0.0,2.0,2.0,2.0,2.0,2.0
6,2.44949,2.0,0.0,2.0,2.0,2.0,0.0,1.414214,1.414214,2.0,2.44949
7,2.0,1.414214,1.414214,2.0,1.414214,2.0,1.414214,0.0,2.0,1.414214,2.44949
8,2.44949,2.0,1.414214,1.414214,2.0,2.0,1.414214,2.0,0.0,1.414214,2.0
9,2.0,1.414214,2.0,1.414214,1.414214,2.0,2.0,1.414214,1.414214,0.0,2.0


In [23]:
dst_matrix1 = pairwise_distances(soy_df_oh_enc, metric='euclidean')
pd.DataFrame(dst_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,2.0,2.44949,2.44949,1.414214,2.44949,2.44949,2.0,2.44949,2.0,2.0
1,2.0,0.0,2.0,2.0,1.414214,1.414214,2.0,1.414214,2.0,1.414214,2.44949
2,2.44949,2.0,0.0,2.0,2.0,2.0,0.0,1.414214,1.414214,2.0,2.44949
3,2.44949,2.0,2.0,0.0,2.0,1.414214,2.0,2.0,1.414214,1.414214,1.414214
4,1.414214,1.414214,2.0,2.0,0.0,2.0,2.0,1.414214,2.0,1.414214,2.44949
5,2.44949,1.414214,2.0,1.414214,2.0,0.0,2.0,2.0,2.0,2.0,2.0
6,2.44949,2.0,0.0,2.0,2.0,2.0,0.0,1.414214,1.414214,2.0,2.44949
7,2.0,1.414214,1.414214,2.0,1.414214,2.0,1.414214,0.0,2.0,1.414214,2.44949
8,2.44949,2.0,1.414214,1.414214,2.0,2.0,1.414214,2.0,0.0,1.414214,2.0
9,2.0,1.414214,2.0,1.414214,1.414214,2.0,2.0,1.414214,1.414214,0.0,2.0
