-----------------------------------
#### Gower distance 
- a metric designed to handle mixed data types, including categorical and numerical features.
- To use Gower distance for KNN, you can define a custom distance function that calculates the Gower distance between two data points.
---------------------------

In [2]:
#pip install gower

#### Example 01

In [57]:
import pandas as pd
from gower import gower_matrix

In [58]:
# Create a small dataset with numerical and categorical features
data = {
    'Numerical1':   [5, 10, 8, 3],
    'Categorical1': ['Red', 'Blue', 'Green', 'Red'],
    'Numerical2':   [1.5, 2.0, 1.8, 1.2],
    'Categorical2': ['Small', 'Medium', 'Large', 'Small']
}

In [59]:
df = pd.DataFrame(data)
df

Unnamed: 0,Numerical1,Categorical1,Numerical2,Categorical2
0,5,Red,1.5,Small
1,10,Blue,2.0,Medium
2,8,Green,1.8,Large
3,3,Red,1.2,Small


In [60]:
# Calculate Gower matrix
gower_distances = gower_matrix(df)

In [61]:
gower_distances

array([[0.        , 0.83482146, 0.70089287, 0.16517858],
       [0.83482146, 0.        , 0.6339286 , 1.        ],
       [0.70089287, 0.6339286 , 0.        , 0.86607146],
       [0.16517858, 1.        , 0.86607146, 0.        ]], dtype=float32)

#### Example 02

In [62]:
# Example DataFrame with ordinal values as strings
data = {'OrdinalString': ['Low', 'Medium', 'High', 'Low']}
df = pd.DataFrame(data)

In [63]:
df

Unnamed: 0,OrdinalString
0,Low
1,Medium
2,High
3,Low


In [64]:
# Calculate Gower matrix
gower_distances = gower_matrix(df)

In [65]:
gower_distances

array([[0., 1., 1., 0.],
       [1., 0., 1., 1.],
       [1., 1., 0., 1.],
       [0., 1., 1., 0.]], dtype=float32)

#### Example 03 
- after applying ordinal encoder

In [81]:
# Example DataFrame with ordinal values as strings
data = {'OrdinalString': ['Low', 'Medium', 'High', 'Low']}
df = pd.DataFrame(data)

In [82]:
df

Unnamed: 0,OrdinalString
0,Low
1,Medium
2,High
3,Low


In [83]:
# Explicit encoding mapping
ordinal_mapping = {'Low': 2, 'Medium': 4, 'High': 6}

In [84]:
# Apply explicit encoding and cast to float
df['OrdinalEncoded'] = df['OrdinalString'].map(ordinal_mapping).astype(float)
df

Unnamed: 0,OrdinalString,OrdinalEncoded
0,Low,2.0
1,Medium,4.0
2,High,6.0
3,Low,2.0


In [85]:
# Drop the original string column if needed
df = df.drop(columns=['OrdinalString'])

In [86]:
# Calculate Gower matrix
gower_distances = gower_matrix(df)
gower_distances

array([[0. , 0.5, 1. , 0. ],
       [0.5, 0. , 0.5, 0.5],
       [1. , 0.5, 0. , 1. ],
       [0. , 0.5, 1. , 0. ]], dtype=float32)

#### Example 04 
- nominal data

In [87]:
# Given city names
city_names = ['Kolkata', 'Chennai', 'Mumbai', 'Delhi']

In [92]:
# Generate 20 random samples from the city names
np.random.seed(42)  # for reproducibility
random_samples = np.random.choice(city_names, size=10)

In [93]:
# Create a DataFrame
data = {'City': random_samples}
df = pd.DataFrame(data)
df

Unnamed: 0,City
0,Mumbai
1,Delhi
2,Kolkata
3,Mumbai
4,Mumbai
5,Delhi
6,Kolkata
7,Kolkata
8,Mumbai
9,Chennai


In [94]:
# Calculate Gower matrix
gower_distances = gower_matrix(df)

In [95]:
gower_distances

array([[0., 1., 1., 0., 0., 1., 1., 1., 0., 1.],
       [1., 0., 1., 1., 1., 0., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 0., 0., 1., 1.],
       [0., 1., 1., 0., 0., 1., 1., 1., 0., 1.],
       [0., 1., 1., 0., 0., 1., 1., 1., 0., 1.],
       [1., 0., 1., 1., 1., 0., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 0., 0., 1., 1.],
       [1., 1., 0., 1., 1., 1., 0., 0., 1., 1.],
       [0., 1., 1., 0., 0., 1., 1., 1., 0., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.]], dtype=float32)

#### Example 05

In [68]:
import pandas as pd
import numpy as np
#from sklearn.neighbors import DistanceMetric
from sklearn.metrics import DistanceMetric

from sklearn.preprocessing import LabelEncoder

from gower import gower_matrix  # Import gower_matrix from the gower library

In [39]:
# Create a DistanceMetric instance for Euclidean distance
metric = DistanceMetric.get_metric('euclidean')

In [40]:
# Example data
X = [[0, 0], 
     [1, 1], 
     [2, 2]]

In [41]:
# Calculate pairwise Euclidean distances
distances = metric.pairwise(X)

print("Pairwise Euclidean Distances:")
print(distances)

Pairwise Euclidean Distances:
[[0.         1.41421356 2.82842712]
 [1.41421356 0.         1.41421356]
 [2.82842712 1.41421356 0.        ]]


In [42]:
# Define a function to calculate Gower distance between two data points
def gower_distance(X1, X2):
    metric = DistanceMetric.get_metric('gower')
    return metric.pairwise(X1, X2)

In [43]:
# Define categorical columns
colors = ['Red',   'Black',      'Blue']
sizes =  ['Small', 'Medium',     'Large']
areas =  ['RURAL', 'SEMI-URBAN', 'URBAN']

In [44]:
# Generate random samples
np.random.seed(42)  # For reproducibility
data = {
    'Color': np.random.choice(colors, size=20),
    'Size': np.random.choice(sizes, size=20),
    'Area': np.random.choice(areas, size=20),
    'Numeric1': np.random.rand(20),
    'Numeric2': np.random.rand(20)
}


In [45]:
df = pd.DataFrame(data)

In [46]:
df

Unnamed: 0,Color,Size,Area,Numeric1,Numeric2
0,Blue,Small,RURAL,0.90932,0.356753
1,Red,Small,URBAN,0.25878,0.280935
2,Blue,Medium,URBAN,0.662522,0.542696
3,Blue,Medium,RURAL,0.311711,0.140924
4,Red,Small,RURAL,0.520068,0.802197
5,Red,Small,URBAN,0.54671,0.074551
6,Blue,Small,SEMI-URBAN,0.184854,0.986887
7,Black,Large,RURAL,0.969585,0.772245
8,Blue,Large,SEMI-URBAN,0.775133,0.198716
9,Blue,Large,SEMI-URBAN,0.939499,0.005522


In [47]:
# Separate categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols   = df.select_dtypes(include=['float64']).columns

In [48]:
# Apply label encoding to categorical columns
label_encoder = LabelEncoder()
df_encoded    = df.copy()


In [49]:
df_encoded

Unnamed: 0,Color,Size,Area,Numeric1,Numeric2
0,Blue,Small,RURAL,0.90932,0.356753
1,Red,Small,URBAN,0.25878,0.280935
2,Blue,Medium,URBAN,0.662522,0.542696
3,Blue,Medium,RURAL,0.311711,0.140924
4,Red,Small,RURAL,0.520068,0.802197
5,Red,Small,URBAN,0.54671,0.074551
6,Blue,Small,SEMI-URBAN,0.184854,0.986887
7,Black,Large,RURAL,0.969585,0.772245
8,Blue,Large,SEMI-URBAN,0.775133,0.198716
9,Blue,Large,SEMI-URBAN,0.939499,0.005522


In [50]:
df_encoded[categorical_cols] = df[categorical_cols].apply(label_encoder.fit_transform)

In [51]:
df_encoded

Unnamed: 0,Color,Size,Area,Numeric1,Numeric2
0,1,2,0,0.90932,0.356753
1,2,2,2,0.25878,0.280935
2,1,1,2,0.662522,0.542696
3,1,1,0,0.311711,0.140924
4,2,2,0,0.520068,0.802197
5,2,2,2,0.54671,0.074551
6,1,2,1,0.184854,0.986887
7,0,0,0,0.969585,0.772245
8,1,0,1,0.775133,0.198716
9,1,0,1,0.939499,0.005522


In [52]:
# Combine numerical and label-encoded categorical columns for Gower distance calculation
X_gower = pd.concat([df_encoded[categorical_cols], df[numerical_cols]], axis=1)

In [53]:
X_gower

Unnamed: 0,Color,Size,Area,Numeric1,Numeric2
0,1,2,0,0.90932,0.356753
1,2,2,2,0.25878,0.280935
2,1,1,2,0.662522,0.542696
3,1,1,0,0.311711,0.140924
4,2,2,0,0.520068,0.802197
5,2,2,2,0.54671,0.074551
6,1,2,1,0.184854,0.986887
7,0,0,0,0.969585,0.772245
8,1,0,1,0.775133,0.198716
9,1,0,1,0.939499,0.005522


In [54]:
# Calculate pairwise Gower distances using gower_matrix
gower_distances = gower_matrix(X_gower)

In [55]:
# Convert distances to a DataFrame for better visualization
gower_distances_df = pd.DataFrame(gower_distances, columns=df.index, index=df.index)


In [56]:
gower_distances_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.456207,0.391294,0.273288,0.275002,0.435969,0.38517,0.397715,0.361241,0.37811,0.29662,0.338731,0.378581,0.362077,0.611958,0.58731,0.675448,0.515843,0.492357,0.522705
1,0.456207,0.0,0.340703,0.439986,0.362766,0.104359,0.359867,0.853922,0.528477,0.603413,0.546555,0.660176,0.434788,0.536774,0.555751,0.162006,0.448039,0.74675,0.272492,0.533502
2,0.391294,0.340703,0.0,0.357784,0.483708,0.320465,0.393876,0.51322,0.294468,0.369403,0.205852,0.347438,0.294085,0.370784,0.496454,0.271108,0.359943,0.524549,0.301063,0.279127
3,0.273288,0.439986,0.357784,0.0,0.379847,0.464373,0.399853,0.471004,0.312047,0.363427,0.363636,0.277258,0.451869,0.17676,0.33867,0.401993,0.408053,0.363831,0.50704,0.550583
4,0.275002,0.362766,0.483708,0.379847,0.0,0.354057,0.310168,0.503365,0.578176,0.653111,0.383789,0.33627,0.301853,0.299681,0.718517,0.493171,0.782007,0.440841,0.490274,0.762835
5,0.435969,0.104359,0.320465,0.464373,0.354057,0.0,0.464226,0.833684,0.474728,0.499054,0.526317,0.639938,0.41455,0.641133,0.575989,0.266365,0.45632,0.794898,0.371412,0.513264
6,0.38517,0.359867,0.393876,0.399853,0.310168,0.464226,0.0,0.613533,0.488344,0.56328,0.288551,0.446439,0.412022,0.264792,0.488443,0.458282,0.607906,0.469327,0.292813,0.673004
7,0.397715,0.853922,0.51322,0.471004,0.503365,0.833684,0.613533,0.0,0.358957,0.362766,0.324982,0.193746,0.419135,0.390837,0.409673,0.784327,0.473163,0.144206,0.58143,0.32042
8,0.361241,0.528477,0.294468,0.312047,0.578176,0.474728,0.488344,0.358957,0.0,0.074936,0.251589,0.241905,0.339822,0.465251,0.250716,0.490484,0.314206,0.419017,0.595531,0.238537
9,0.37811,0.603413,0.369403,0.363427,0.653111,0.499054,0.56328,0.362766,0.074936,0.0,0.274729,0.316841,0.351258,0.540187,0.274837,0.56542,0.355374,0.493953,0.670467,0.290276
