In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Category vocabularies that the synthetic samples are drawn from
colors = ["Red", "Black", "Blue"]
sizes = ["Small", "Medium", "Large"]
areas = ["RURAL", "SEMI-URBAN", "URBAN"]

In [3]:
# Draw five values per categorical feature; seeding first keeps the draws repeatable.
# The features are sampled in the order Color, Size, Area, which fixes how the
# seeded random stream is consumed.
np.random.seed(42)
data = {
    feature: np.random.choice(levels, size=5)
    for feature, levels in (('Color', colors), ('Size', sizes), ('Area', areas))
}

In [4]:
# Assemble the sampled arrays into a DataFrame, one column per key of `data`
df = pd.DataFrame(data)

In [5]:
# Show the sampled categorical frame
df

Unnamed: 0,Color,Size,Area
0,Blue,Small,URBAN
1,Red,Large,URBAN
2,Blue,Medium,RURAL
3,Blue,Large,URBAN
4,Red,Large,SEMI-URBAN


In [6]:
# Apply one-hot encoding.
# drop='first' omits one indicator column per feature, and a dense array is
# requested so the result can be passed straight to pandas.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# removed the old name, so try the current spelling first and fall back to the
# legacy one only when the installed version rejects it.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
encoded_data = encoder.fit_transform(df)

In [7]:
# Label the encoded matrix with the encoder's generated feature names, then show it
encoded_df = pd.DataFrame(
    data=encoded_data,
    columns=encoder.get_feature_names_out(df.columns),
)
encoded_df

Unnamed: 0,Color_Red,Size_Medium,Size_Small,Area_SEMI-URBAN,Area_URBAN
0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0


In [8]:
# Pairwise Euclidean distances between the one-hot encoded rows.
# With the second argument omitted, the rows are compared against themselves.
distances = euclidean_distances(encoded_data)

In [9]:
# Label both axes of the distance matrix with the sample index for readability
distances_df = pd.DataFrame(data=distances, index=df.index, columns=df.index)

In [10]:
# Emit the heading followed by the distance table as plain text
print("\nPairwise Euclidean Distances:", distances_df, sep="\n")


Pairwise Euclidean Distances:
          0         1         2         3         4
0  0.000000  1.414214  1.732051  1.000000  2.000000
1  1.414214  0.000000  1.732051  1.000000  1.414214
2  1.732051  1.732051  0.000000  1.414214  1.732051
3  1.000000  1.000000  1.414214  0.000000  1.732051
4  2.000000  1.414214  1.732051  1.732051  0.000000


#### Label encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Label-encode every column to integer codes.
# The single encoder is refit on each column in turn, so after this cell it only
# holds the fitted state of the last column processed.
label_encoder = LabelEncoder()
df_encoded = df.apply(lambda column: label_encoder.fit_transform(column))

In [13]:
# Show the integer-coded frame
df_encoded

Unnamed: 0,Color,Size,Area
0,0,2,2
1,1,0,2
2,0,1,0
3,0,0,2
4,1,0,1


In [14]:
# Pairwise Euclidean distances between the label-encoded rows.
# With the second argument omitted, the rows are compared against themselves.
# NOTE: this rebinds `distances`, replacing the one-hot result computed earlier.
distances = euclidean_distances(df_encoded)

In [15]:
# Label both axes of the distance matrix with the sample index for readability.
# NOTE: this rebinds `distances_df`, replacing the one-hot table built earlier.
distances_df = pd.DataFrame(data=distances, index=df.index, columns=df.index)


In [16]:
# Show the pairwise distances computed from the label-encoded frame
distances_df

Unnamed: 0,0,1,2,3,4
0,0.0,2.236068,2.236068,2.0,2.44949
1,2.236068,0.0,2.44949,1.0,1.0
2,2.236068,2.44949,0.0,2.236068,1.732051
3,2.0,1.0,2.236068,0.0,1.414214
4,2.44949,1.0,1.732051,1.414214,0.0
