# Gower Distance

### Import packages and data

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import DistanceMetric
import warnings
warnings.simplefilter("ignore")
import scipy.cluster.hierarchy as shc
%matplotlib inline

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, homogeneity_completeness_v_measure

import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
from scipy.spatial import distance

In [6]:
import pyarrow.parquet as pq

# Read the Parquet file into an in-memory Arrow table.
table = pq.read_table("Data/customer_data.parquet")
# Convert to a pandas DataFrame; the cells below call DataFrame methods
# (e.g. `.iloc`, `.isnull()`) on `data`.
data = table.to_pandas()

In [9]:
# Keep only the first 1000 rows.
# NOTE(review): presumably done to bound the size of the pairwise distance
# matrix computed below (1000 x 1000) — confirm.
data = data[0:1000]

### Gower Distance

In [10]:
def _numeric_feature_distance(column):
    """Return range-normalised absolute differences for one numeric column."""
    values = column.to_numpy(dtype=float, na_value=np.nan)
    pairwise = np.abs(values[:, None] - values[None, :])

    observed = values[~np.isnan(values)]
    span = observed.max() - observed.min() if observed.size else 0.0
    # A constant column has zero range; every observed pair is already at
    # distance 0, so skip the division instead of producing 0/0 = NaN.
    if span > 0:
        pairwise /= span
    return pairwise


def _nominal_feature_distance(column):
    """Return a 0/1 mismatch matrix for one nominal column."""
    # Integer codes make the comparison cheap; missing values get code -1.
    codes, _ = pd.factorize(column)
    pairwise = (codes[:, None] != codes[None, :]).astype(float)

    # A comparison involving a missing value is undefined, not "different".
    missing = codes == -1
    pairwise[missing, :] = np.nan
    pairwise[:, missing] = np.nan
    return pairwise


def gower_distance(X):
    """Compute the pairwise Gower distance between the rows of a DataFrame.

    Each column contributes one per-feature distance in [0, 1]; the result is
    the unweighted mean of those contributions.

    * Numeric columns: Manhattan distance normalised by the range of the
      variable (https://en.wikipedia.org/wiki/Taxicab_geometry). A constant
      column contributes 0 for every pair.
    * All other columns (object, category, string, bool, ...) are treated as
      nominal: 0 if the two values are equal, 1 otherwise. This is what the
      Dice distance
      (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
      evaluates to on a one-hot encoding of a single nominal variable.

    Parameters
    ----------
    X : pandas.DataFrame
        Observations in rows, variables in columns.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n_rows, n_rows)`` float matrix of Gower distances. Pairs
        involving a missing value in any column are NaN.

    Raises
    ------
    ValueError
        If ``X`` has no columns.
    """
    n_samples, n_features = X.shape
    if n_features == 0:
        raise ValueError("gower_distance requires at least one feature column")

    # Accumulate a running sum rather than stacking one n x n matrix per
    # feature, which would need n_features times the memory.
    total = np.zeros((n_samples, n_samples), dtype=float)

    for i in range(n_features):
        column = X.iloc[:, i]
        dtype = column.dtype
        # bool counts as numeric for pandas, but it is a two-level nominal here.
        is_numeric = (
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )
        if is_numeric:
            total += _numeric_feature_distance(column)
        else:
            total += _nominal_feature_distance(column)

    return total / n_features

In [11]:
# Bind X as a second name for `data` (plain assignment: no copy is made)
X=data

In [12]:
# Compute the pairwise Gower distance matrix for the rows of X and store it as dist
dist=gower_distance(X)

In [13]:
# Sanity check: the distance matrix should be square, one row/column per observation
dist.shape

(1000, 1000)

In [14]:
# Shape of the input data (rows, columns), for comparison with dist.shape
data.shape

(1000, 4)

In [15]:
# Wrap the distance array in a DataFrame so it can be previewed and post-processed with pandas
dist1=pd.DataFrame(dist)

In [16]:
# Preview the first five rows of the distance matrix
dist1.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.278378,0.27027,0.256757,0.278378,0.278378,0.272973,0.271622,0.277027,0.277027,...,0.547188,0.529621,0.547188,0.533675,0.547188,0.547188,0.545837,0.565998,0.565998,0.56735
1,0.278378,0.0,0.258108,0.271622,0.25,0.25,0.255405,0.256757,0.251351,0.251351,...,0.521513,0.53908,0.521513,0.535026,0.521513,0.521513,0.522864,0.543025,0.543025,0.541674
2,0.27027,0.258108,0.0,0.263514,0.258108,0.258108,0.252703,0.251351,0.256757,0.256757,...,0.526918,0.530972,0.526918,0.526918,0.526918,0.526918,0.525567,0.545728,0.545728,0.547079
3,0.256757,0.271622,0.263514,0.0,0.271622,0.271622,0.266216,0.264865,0.27027,0.27027,...,0.540432,0.522864,0.540432,0.526918,0.540432,0.540432,0.53908,0.559241,0.559241,0.560593
4,0.278378,0.25,0.258108,0.271622,0.0,0.25,0.255405,0.256757,0.251351,0.251351,...,0.521513,0.53908,0.521513,0.535026,0.521513,0.521513,0.522864,0.543025,0.543025,0.541674


In [17]:
# Per-column missing-value check: one boolean per column, True if that column contains any null
data.isnull().values.any(axis=0)

array([False, False, False, False])

In [18]:
# Replace any NaN entries in the distance matrix with 0.
# NOTE(review): a distance of 0 marks a pair as identical, so this treats
# undefined distances as perfect matches — confirm that is intended.
dist1=dist1.fillna(0)

In [19]:
# Preview the first five rows again after filling NaN values
dist1.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.278378,0.27027,0.256757,0.278378,0.278378,0.272973,0.271622,0.277027,0.277027,...,0.547188,0.529621,0.547188,0.533675,0.547188,0.547188,0.545837,0.565998,0.565998,0.56735
1,0.278378,0.0,0.258108,0.271622,0.25,0.25,0.255405,0.256757,0.251351,0.251351,...,0.521513,0.53908,0.521513,0.535026,0.521513,0.521513,0.522864,0.543025,0.543025,0.541674
2,0.27027,0.258108,0.0,0.263514,0.258108,0.258108,0.252703,0.251351,0.256757,0.256757,...,0.526918,0.530972,0.526918,0.526918,0.526918,0.526918,0.525567,0.545728,0.545728,0.547079
3,0.256757,0.271622,0.263514,0.0,0.271622,0.271622,0.266216,0.264865,0.27027,0.27027,...,0.540432,0.522864,0.540432,0.526918,0.540432,0.540432,0.53908,0.559241,0.559241,0.560593
4,0.278378,0.25,0.258108,0.271622,0.0,0.25,0.255405,0.256757,0.251351,0.251351,...,0.521513,0.53908,0.521513,0.535026,0.521513,0.521513,0.522864,0.543025,0.543025,0.541674
