#### Import library dan tools yang diperlukan

In [1]:
import numpy as np
import pandas as pd
import math 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from scipy.spatial.distance import pdist, squareform

#### Mengambil iris dataset dari UCI Machine Learning Repository

In [3]:
# Source file has no header row, so the column names are supplied explicitly.
IRIS_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
IRIS_COLUMNS = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']

df = pd.read_csv(IRIS_URL, names=IRIS_COLUMNS)

#### Melihat jumlah data dan tipe datanya

In [4]:
# Print the row count, per-column non-null counts, and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal length    150 non-null float64
sepal width     150 non-null float64
petal length    150 non-null float64
petal width     150 non-null float64
class           150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


#### Mengambil hanya 25 sample dari 150 data

In [5]:
# Draw 25 rows at random. The seed is fixed so that the same rows are selected
# on every run; without it each Restart & Run All yields a different sample and
# therefore different distance matrices in the exported Excel files.
df = df.sample(25, random_state=42)

In [6]:
# Show the distinct class labels present in the sampled rows.
df['class'].unique()

array(['Iris-virginica', 'Iris-setosa', 'Iris-versicolor'], dtype=object)

#### Mengubah atribut class menjadi:
- Iris-setosa = 1
- Iris-versicolor = 2
- Iris-virginica = 3

In [7]:
# Integer code assigned to each species label in the 'class' column.
class_codes = {
    'Iris-setosa': 1,
    'Iris-versicolor': 2,
    'Iris-virginica': 3,
}

# Only the 'class' column is targeted; the numeric columns are left untouched.
df = df.replace({'class': class_codes})

In [8]:
# Preview the first rows to check that 'class' now holds the integer codes.
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
118,7.7,2.6,6.9,2.3,3
126,6.2,2.8,4.8,1.8,3
27,5.2,3.5,1.5,0.2,1
33,5.5,4.2,1.4,0.2,1
64,5.6,2.9,3.6,1.3,2


#### Membuat fungsi normalisasi untuk menghitung proximity measure dari atribut ordinal

In [9]:
def normalize(x):
    """Min-max scale the values of ``x`` into the interval [0, 1].

    Each value ``v`` is mapped to ``(v - min) / (max - min)``.

    Parameters
    ----------
    x : iterable of numbers
        Values to rescale (e.g. a list or a pandas Series).

    Returns
    -------
    list
        The rescaled values, in the same order as ``x``. An empty input gives
        an empty list. If every value is identical (``max == min``) the range
        is zero, so every value is mapped to ``0.0`` instead of dividing by
        zero.
    """
    values = list(x)
    if not values:
        return []

    # Compute the bounds once instead of rescanning the input for every element.
    lowest = min(values)
    span = max(values) - lowest

    # A constant column carries no ordering information; avoid a zero division.
    if span == 0:
        return [0.0 for _ in values]

    return [(value - lowest) / span for value in values]

In [10]:
# Rescale the integer class codes with normalize() and write them back to 'class'.
df['class'] = normalize(df['class'])

#### Menulis ke dalam excel data yang telah dinormalisasi

In [11]:
# Save the sampled frame, with its normalized 'class' column, to an Excel file.
df.to_excel('data-after-normalization.xlsx')

In [12]:
# Condensed vector of pairwise Euclidean distances between the rows of df.
euclidean_distances = pdist(df, metric='euclidean')
# Expand to a square matrix and wrap it in a DataFrame for export.
euc_matrix = pd.DataFrame(squareform(euclidean_distances))

In [13]:
# Condensed vector of pairwise Manhattan (city-block) distances between the rows of df.
manhattan_distances = pdist(df, metric='cityblock')
# Expand to a square matrix and wrap it in a DataFrame for export.
manhattan_matrix = pd.DataFrame(squareform(manhattan_distances))

In [14]:
# Condensed vector of pairwise Minkowski distances (order p=5) between the rows of df.
minkowski_distances = pdist(df, metric='minkowski', p=5)
# Expand to a square matrix and wrap it in a DataFrame for export.
minkowski_matrix = pd.DataFrame(squareform(minkowski_distances))

In [15]:
# Condensed vector of pairwise supremum (Chebyshev) distances between the rows of df.
supremum_distance = pdist(df, metric='chebyshev')
# Expand to a square matrix and wrap it in a DataFrame for export.
supremum_matrix = pd.DataFrame(squareform(supremum_distance))

#### Menulis hasil ke dalam file excel

In [16]:
# (sheet name, matrix) pairs, listed in the order the sheets are written.
sheets = (
    ('Euclidean Distance', euc_matrix),
    ('Manhattan Distance', manhattan_matrix),
    ('Minkowski Distance', minkowski_matrix),
    ('Supremum Distance', supremum_matrix),
)

# One workbook, one sheet per distance metric.
with pd.ExcelWriter('dissimilarity-matrix.xlsx') as writer:
    for sheet_name, matrix in sheets:
        matrix.to_excel(writer, sheet_name=sheet_name)