# Exam Data Mining

### Prof. Acuña
###  Francisco Diaz



##### Question 1 HVDM Distance Implementation

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import math
from scipy.spatial.distance import pdist, squareform
import numbers

#### Reading and cleaning data

In [2]:
# Paths and datasets

path = os.getcwd()
datasetpath = os.path.join(path,"data")
flagdataset = os.path.join(datasetpath,"flag2.csv")
zoodataset = os.path.join(datasetpath,"zoo1.csv")


# Data Set to DataFrame
flagdf = pd.read_csv(flagdataset)
zoodf = pd.read_csv(zoodataset)

In [3]:
zoodf

Unnamed: 0,animal name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
5,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
6,calf,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,1
7,carp,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,4
8,catfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
9,cavy,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,1


In [4]:
flagdf.head()

Unnamed: 0,names,landmass,zone,area,population,language,religion,bars,stripes,colors,...,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright,Unnamed: 29
0,Afghanistan,5,1,648,16,10,2,0,3,5,...,0,0,1,0,0,1,0,0,black,green
1,Albani,3,1,29,3,6,6,0,0,3,...,0,0,1,0,0,0,1,0,red,red
2,Algeria,4,1,2388,20,8,2,2,0,3,...,0,0,1,1,0,0,0,0,green,white
3,American-Samoa,6,3,0,0,1,1,0,0,5,...,0,0,0,0,1,1,1,0,blue,red
4,Andorra,3,1,0,0,6,0,3,0,3,...,0,0,0,0,0,0,0,0,blue,red


### HVDM Distance Function

In [5]:
# The number of instances in the training set T that have value "x" for attribute "a" and output class "c"
# params:
# df:DataFrame
# x:Value
# a:Attribute
# c:Class



def N(df,a,x,c,typeatr):
    count = 0
    if(a!=typeatr):
        if(c == "all"):
            count = df[a][df[a]==x].shape[0]
        else:
            #filtering by class value
            dfN = df.loc[df[typeatr] == c]
            count = dfN[dfN[a] == x].shape[0]
    return count



# Function for the normalized vdm
# args:
# df : DataFrame
# a : Attribute
# x: Value
# y: Value
# lc: List of Output Classes
# n: Use either N1 or N2 normalization
def normalized_vdm(df,a,x,y,lc,n):
    result = 0
    Nax = N(df,a,x,"all","type")
    Nay = N(df,a,y,"all","type")
    if(Nax==0 or Nay==0):
        return result
    if(n==1):
        for c in lc:
            Naxc = N(df,a,x,c,"type")
            Nayc = N(df,a,y,c,"type")
            result += abs((Naxc/Nax)-(Nayc/Nay))
    elif(n==2):
        for c in lc:
            Naxc = N(df,a,x,c,"type")
            Nayc = N(df,a,y,c,"type")
            result += pow(abs((Naxc/Nax)-(Nayc/Nay)),2)
        result = math.sqrt(result)
        
    return result



# args:
# df : DataFrame
# x: Vector 
# y: Vector 
# n: Use either N1 or N2 normalization
# lc: List of Output Classes
def HVDM(df,x,y,n,lc):
    result =0
    for i,a in enumerate(df):
        result += pow(normalized_vdm(df,a,x[i],y[i],lc,1),2)
    return math.sqrt(result)





#### Testing HVDM Distance Function

In [6]:
lc = [1,2,3,4,5,6,7] # List of Output Classes for the zoo dataset
# Testing Normalized method
normalized_vdm(zoodf,"aquatic",0,1,lc,1)


1.1547008547008548

In [7]:
zoodrop = zoodf.drop(['animal name'],axis=1)
x = np.array(zoodrop.iloc[19]) # Dolphin
y = np.array(zoodrop.iloc[1]) # Antelope
HVDM(zoodrop,x,y,1,lc)
#np.array(zoodrop.iloc[0])

3.1047625576912403

In [8]:
# Just to verify it should be 0.0 since we are choosing the same points
x = np.array(zoodrop.iloc[1]) # Antelope
y = np.array(zoodrop.iloc[1]) # Antelope
HVDM(zoodrop,x,y,1,lc)

0.0

In [9]:
x = np.array(zoodrop.iloc[81]) # Slug
y = np.array(zoodrop.iloc[1]) # Antelope
HVDM(zoodrop,x,y,1,lc)

4.932345069728164

## The Gower Distance Function

#### Gower Distance Code

In [24]:
#Normalize the array
def normalize_mixed_data_columns(arr, dtypes):
  
    if isinstance(arr,pd.DataFrame):
       arr =np.asmatrix(arr.copy())
    elif isinstance(arr,np.ndarray):
       arr =arr.copy()
    else:
       raise ValueError('A DataFrame or ndarray must be provided.')
    rows,cols = arr.shape
    for col in range(cols):
        if np.issubdtype(dtypes[col],np.number):
            max = arr[:,col].max()+0.0  #Converts it to double
            if (cols>1):
                arr[:,col] = arr[:,col] /max
                
            else:    
                arr= arr/max
    return( arr)

#This is to obtain the range (max-min) values of each numeric column
def calc_range_mixed_data_columns(arr, dtypes):
    rows,cols = arr.shape
    
    result = np.zeros(cols)
    for col in range(cols):
        if np.issubdtype(dtypes[col],np.number):
            result[col]= arr[:,col].max()-arr[:,col].min()
    return( result)


#This function was copied from pdist because it is private. No change in the original function.
def _validate_vector(u, dtype=None):
    # XXX Is order='c' really necessary?
    u = np.asarray(u, dtype=dtype, order='c').squeeze()
    # Ensure values such as u=1 and u=[1] still return 1-D arrays.
    u = np.atleast_1d(u)
    if u.ndim > 1:
        raise ValueError("Input vector should be 1-D.")
    return u



def gower(xi, xj,V=None,w=None,VI=None):
    cols = len(xj)
    
    xi=_validate_vector(xi)
    xj=_validate_vector(xj)

    if V is None:
        raise ValueError('An array with the (max-min) ranges for each numeric column must be passed in V.')

    if VI is None:
        raise ValueError('An array with the dtypes or each numeric column must be passed in VI.')

    if w is None:
        w=[1]*cols
    
    sum_sij =0.0
    sum_wij =0.0
    for col in range(cols):
        sij=0.0
        wij=0.0
        
        if np.issubdtype(VI[col], np.number):
            sij=abs(xi[col]-xj[col])/(V[col])
            wij=(w[col],0)[pd.isnull(xi[col]) or pd.isnull(xj[col])]
            
        else:
            sij=(1,0)[xi[col]==xj[col]]
            wij=(w[col],0)[pd.isnull(xi[col]) and pd.isnull(xj[col])]
        
        sum_sij+= (wij*sij)
        sum_wij+=wij

    
    return(sum_sij/sum_wij)

#### Testing Gower Distance

In [25]:
zoodropdf = zoodf.drop(["animal name","type"],axis=1)
zoodropdf

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1
5,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1
6,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1
7,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0
8,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0
9,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0


In [26]:
print(zoodropdf.dtypes)


#It's necessary to obtain the columns dtypes
dtypes = zoodropdf.dtypes
#It's necessary to normalize between 0 and 1
zoodropdfnm=normalize_mixed_data_columns(zoodropdf,dtypes)

#It's necessary to obtain the range (max-min) values of each numeric column
ranges=calc_range_mixed_data_columns(zoodropdfnm,dtypes)

hair        int64
feathers    int64
eggs        int64
milk        int64
airborne    int64
aquatic     int64
predator    int64
toothed     int64
backbone    int64
breathes    int64
venomous    int64
fins        int64
legs        int64
tail        int64
domestic    int64
catsize     int64
dtype: object


In [29]:
x = np.array(zoodropdf.iloc[81]) # Slug
y = np.array(zoodropdf.iloc[1]) # Antelope
print(x)
gower(x,y,V=ranges,VI=dtypes)

[0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0]


0.6875

In [30]:
x = np.array(zoodropdf.iloc[19]) # Dolphin
y = np.array(zoodropdf.iloc[1]) # Antelope
print(x)
gower(x,y,V=ranges,VI=dtypes)

[0 0 0 1 0 1 1 1 1 1 0 1 0 1 0 1]


0.5

In [31]:
#Should be 0 (same point)
x = np.array(zoodropdf.iloc[1]) # Antelope
y = np.array(zoodropdf.iloc[1]) # Antelope
print(x)
gower(x,y,V=ranges,VI=dtypes)

[1 0 0 1 0 0 0 1 1 1 0 0 4 1 0 1]


0.0

# KNN

In [73]:

from statistics import mode

#params:
# trainingdata: Data used for training
# labels: The labels of each instance of the training data
# point: the vector to predict
# distance: The distance function
# K: the number of neighbors to consider (Default is 3)


def knn(trainingdata,labels,point,distance, k=3):
    distances = [] # for each distance calculated there will be a corresponding label
    if(distance == "gower"):
        
        #It's necessary to obtain the columns dtypes
        dtypes = trainingdata.dtypes
        #It's necessary to normalize between 0 and 1
        trainingdropdfnm=normalize_mixed_data_columns(trainingdata,dtypes)

        #It's necessary to obtain the range (max-min) values of each numeric column
        ranges=calc_range_mixed_data_columns(trainingdropdfnm,dtypes)
        
        
        #for every row calculate the distance between it and a point, add them to distance list
        for i in range(trainingdata.shape[0]):
            x =  np.array(trainingdata.iloc[i])
            d = gower(x,point,V=ranges,VI=dtypes)
            distances.append((d,labels[i]))
            distances.sort()
            
    elif(distance =="HVDM"):
        for i in range(trainingdata.shape[0]):
            x = np.array(trainingdata.iloc[i])
            lc = list(set(labels))
            d = HVDM(trainingdata,x,mypoint,1,lc)
            distances.append((d,labels[i]))
            distances.sort()
            
    return distances[:k]

    
    
    
    

## Testing KNN Gower

In [74]:
labels = zoodf["type"]
#len(np.array(labels))
mypoint = np.array(zoodf.drop(["animal name","type"],axis=1).iloc[7])
trainingdropdf = zoodf.drop(["animal name","type"],axis=1)
knn(trainingdropdf,labels,mypoint,distance="gower")

[(0.0, 4), (0.0625, 4), (0.0625, 4)]

## Testing KNN HVDM

In [75]:
labels = zoodf["type"]
trainingdata= zoodf.drop(['animal name'],axis=1)
mypoint = np.array(zoodf.drop(["animal name"],axis=1).iloc[7])
knn(trainingdata,labels,mypoint,distance="HVDM")

[(0.0, 4), (0.5559440559440559, 4), (0.5559440559440559, 4)]

### Obtaining Percentages for comparison