In [1]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [2]:
# Raw-GitHub location of the dog-breed CSV that is loaded with pd.read_csv
url = "https://raw.githubusercontent.com/dwightvj/PIC16B-Project/main/dogs.csv"

In [3]:
# Load the breed table and replace every missing value with 0 in one chain.
# NOTE(review): the ratings visible in the preview are all between 1 and 5, so
# a filled 0 behaves like an extreme rating in later variance/distance
# calculations -- confirm this is the intended treatment of missing data.
df = pd.read_csv(url).fillna(0)

In [4]:
df

Unnamed: 0,id,name,size,kidFriendly,dogFriendly,lowShedding,easyToGroom,highEnergy,goodHealth,lowBarking,intelligence,easyToTrain,toleratesHot,toleratesCold
0,affenpinscher,Affenpinscher,1,1,1,5,3,4,4,4.0,4,2,3,3
1,afghan-hound,Afghan Hound,4,5,5,2,1,5,3,4.0,4,1,5,5
2,airedale-terrier,Airedale Terrier,3,4,4,4,2,5,3,2.0,5,4,3,3
3,akita,Akita,4,1,1,1,1,4,4,1.0,3,2,2,5
4,alaskan-malamute,Alaskan Malamute,4,3,3,1,1,5,4,1.0,4,4,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,whippet,Whippet,3,5,5,4,5,5,4,5.0,4,4,4,1
195,wirehaired-pointing-griffon,Wirehaired Pointing Griffon,3,5,5,5,3,5,4,2.0,5,5,3,4
196,xoloitzuintli,Xoloitzcuintli,3,3,3,1,5,3,5,1.0,5,3,3,3
197,yorkipoo,Yorkipoo,1,4,4,5,4,5,3,1.0,4,4,3,2


In [5]:
# obtain the population variance (ddof=0, the same value np.var reports) of
# each trait column; positions 2..13 are the columns from 'size' through
# 'toleratesCold'
colvar = []
for trait in df.columns[2:14]:
    # compute once, then both store and print the same value
    variance = df[trait].var(ddof=0)
    colvar.append(variance)
    print(trait, variance)

size 1.3441579758086917
kidFriendly 1.0830534582460039
dogFriendly 1.0830534582460039
lowShedding 1.5943536779374259
easyToGroom 2.0993409257341984
highEnergy 0.9224514532461303
goodHealth 1.148455847074569
lowBarking 1.5708694224893311
intelligence 0.5563495871316381
easyToTrain 1.3139062144895333
toleratesHot 0.881896921794904
toleratesCold 1.4688517966718015


In [6]:
# keep 'name' plus six trait columns and drop the identifier and the rest.
# NOTE(review): the kept traits are not exactly the six with the greatest
# variance -- per the variances printed above, 'toleratesCold' (~1.47) is
# dropped while 'goodHealth' (~1.15) is kept. Confirm the selection is intended.
dropped_columns = [
    "id",
    "dogFriendly",
    "kidFriendly",
    "highEnergy",
    "intelligence",
    "toleratesHot",
    "toleratesCold",
]
df = df.drop(columns=dropped_columns)

In [7]:
df

Unnamed: 0,name,size,lowShedding,easyToGroom,goodHealth,lowBarking,easyToTrain
0,Affenpinscher,1,5,3,4,4.0,2
1,Afghan Hound,4,2,1,3,4.0,1
2,Airedale Terrier,3,4,2,3,2.0,4
3,Akita,4,1,1,4,1.0,2
4,Alaskan Malamute,4,1,1,4,1.0,4
...,...,...,...,...,...,...,...
194,Whippet,3,4,5,4,5.0,4
195,Wirehaired Pointing Griffon,3,5,3,4,2.0,5
196,Xoloitzcuintli,3,1,5,5,1.0,3
197,Yorkipoo,1,5,4,3,1.0,4


In [8]:
# pack each breed's six trait ratings into one Python list per row; the order
# of the names below is the order of the values inside every vector
feature_columns = ['size', 'lowShedding', 'easyToGroom',
                   'goodHealth', 'lowBarking', 'easyToTrain']
df['list'] = df[feature_columns].to_numpy().tolist()

In [9]:
df

Unnamed: 0,name,size,lowShedding,easyToGroom,goodHealth,lowBarking,easyToTrain,list
0,Affenpinscher,1,5,3,4,4.0,2,"[1.0, 5.0, 3.0, 4.0, 4.0, 2.0]"
1,Afghan Hound,4,2,1,3,4.0,1,"[4.0, 2.0, 1.0, 3.0, 4.0, 1.0]"
2,Airedale Terrier,3,4,2,3,2.0,4,"[3.0, 4.0, 2.0, 3.0, 2.0, 4.0]"
3,Akita,4,1,1,4,1.0,2,"[4.0, 1.0, 1.0, 4.0, 1.0, 2.0]"
4,Alaskan Malamute,4,1,1,4,1.0,4,"[4.0, 1.0, 1.0, 4.0, 1.0, 4.0]"
...,...,...,...,...,...,...,...,...
194,Whippet,3,4,5,4,5.0,4,"[3.0, 4.0, 5.0, 4.0, 5.0, 4.0]"
195,Wirehaired Pointing Griffon,3,5,3,4,2.0,5,"[3.0, 5.0, 3.0, 4.0, 2.0, 5.0]"
196,Xoloitzcuintli,3,1,5,5,1.0,3,"[3.0, 1.0, 5.0, 5.0, 1.0, 3.0]"
197,Yorkipoo,1,5,4,3,1.0,4,"[1.0, 5.0, 4.0, 3.0, 1.0, 4.0]"


In [10]:
from scipy import spatial

#gather every breed's attribute vector, in row order, into a plain list
breeds = list(df['list'])
#index those vectors in a k-d tree so nearest-neighbour lookups are possible
tree = spatial.KDTree(breeds)


In [11]:
#recommend breed that is the "nearest neighbor" to input
def breedrec(l):
    """Return the name of the breed whose attribute vector is closest to ``l``.

    ``l`` is queried against the module-level KDTree ``tree``, so it must list
    its values in the same order as the vectors the tree was built from.
    """
    #tree.query returns (distance, position); the position indexes the data the
    #tree was built from, which is in DataFrame row order, so it can be used
    #directly instead of searching `breeds` for the matching vector again
    _, index = tree.query(l)
    #get breed name based on that row position (by label, not column position)
    return df['name'].iloc[index]

In [12]:
#recommend the top three breeds that are the "nearest neighbor" to input
def top3rec(l):
    """Return the names of the three breeds closest to the attribute vector ``l``.

    ``l`` is queried against the module-level KDTree ``tree``, so it must list
    its values in the same order as the vectors the tree was built from.
    The names are returned as a list, nearest breed first.
    """
    #find the row positions of the 3 closest vectors to l
    _, closest_indices = tree.query(l, k = 3)

    #use those positions directly: looking each vector up again with
    #breeds.index() would map breeds that share an identical attribute vector
    #to the same (first) row and return one name twice
    return df['name'].iloc[closest_indices].tolist()

In [13]:
# Sanity check: this is exactly the Akita row's attribute vector
# (size, lowShedding, easyToGroom, goodHealth, lowBarking, easyToTrain),
# so the nearest neighbour should be the Akita itself.
akita = [4.0, 1.0, 1.0, 4.0, 1.0, 2.0]

breedrec(akita)

'Akita'

In [14]:
# Arbitrary preference vector in the order
# (size, lowShedding, easyToGroom, goodHealth, lowBarking, easyToTrain)
test = [1, 2, 3, 4, 5, 4]
breedrec(test)

'Cardigan Welsh Corgi'

In [15]:
# Probe the two extremes: every attribute at 1 versus every attribute at 5
test2 = [1,1,1,1,1,1]
test3 = [5,5,5,5,5,5]
top3rec(test2), top3rec(test3)

(['Petit Basset Griffon Vendeen', 'Pekingese', 'Dachshund'],
 ['Belgian Malinois', 'Swedish Vallhund', 'Saluki'])

In [16]:
top3rec(akita)

['Akita', 'Korean Jindo Dog', 'Bloodhound']

In [17]:
top3rec(test)

['Cardigan Welsh Corgi', 'Shetland Sheepdog', 'Border Terrier']