In [185]:
# Import the libraries we will be using
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy.spatial import distance
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.model_selection import cross_val_score
from collections import defaultdict

import sys
# Add the parent directory (relative to the current working directory) to the
# module search path so modules stored one level up can be imported.
sys.path.append("..")

# Plot styling: scale seaborn's fonts by 1.5.
sns.set(font_scale=1.5)
# Limit how many rows pandas shows before truncating a displayed frame.
pd.set_option('display.max_rows', 50)

In [186]:
# Load the scotch whisky table from the CSV next to this notebook and preview
# it: a NAME column followed by descriptor columns (0/1 values in the preview).
df = pd.read_csv(filepath_or_buffer='./scotch.csv')
df.head(n=5)

Unnamed: 0,NAME,wyne,yellow,v.pale,pale,p.gold,gold,o.gold,f.gold,bronze,...,smoke.1,sweet.1,spice.1,oil.1,salt.1,arome.1,ling,long,very,quick
0,Aberfeldy,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,Aberlour,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,Ardberg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,Ardmore,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Auchentoshan,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [187]:
# Feature matrix: every column after the first (NAME) column
X = df.iloc[:, 1:]
X.head()

Unnamed: 0,wyne,yellow,v.pale,pale,p.gold,gold,o.gold,f.gold,bronze,p.amber,...,smoke.1,sweet.1,spice.1,oil.1,salt.1,arome.1,ling,long,very,quick
0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [188]:
# Whisky names: the first column only, kept as a one-column DataFrame
y = df.iloc[:, :1]
y.head()

Unnamed: 0,NAME
0,Aberfeldy
1,Aberlour
2,Ardberg
3,Ardmore
4,Auchentoshan


In [189]:
# Inspect the column dtypes to confirm which columns are usable as numeric
# features for distance computations.
df.dtypes

NAME       object
wyne        int64
yellow      int64
v.pale      int64
pale        int64
            ...  
arome.1     int64
ling        int64
long        int64
very        int64
quick       int64
Length: 69, dtype: object

In [190]:

def distances(df, target_whisky, dist_1, dist_2):
    """Compute two distance measures from every whisky to one target whisky.

    The first column of ``df`` is used as the label column and every
    remaining column as a numeric feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the whisky names (looked up through
        ``df['NAME']``) followed by the feature columns.
    target_whisky : str
        Value of ``NAME`` identifying the reference row.
    dist_1, dist_2 : str
        Metric names passed to ``scipy.spatial.distance.cdist``. Each one is
        also used as the name of the corresponding output column.

    Returns
    -------
    pandas.DataFrame
        Columns ``NAME``, ``dist_1`` and ``dist_2`` with one row per row of
        ``df``, carrying the same index as ``df``.

    Raises
    ------
    ValueError
        If ``target_whisky`` matches no row, or more than one row.
    """
    features = df.iloc[:, 1:]

    # Use a plain boolean array so row selection is purely positional and
    # does not depend on what the index of ``df`` looks like.
    is_target = (df['NAME'] == target_whisky).to_numpy()
    whisky = features.loc[is_target]
    if len(whisky) != 1:
        raise ValueError(
            f"Expected exactly one whisky named {target_whisky!r}, "
            f"found {len(whisky)}"
        )

    # Build the result on df's own index. Distances come back from cdist as
    # positional arrays, so names and distances must share the same row order;
    # inserting the names by index alignment afterwards would scramble or
    # blank them whenever df does not have a default 0..n-1 index.
    y_new = pd.DataFrame({"NAME": df.iloc[:, 0]})
    for metric in (dist_1, dist_2):
        # With a single target row cdist returns an (n_rows, 1) array.
        y_new[metric] = distance.cdist(features, whisky, metric)[:, 0]

    return y_new

# Distances from every whisky to Bunnahabhain, then the 10 rows with the
# smallest Euclidean distance (the target itself is among them at distance 0).
whisky_dists = distances(df, "Bunnahabhain", 'euclidean', 'jaccard')
whisky_dists.sort_values('euclidean').head(n=10)


Unnamed: 0,NAME,euclidean,jaccard
19,Bunnahabhain,0.0,0.0
48,Glenglassaugh,3.0,0.642857
2,Ardberg,3.162278,0.666667
18,Bruichladdich,3.162278,0.666667
108,Tullibardine,3.316625,0.647059
21,Caperdonich,3.464102,0.75
32,Deanston,3.464102,0.75
16,Bowmore,3.464102,0.75
43,Glen Elgin,3.464102,0.705882
11,Benriach,3.464102,0.75


In [191]:
def distances2csv(df, of, target_whisky, dist_1, dist_2):
    """Add two distance-to-target columns to a copy of ``df`` and save it as CSV.

    The first column of ``df`` is treated as the label column and all
    remaining columns as numeric features. The caller's ``df`` is left
    unchanged, so the function can be re-run without earlier distance columns
    leaking into the feature set.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``NAME`` column first, followed by feature columns.
    of : str or path-like
        Destination passed to ``open`` for the CSV output.
    target_whisky : str
        Value of ``NAME`` identifying the reference row.
    dist_1, dist_2 : str
        Metric names passed to ``scipy.spatial.distance.cdist``. Each one is
        also used as the name of the added column.

    Returns
    -------
    pandas.DataFrame or int
        A new frame holding all columns of ``df`` plus the two distance
        columns. If the whisky cannot be found, a message is printed, nothing
        is written, and ``1`` is returned.

    Raises
    ------
    ValueError
        Raised by pandas when more than one row matches ``target_whisky``,
        because the distance array then has more than one column.
    """
    try:
        is_target = (df['NAME'] == target_whisky).to_numpy()
    except KeyError:
        # There is no NAME column to search in.
        print("Couldn't find whisky: ", target_whisky)
        return 1

    #X whisky data
    X = df.iloc[:, 1:]
    whisky = X.loc[is_target]
    if whisky.empty:
        # A boolean mask never raises for an unknown name, it just selects
        # nothing, so the miss has to be detected explicitly.
        print("Couldn't find whisky: ", target_whisky)
        return 1

    # Work on a copy: adding the columns to df itself would change the
    # caller's frame and turn the distances into "features" on the next call.
    new = df.copy()
    new[[dist_1]] = distance.cdist(X, whisky, dist_1)
    new[[dist_2]] = distance.cdist(X, whisky, dist_2)

    # The context manager closes the file even if to_csv fails. newline=''
    # keeps the text layer from rewriting the explicit '\n' line terminator.
    with open(of, 'w', newline='') as outfile:
        new.to_csv(outfile, lineterminator='\n', index=False)
    return new
    
# Write Bunnahabhain's distance table to whisky_dist.csv and preview the
# first 50 rows of the returned frame.
distances2csv(
    df,
    'whisky_dist.csv',
    "Bunnahabhain",
    'euclidean',
    'jaccard',
).head(50)

Unnamed: 0,NAME,wyne,yellow,v.pale,pale,p.gold,gold,o.gold,f.gold,bronze,...,spice.1,oil.1,salt.1,arome.1,ling,long,very,quick,euclidean,jaccard
0,Aberfeldy,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,4.242641,0.9
1,Aberlour,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,4.358899,0.863636
2,Ardberg,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,3.162278,0.666667
3,Ardmore,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4.123106,0.944444
4,Auchentoshan,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,3.741657,0.823529
5,Aultmore,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.464102,0.8
6,Balblair,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,3.464102,0.75
7,Balmenach,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,4.123106,0.85
8,Balvenie,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,3.872983,0.882353
9,Banff,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,3.605551,0.8125
