In [1]:
from random import random , randint
import math

In [2]:
def wineprice(rating , age):
    peak_age = rating - 50
    
    price = rating / 2
    
    if age>peak_age:
        price = price*(5-(age - peak_age))
    else:
        price = price*(5*((age+1)/peak_age))
    
    if price<0:
        price = 0
    
    return price

In [3]:
def wineset1():
    rows = []
    
    for i in range(300):
        rating = random()*50+50
        age = random()*50
        
        price = wineprice(rating , age)
        
        price *= (random()*0.4 + 0.8)
        
        rows.append({'input':(rating , age) , 'result':price})
    
    return rows

In [4]:
wineprice(95.0 , 3.0)

21.111111111111114

In [5]:
wineprice(95.0 , 8.0)

47.5

In [6]:
wineprice(99.0 , 1.0)

10.102040816326529

In [7]:
data = wineset1()

In [8]:
data[0]

{'input': (73.1002008373895, 24.34426781467684), 'result': 157.1125750504872}

In [9]:
data[1]

{'input': (94.46624134894853, 18.5425653230327), 'result': 96.58976054196349}

In [10]:
len(data)

300

In [11]:
def euclidean(v1 , v2):
    d = 0.0
    
    for i in range(len(v1)):
        d += (v1[i] - v2[i])**2
        
    return math.sqrt(d)

In [12]:
euclidean(data[0]['input'] , data[1]['input'])

22.139725358349224

In [13]:
def getdistances(data , vec1):
    distancelist = []
    
    for i in range(len(data)):
        vec2 = data[i]['input']
        distancelist.append((euclidean(vec1 , vec2) , i))
    
    distancelist.sort()
    
    return distancelist

In [14]:
def knnestimate(data , vec1 , k = 5):
    dlist = getdistances(data , vec1)
    avg = 0.0
    
    for i in range(k):
        idx = dlist[i][1]
        avg += data[idx]['result']
    
    avg = avg/k
    
    return avg

In [15]:
knnestimate(data , (95.0 , 3.0))

25.384177631532395

In [16]:
knnestimate(data , (99.0 , 3.0))

36.01228587009456

In [17]:
knnestimate(data , (99.0 , 5.0))

36.01228587009456

In [18]:
wineprice(99.0 , 5.0)

30.306122448979593

In [19]:
def inverseweight(dist , num = 1.0 , const = 0.1):
    return num/(dist+const)

In [20]:
def subtractweight(dist , const = 1.0):
    if dist>const:
        return 0
    else:
        return const-dist

In [21]:
def gaussion(dist , sigma = 10.0):
    return math.e**(-dist**2/(2*sigma**2))

In [22]:
def weightedknn(data , vec1 , k =5 , weightf = gaussion):
    dlist = getdistances(data  , vec1)
    avg = 0.0
    totalweight = 0.0
    
    for i in range(k):
        dist = dlist[i][0]
        idx = dlist[i][1]
        weight = weightf(dist)
        avg += weight*data[idx]['result']
        totalweight += weight
    
    avg = avg/totalweight
    
    return avg

In [23]:
weightedknn(data , (99.0 , 5.0))

35.984854999321286

In [24]:
wineprice(99.0 , 5.0)

30.306122448979593

In [25]:
def dividedata(data , test = 0.05):
    trainset = []
    testset = []
    
    for row in data:
        if random()<test:
            testset.append(row)
        else:
            trainset.append(row)
    
    return trainset , testset

In [26]:
def testalgorithm(algf , trainset , testset):
    error = 0.0
    
    for row in testset:
        guess = algf(trainset , row['input'])
        error += (row['result'] - guess)**2
    
    return error/len(testset)

In [27]:
def crossvalidate(algf , data , trials = 100 , test = 0.05):
    error = 0.0
    
    for i in range(trials):
        trainset , testset = dividedata(data , test)
        error += testalgorithm(algf , trainset , testset)
    
    return error/trials

In [29]:
crossvalidate(knnestimate , data)

527.0645956121233

In [31]:
def knn3(d , v):
    return knnestimate(d , v ,k=3)

In [32]:
crossvalidate(knn3 , data)

489.9218771229742

In [33]:
crossvalidate(weightedknn , data)

499.597475041258

In [34]:
def knninverse(d , v):
    return weightedknn(d , v , weightf=inverseweight)


In [35]:
crossvalidate(knninverse , data)

484.423765170699

In [36]:
def wineset2():
    rows = []
    
    for i in range(300):
        rating = random()*50+50
        age = random()*50
        
        aisle = float(randint(1,20))
        bottlesize = [375.0 , 750.0 , 1500.0 , 3000.0][randint(0,3)]
        
        price = wineprice(rating , age)
        
        price *= (bottlesize/750)
        price *= (random()*0.9+0.2)
        
        rows.append({'input':{rating , age , aisle , bottlesize},
                     'result':price})
    
    return rows

In [37]:
data = wineset2()

In [39]:
crossvalidate(weightedknn , data)

TypeError: 'set' object does not support indexing

In [40]:
def rescale(data , scale):
    scaleddata = []
    
    for row in data:
        scaled = [scale[i]*row['input'][i] for i in range(len(scale))]
        scaleddata.append({'input':scaled , 'result':row['result']})
        
    return scaleddata

In [41]:
#缩放后进行测试
sdata = rescale(data , [10 , 10 , 0 , 0.5])

TypeError: 'set' object does not support indexing

In [None]:
crossvalidate(knn3 , sdata)

In [42]:
crossvalidate(weightedknn , sdata)

NameError: name 'sdata' is not defined

In [43]:
#使用优化算法找到应该进行缩放的特征
#遗传 爬山等

In [44]:
def wineset3():
    rows = wineset1()
    
    for row in rows:
        if random()<0.5:
            row['result']*=0.5
    
    return rows

In [45]:
data = wineset3()
wineprice(99.0 , 20.0)

106.07142857142857

In [46]:
weightedknn(data , [99.0 , 20.0])

87.68977938483502

In [47]:
def probguess(data , vec1 , low , high , k = 5 , weightf = gaussion):
    dlist = getdistances(data , vec1)
    nweight = 0.0
    tweight = 0.0
    
    for i in range(k):
        dist = dlist[i][0]
        idx = dlist[i][1]
        weight = weightf(dist)
        v = data[idx]['result']
        
        if v>=low and v<=high:
            nweight += weight
        
        tweight += weight
    
    if tweight == 0:
        return 0
    
    return nweight/tweight

In [48]:
probguess(data , [99,20],40,80)

0.40352254428754586

In [49]:
probguess(data , [99,20] , 30 , 120)

0.8017397586937203