## [1] KNN분류모델 구현

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

K = 5  # maximum number of nearest neighbours kept per query point

# Shuffle the iris frame reproducibly and keep only the rows whose target is 0 or 1.
df = (
    load_iris(as_frame=True)['frame']
    .sample(frac=1, random_state=1234)
    .loc[lambda frame: frame['target'] <= 1]
)

# First 75 shuffled rows are the training set, the remainder is the test set.
train = df.iloc[:75]
test = df.iloc[75:]

train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
91,6.1,3.0,4.6,1.4,1
63,6.1,2.9,4.7,1.4,1
6,4.6,3.4,1.4,0.3,0
59,5.2,2.7,3.9,1.4,1
29,4.7,3.2,1.6,0.2,0
...,...,...,...,...,...
89,5.5,2.5,4.0,1.3,1
31,5.4,3.4,1.5,0.4,0
95,5.7,3.0,4.2,1.2,1
96,5.7,2.9,4.2,1.3,1


In [None]:
def update_dist(dist_list, elem, k=None):
    """Offer a candidate neighbour to a distance-sorted nearest-neighbour list.

    Args:
        dist_list: Current nearest neighbours as ``[label, distance]`` entries,
            sorted by ascending distance. It is never mutated.
        elem: Candidate ``[label, distance]`` entry.
        k: Maximum number of neighbours to keep. Defaults to the module-level
            constant ``K`` when omitted.

    Returns:
        A new list with ``elem`` inserted at its sorted position (after any
        entries with an equal distance) and truncated to ``k`` entries. If the
        candidate does not make the cut, ``dist_list`` itself is returned.

    Raises:
        ValueError: If ``k`` is smaller than 1, or if the candidate distance is
            NaN and therefore cannot be ordered.
    """
    if k is None:
        k = K
    if k < 1:
        raise ValueError("k must be a positive integer")

    distance = elem[1]
    # NaN is the only value that is not equal to itself; it compares False
    # against everything, so it has no valid position in a sorted list.
    if distance != distance:
        raise ValueError("candidate distance is NaN and cannot be ranked")

    # Index of the first stored neighbour that is strictly farther away than the
    # candidate. Ties therefore keep the earlier-seen neighbour in front.
    insert_at = next(
        (i for i, entry in enumerate(dist_list) if entry[1] > distance),
        len(dist_list),
    )

    # The candidate would land beyond the k-th slot: nothing changes.
    if insert_at >= k:
        return dist_list

    # Insert, then drop whatever falls off the end of the k-sized window.
    return (dist_list[:insert_at] + [elem] + dist_list[insert_at:])[:k]

# Example: each entry is the [label, distance] of a nearest neighbour,
# listed in ascending order of distance.
dist_list = [
    [0, 1],
    [4, 3],
    [1, 6],
    [5, 7],
    [4, 9],
]
# Offer a new candidate with label 5 at distance 8.
update_dist(dist_list, [5, 8])

[[0, 1], [4, 3], [1, 6], [5, 7], [5, 8]]

In [3]:
# Classify every test row by majority vote among its nearest training rows.
# Features are all columns except the last; the last column is the class label.
# Explicit .iloc is used because plain `series[-1]` on a string-labelled Series
# is deprecated positional indexing in recent pandas.
train_features = train.iloc[:, :-1].to_numpy()
train_labels = train.iloc[:, -1].to_numpy()
test_features = test.iloc[:, :-1].to_numpy()

pred = []
for query in test_features:
    # Euclidean distance from this test point to every training point at once.
    distances = np.sqrt(((train_features - query) ** 2).sum(axis=1))

    # Feed the candidates in training order so ties are resolved the same way
    # update_dist resolves them (earlier candidate stays in front).
    dist_list = []
    for label, distance in zip(train_labels, distances):
        dist_list = update_dist(dist_list, [label, distance])

    votes_0 = sum(1 for neighbour in dist_list if neighbour[0] == 0)
    votes_1 = sum(1 for neighbour in dist_list if neighbour[0] == 1)
    # A tie goes to class 0.
    pred.append(0 if votes_0 >= votes_1 else 1)

pred = np.array(pred)

print(f"테스트 데이터셋 기준 정확도:{(test['target'] == pred).mean()*100:.2f}%")

테스트 데이터셋 기준 정확도:100.00%


## [2] 회귀모델(OLS, 의사결정나무, KNN) 학습·예측 소요시간 비교

In [2]:
import itertools
import time
import pandas as pd

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# Grid of synthetic dataset sizes to benchmark: every sample count is paired
# with every feature count.
n_samples_list = [2_000, 10_000, 20_000]
n_features_list = [200, 1_000, 2_000]

In [3]:
# Accumulators filled once per benchmark run: the dataset dimensions used,
# plus fit ("train") and predict ("test") durations for each model.
n_sampless, n_featuress = [], []
OLS_trains, DT_trains, KNN_trains = [], [], []
OLS_tests, DT_tests, KNN_tests = [], [], []

In [17]:
pip install threadpoolctl==3.1.0

Collecting threadpoolctl==3.1.0
  Obtaining dependency information for threadpoolctl==3.1.0 from https://files.pythonhosted.org/packages/61/cf/6e354304bcb9c6413c4e02a747b600061c21d38ba51e7e544ac7bc66aecc/threadpoolctl-3.1.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.1.0-py3-none-any.whl.metadata (9.2 kB)
Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl
  Attempting uninstall: threadpoolctl
    Found existing installation: threadpoolctl 2.2.0
    Uninstalling threadpoolctl-2.2.0:
      Successfully uninstalled threadpoolctl-2.2.0
Successfully installed threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
def _timed_fit_predict(model, X_train, y_train, X_test):
    """Fit ``model`` and predict on ``X_test``, timing each step separately.

    Uses ``time.perf_counter()``, the clock intended for measuring short
    intervals, instead of the wall-clock ``time.time()``. Construction of the
    estimator happens in the caller and is not part of the timed region.

    Returns:
        Tuple ``(fitted_model, predictions, fit_seconds, predict_seconds)``.
    """
    started = time.perf_counter()
    fitted = model.fit(X_train, y_train)
    fit_done = time.perf_counter()
    predictions = fitted.predict(X_test)
    predict_done = time.perf_counter()
    return fitted, predictions, fit_done - started, predict_done - fit_done


# Benchmark each model on every (n_samples, n_features) combination.
# NOTE: results are appended to the accumulator lists created in an earlier
# cell; re-run that cell before re-running this one to avoid duplicated rows.
for n_samples, n_features in itertools.product(n_samples_list,
                                               n_features_list):
    X, y = make_regression(n_samples=n_samples,
                           n_features=n_features,
                           random_state=1234)
    # Training size is fixed at 1000 rows, so only the test set grows with
    # n_samples.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=1000,
                                                        random_state=1234)

    reg_OLS, y_pred, fit_seconds, predict_seconds = _timed_fit_predict(
        LinearRegression(), X_train, y_train, X_test)
    OLS_trains.append(fit_seconds)
    OLS_tests.append(predict_seconds)

    reg_DT, y_pred, fit_seconds, predict_seconds = _timed_fit_predict(
        DecisionTreeRegressor(max_depth=4, random_state=1234),
        X_train, y_train, X_test)
    DT_trains.append(fit_seconds)
    DT_tests.append(predict_seconds)

    reg_KNN, y_pred, fit_seconds, predict_seconds = _timed_fit_predict(
        KNeighborsRegressor(), X_train, y_train, X_test)
    KNN_trains.append(fit_seconds)
    KNN_tests.append(predict_seconds)

    n_sampless.append(n_samples)
    n_featuress.append(n_features)

In [5]:
# Gather the recorded dataset sizes and timings into one table,
# one row per benchmark run.
df = pd.DataFrame(
    {
        'n_samples': n_sampless,
        'n_features': n_featuress,
        'OLS_train': OLS_trains,
        'DT_train': DT_trains,
        'KNN_train': KNN_trains,
        'OLS_test': OLS_tests,
        'DT_test': DT_tests,
        'KNN_test': KNN_tests,
    }
)

df.round(2)

Unnamed: 0,n_samples,n_features,OLS_train,DT_train,KNN_train,OLS_test,DT_test,KNN_test
0,2000,200,0.04,0.14,0.0,0.0,0.0,0.29
1,2000,1000,0.18,0.71,0.0,0.0,0.0,0.02
2,2000,2000,0.29,1.44,0.0,0.0,0.0,0.05
3,10000,200,0.01,0.14,0.0,0.0,0.0,0.03
4,10000,1000,0.17,0.72,0.0,0.01,0.02,0.08
5,10000,2000,0.25,1.45,0.0,0.01,0.03,0.19
6,20000,200,0.02,0.14,0.0,0.0,0.01,0.05
7,20000,1000,0.17,0.75,0.0,0.01,0.03,0.18
8,20000,2000,0.25,1.52,0.0,0.03,0.06,0.35
