In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Load the possum dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/possum.csv')

In [4]:
# Preview the first rows to check which columns were loaded and what they contain.
df.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [7]:
# Remove the columns that are not used as model inputs ('Pop' and 'sex' hold
# string labels in the preview; 'case' and 'site' look like identifiers/codes --
# TODO confirm). Reassign the result instead of using inplace=True so the
# statement does not mutate the frame object an earlier cell already displayed.
df = df.drop(columns=['case', 'Pop', 'site', 'sex'])

In [8]:
# Display the frame after the column drop to confirm which columns remain.
df

Unnamed: 0,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0
...,...,...,...,...,...,...,...,...,...,...
99,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0


In [9]:
# Count missing values per column before deciding how to handle them.
df.isna().sum()

age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [10]:
# Discard every row that has at least one missing value. Reassign the result
# instead of using inplace=True so the frame is not mutated behind earlier cells.
df = df.dropna()

In [11]:
# 'age' is the regression target; every other remaining column is a feature.
# Both are extracted as plain NumPy arrays.
y = df['age'].values
X = df.drop('age', axis=1).values

In [13]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=44,
)

In [14]:
# Inspect the first training feature vector.
X_train[0]

array([91. , 55. , 84.5, 36. , 72.8, 51.4, 13.6, 27. , 30. ])

In [15]:
# Inspect the first test feature vector.
X_test[0]

array([94.5, 64.2, 91. , 39. , 66.5, 46.4, 14.4, 30.5, 33. ])

In [16]:
from math import sqrt
def euclidean_distance_dummy(vec1, vec2):
    """Return the Euclidean (L2) distance between two indexable vectors.

    The squared differences are accumulated component by component over the
    positions of ``vec1``; ``vec2`` is read at the same positions.

    Parameters:
        vec1: Sequence of numbers; its length decides how many components are used.
        vec2: Sequence of numbers indexed at every position of ``vec1``.

    Returns:
        float: Square root of the summed squared component differences
        (``0.0`` when ``vec1`` is empty).

    Raises:
        IndexError: If ``vec2`` has fewer components than ``vec1``.
    """
    squared_sum = 0.0
    for position, left in enumerate(vec1):
        delta = left - vec2[position]
        squared_sum += delta ** 2
    return sqrt(squared_sum)

In [17]:
# Sanity-check the distance helper on one training row and one test row.
euclidean_distance_dummy(X_train[0], X_test[0])

15.320574401764446

In [18]:
def get_neighbors_dummy(train, test_row, num_neighbors):
    """Return the positions of the training rows closest to ``test_row``.

    Every row of ``train`` is scored with ``euclidean_distance_dummy`` and the
    rows are ranked by ascending distance (ties keep their original order
    because the sort is stable).

    Parameters:
        train: Iterable of training feature vectors.
        test_row: Feature vector to find neighbors for.
        num_neighbors: How many of the closest rows to return.

    Returns:
        list[int]: Positions within ``train`` of the ``num_neighbors`` nearest
        rows, closest first.

    Raises:
        IndexError: If ``num_neighbors`` exceeds the number of rows in ``train``.
    """
    ranked = sorted(
        (
            (row_position, euclidean_distance_dummy(row, test_row))
            for row_position, row in enumerate(train)
        ),
        key=lambda pair: pair[1],
    )
    # Index explicitly (rather than slicing) so asking for more neighbors than
    # there are rows still fails loudly.
    return [ranked[rank][0] for rank in range(num_neighbors)]

In [19]:
# Show the first five training rows (the same slice is passed to the neighbor lookup below).
X_train[:5]

array([[91. , 55. , 84.5, 36. , 72.8, 51.4, 13.6, 27. , 30. ],
       [93.1, 54.8, 90.5, 35.5, 73.2, 53.6, 14.2, 30. , 32. ],
       [88.7, 52. , 83. , 38. , 61.5, 45.9, 14.7, 26. , 34. ],
       [97.6, 61. , 93.5, 40. , 67.9, 44.3, 15.8, 28.5, 32.5],
       [91.6, 56.6, 88.5, 37.5, 64.5, 45.4, 14.9, 27. , 31. ]])

In [20]:
# Show the test row used as the query in the neighbor lookup below.
X_test[1]

array([90.6, 56. , 85.5, 38. , 65.6, 41.7, 17. , 27.5, 35. ])

In [21]:
# Find the 3 rows among the first five training rows that are closest to X_test[1];
# the returned values are positions within that five-row slice.
get_neighbors_dummy(X_train[:5], X_test[1], 3)

[4, 2, 3]

In [22]:
def predict_dummy(X_train, X_test, y_train, num_neighbors=3):
    """Predict a target for each test row with a hand-written k-nearest-neighbors mean.

    For every row in ``X_test`` the ``num_neighbors`` closest rows of
    ``X_train`` are looked up with ``get_neighbors_dummy`` and the prediction is
    the mean of their targets.

    Parameters:
        X_train: Iterable of training feature vectors.
        X_test: Iterable of feature vectors to predict for.
        y_train: Targets aligned positionally with ``X_train``; any array-like
            (NumPy array, list, pandas Series) is accepted.
        num_neighbors: Number of nearest training rows averaged per prediction.

    Returns:
        list: One mean target value per row of ``X_test``, in the same order.
    """
    # get_neighbors_dummy returns *positions* in X_train. Coerce the targets to
    # an ndarray so the lookup is always positional: a plain list does not
    # support list-of-index selection, and a pandas Series would be indexed by
    # label rather than position.
    targets = np.asarray(y_train)

    y_predict = []
    for x_test in X_test:
        nearest_neighbor_ids = get_neighbors_dummy(X_train, x_test, num_neighbors)
        y_predict.append(targets[nearest_neighbor_ids].mean())
    return y_predict

In [23]:
# Run the hand-written KNN on a small subset: the first 30 training rows and
# the first 5 test rows, averaging the 5 nearest neighbors.
# NOTE(review): because of the slicing these values are not directly comparable
# to a model fitted on the full training set -- confirm this is intentional.
y_predict = predict_dummy(
    X_train[:30],
    X_test[:5],
    y_train[:30],
    num_neighbors=5,
)
y_predict

[np.float64(4.4),
 np.float64(3.4),
 np.float64(2.8),
 np.float64(3.4),
 np.float64(3.4)]

In [26]:
# Fit scikit-learn's KNN regressor (k=5) on the full training split and
# predict the target for every test row. fit() returns the estimator itself,
# so construction and fitting can be chained.
model = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [27]:
# Show the scikit-learn model's predicted ages for the test rows.
y_pred

array([4.4, 4. , 3.2, 5.8, 4. , 4. , 4.6, 2.4, 4.6, 3.8, 2. , 5. , 3. ,
       5.2, 5.8, 5. , 2.2, 2.8, 4.8, 1.6, 3. ])