In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv('../data/possum.csv')

In [3]:
df.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [4]:
df.drop(columns=['case', 'Pop', 'site', 'sex'], inplace=True)

In [5]:
df

Unnamed: 0,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0
...,...,...,...,...,...,...,...,...,...,...
99,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0


In [6]:
df.isna().sum()

age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [7]:
df.dropna(inplace=True)

In [8]:
X = df.drop(columns=['age']).values
y = df['age'].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=44)

In [10]:
X_train[0]

array([91. , 55. , 84.5, 36. , 72.8, 51.4, 13.6, 27. , 30. ])

In [11]:
X_test[0]

array([94.5, 64.2, 91. , 39. , 66.5, 46.4, 14.4, 30.5, 33. ])

In [12]:
from math import sqrt
def euclidean_distance_dummy(vec1, vec2):
    distance = 0.0
    for i in range(len(vec1)):
        distance += (vec1[i] - vec2[i]) ** 2
    return sqrt(distance)

In [13]:
euclidean_distance_dummy(X_train[0], X_test[0])

15.320574401764446

In [14]:
def get_neighbors_dummy(train, test_row, num_neighbors):
    distances = []
    for train_id, train_row in enumerate(train):
        dist = euclidean_distance_dummy(train_row, test_row)
        distances.append((train_id, dist))
    distances.sort(key=lambda x: x[1])

    nearest_neighbor_ids = []
    for i in range(num_neighbors):
        nearest_neighbor_ids.append(distances[i][0])
    return nearest_neighbor_ids

In [15]:
X_train[:5]

array([[91. , 55. , 84.5, 36. , 72.8, 51.4, 13.6, 27. , 30. ],
       [93.1, 54.8, 90.5, 35.5, 73.2, 53.6, 14.2, 30. , 32. ],
       [88.7, 52. , 83. , 38. , 61.5, 45.9, 14.7, 26. , 34. ],
       [97.6, 61. , 93.5, 40. , 67.9, 44.3, 15.8, 28.5, 32.5],
       [91.6, 56.6, 88.5, 37.5, 64.5, 45.4, 14.9, 27. , 31. ]])

In [16]:
X_test[1]

array([90.6, 56. , 85.5, 38. , 65.6, 41.7, 17. , 27.5, 35. ])

In [17]:
get_neighbors_dummy(X_train[:5], X_test[1], 3)

[4, 2, 3]

In [18]:
def predict_dummy(X_train, X_test, y_train, num_neighbors = 3):
    y_predict = []
    for x_test in X_test:
        nearest_neighbor_ids = get_neighbors_dummy(X_train, x_test, num_neighbors)
        y_preds = y_train[nearest_neighbor_ids]
        y_preds = y_preds.mean()
        y_predict.append(y_preds)
        
    return y_predict 

In [19]:
y_predict = predict_dummy(X_train[:30], X_test[:5], y_train[:30], num_neighbors = 5)
y_predict

[np.float64(4.4),
 np.float64(3.4),
 np.float64(2.8),
 np.float64(3.4),
 np.float64(3.4)]

In [20]:
model = KNeighborsRegressor(n_neighbors=5)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [21]:
y_pred

array([4.4, 4. , 3.2, 5.8, 4. , 4. , 4.6, 2.4, 4.6, 3.8, 2. , 5. , 3. ,
       5.2, 5.8, 5. , 2.2, 2.8, 4.8, 1.6, 3. ])

In [24]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error

pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

MSE_train = root_mean_squared_error(y_train, pred_train)
RMSE_train = np.sqrt(MSE_train)
R2_train = r2_score(y_train, pred_train)
MAE_train = mean_absolute_error(y_train, pred_train)

MSE_test = root_mean_squared_error(y_test, pred_test)
RMSE_test = np.sqrt(MSE_test)
R2_test = r2_score(y_test, pred_test)
MAE_test = mean_absolute_error(y_test, pred_test)

print(f'MSE на обучении {MSE_train:.2f}')
print(f'MSE на тестовой {MSE_test:.2f}')

print(f'RMSE на обучении {RMSE_train:.2f}')
print(f'RMSE на тестовой {RMSE_test:.2f}')

print(f'R2 на обученит {R2_train:.2f}')
print(f'R2 на тестовой {R2_test:.2f}')

print(f'MAE на обучении {MAE_train:.2f}')
print(f'MAE на тестовой {MAE_test:.2f}')

MSE на обучении 1.60
MSE на тестовой 1.35
RMSE на обучении 1.26
RMSE на тестовой 1.16
R2 на обученит 0.33
R2 на тестовой 0.31
MAE на обучении 1.32
MAE на тестовой 1.15


In [25]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(random_state=44)
model.fit(X_train, y_train)

In [26]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

MSE_train = root_mean_squared_error(y_train, pred_train)
RMSE_train = np.sqrt(MSE_train)
R2_train = r2_score(y_train, pred_train)
MAE_train = mean_absolute_error(y_train, pred_train)

MSE_test = root_mean_squared_error(y_test, pred_test)
RMSE_test = np.sqrt(MSE_test)
R2_test = r2_score(y_test, pred_test)
MAE_test = mean_absolute_error(y_test, pred_test)

print(f'MSE на обучении {MSE_train:.2f}')
print(f'MSE на тестовой {MSE_test:.2f}')

print(f'RMSE на обучении {RMSE_train:.2f}')
print(f'RMSE на тестовой {RMSE_test:.2f}')

print(f'R2 на обученит {R2_train:.2f}')
print(f'R2 на тестовой {R2_test:.2f}')

print(f'MAE на обучении {MAE_train:.2f}')
print(f'MAE на тестовой {MAE_test:.2f}')

MSE на обучении 0.00
MSE на тестовой 2.01
RMSE на обучении 0.00
RMSE на тестовой 1.42
R2 на обученит 1.00
R2 на тестовой -0.55
MAE на обучении 0.00
MAE на тестовой 1.57


In [29]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

scores = cross_validate(DecisionTreeRegressor(random_state=44), X, y, cv=5,
                       scoring={'r2': make_scorer(r2_score),
                                'root_mean_squared_error': make_scorer(root_mean_squared_error)},
                       return_train_score=True)

print('R2 train mean = ', scores['train_r2'].mean())
print('R2 test mean = ', scores['test_r2'].mean())

print('MSE train mean = ', scores['train_root_mean_squared_error'].mean())
print('MSE test mean = ', scores['test_root_mean_squared_error'].mean())

R2 train mean =  1.0
R2 test mean =  -0.7644861990897471
MSE train mean =  0.0
MSE test mean =  2.403769539566122


In [30]:
?DecisionTreeRegressor

[0;31mInit signature:[0m
[0mDecisionTreeRegressor[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'squared_error'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msplitter[0m[0;34m=[0m[0;34m'best'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34

In [31]:
model.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 44,
 'splitter': 'best'}

In [32]:
model = DecisionTreeRegressor(random_state=44)
model.fit(X_train, y_train)

print(f'MSE_train =  {root_mean_squared_error(y_train, model.predict(X_train))}')
print(f'MSE_test =  {root_mean_squared_error(y_test, model.predict(X_test))}')


MSE_train =  0.0
MSE_test =  1.9023794624226837


In [33]:
model = DecisionTreeRegressor(random_state=1,
                              max_depth=4,
                              min_samples_leaf=1,
                              max_leaf_nodes=None)
model.fit(X_train, y_train)

print(f'MSE_train =  {root_mean_squared_error(y_train, model.predict(X_train))}')
print(f'MSE_test =  {root_mean_squared_error(y_test, model.predict(X_test))}')


MSE_train =  1.2308015308299958
MSE_test =  1.5078186348514533


In [34]:
model = DecisionTreeRegressor(random_state=1,
                              max_depth=4,
                              min_samples_leaf=2,
                              max_leaf_nodes=None)
model.fit(X_train, y_train)

print(f'MSE_train =  {root_mean_squared_error(y_train, model.predict(X_train))}')
print(f'MSE_test =  {root_mean_squared_error(y_test, model.predict(X_test))}')


MSE_train =  1.235869090273505
MSE_test =  1.5274294894132134


In [35]:
model = DecisionTreeRegressor(random_state=1,
                              max_depth=4,
                              min_samples_leaf=1,
                              max_leaf_nodes=3)
model.fit(X_train, y_train)

print(f'MSE_train =  {root_mean_squared_error(y_train, model.predict(X_train))}')
print(f'MSE_test =  {root_mean_squared_error(y_test, model.predict(X_test))}')


MSE_train =  1.5862281583377367
MSE_test =  1.1465340519412632


In [36]:
from sklearn.model_selection import GridSearchCV

In [40]:
model = DecisionTreeRegressor()

param_grid = {
    'max_depth': np.arange(1, 5),
    'min_samples_leaf': [1, 2, 3]
}

gridsearch = GridSearchCV(model, param_grid, refit=True, scoring=make_scorer(r2_score))

gridsearch.fit(X_train, y_train)
print(gridsearch.best_params_)

best_model = gridsearch.best_estimator_

print(f'MSE train = {root_mean_squared_error(y_train, best_model.predict(X_train))}')
print(f'MSE test = {root_mean_squared_error(y_test, best_model.predict(X_test))}')


{'max_depth': np.int64(1), 'min_samples_leaf': 1}
MSE train = 1.6937877250000348
MSE test = 1.1714632962469291


In [41]:
import pickle

filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))