* **Amaç: Gözlemlerin birbirine olan benzerlikleri üzerinden tahmin yapılır.**
* Sınıflandırma ya da regresyon problemlerinde kullanılabilen bir algoritmadır.
* Parametrik olmayan bir öğrenme türüdür.
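* Büyük veri setlerinde çok tercih edilmez; çünkü her tahmin için tüm eğitim gözlemlerine olan uzaklıkların hesaplanması gerekir ve bu maliyet veri büyüdükçe artar.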

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# Load the Hitters dataset from the CSV file and preview its first rows as plain text.
df = pd.read_csv("../Hitters.csv")  # relative path: read from the parent folder
print(df.head()) 

   AtBat  Hits  HmRun  Runs  RBI  Walks  Years  CAtBat  CHits  CHmRun  CRuns  \
0    293    66      1    30   29     14      1     293     66       1     30   
1    315    81      7    24   38     39     14    3449    835      69    321   
2    479   130     18    66   72     76      3    1624    457      63    224   
3    496   141     20    65   78     37     11    5628   1575     225    828   
4    321    87     10    39   42     30      2     396    101      12     48   

   CRBI  CWalks League Division  PutOuts  Assists  Errors  Salary NewLeague  
0    29      14      A        E      446       33      20     NaN         A  
1   414     375      N        W      632       43      10   475.0         N  
2   266     263      A        W      880       82      14   480.0         A  
3   838     354      N        E      200       11       3   500.0         N  
4    46      33      N        E      805       40       4    91.5         N  


In [41]:
# Dimensions of `df` as (rows, columns).
# NOTE(review): this cell's execution count (41) is later than the dropna cell's (16),
# so the recorded (263, 20) presumably describes the frame after NaN rows were
# removed — confirm with Restart & Run All.
df.shape

(263, 20)

In [16]:
# Build the modelling matrices: complete rows only, numeric features cast to float,
# categorical features expanded into one indicator column per level, Salary as target.
df = df.dropna()

categorical_cols = ["League", "Division", "NewLeague"]
dms = pd.get_dummies(df[categorical_cols])

y = df["Salary"]
X_ = df.drop(columns=["Salary", *categorical_cols]).astype("float64")
X = pd.concat([X_, dms], axis=1)

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33
)

In [17]:
# Peek at the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,...,CWalks,PutOuts,Assists,Errors,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
252,486.0,145.0,11.0,51.0,76.0,40.0,11.0,3967.0,1102.0,67.0,...,284.0,88.0,204.0,16.0,False,True,True,False,True,False
119,327.0,85.0,3.0,30.0,44.0,20.0,8.0,2140.0,568.0,16.0,...,93.0,91.0,185.0,12.0,True,False,True,False,True,False
56,244.0,58.0,9.0,28.0,25.0,35.0,4.0,1335.0,333.0,49.0,...,194.0,142.0,14.0,2.0,False,True,False,True,False,True
272,512.0,117.0,29.0,54.0,88.0,43.0,6.0,1750.0,412.0,100.0,...,155.0,1236.0,98.0,18.0,True,False,False,True,True,False
233,540.0,135.0,30.0,82.0,88.0,55.0,1.0,540.0,135.0,30.0,...,55.0,157.0,6.0,14.0,True,False,False,True,True,False


## Model

In [18]:
# Baseline k-NN regressor: constructed with no arguments, then fitted on the training split.
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train);  # trailing ';' keeps the cell output empty, as before

In [19]:
# Display the fitted estimator.
knn_model

In [20]:
# Neighbour count used by the fitted model; none was passed at construction,
# so this shows the library default.
knn_model.n_neighbors

5

In [21]:
# List every attribute and method name available on the fitted estimator
# (exploratory inspection; the output is long).
dir(knn_model)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_algorithm_metric',
 '_check_feature_names',
 '_check_n_features',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_fit',
 '_fit_X',
 '_fit_method',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_kneighbors_reduce_func',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_tree',
 '_validate_data',
 '_validate

In [25]:
# Predict the target for the held-out test features with `knn_model`.
y_pred = knn_model.predict(X_test)

In [26]:
# Test-set RMSE: square root of the mean squared error between actual and predicted values.
np.sqrt(mean_squared_error(y_test, y_pred))

276.89726436080286

## Model Tuning

In [40]:
# Test-set RMSE for every neighbour count k = 1..20; RMSE[i] belongs to k = i + 1.
# NOTE(review): these scores are measured on the test split, so they are suitable
# for inspection only — choosing k from them would leak test information.
# NOTE: the loop rebinds `knn_model` and `y_pred`; once it finishes they refer to
# the k = 20 fit and its predictions.
RMSE = []
for k in range(1, 21):  # iterate over k directly instead of rebinding the loop variable
    knn_model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    RMSE.append(rmse)

In [33]:
# Hyperparameter search with GridSearchCV

In [34]:
# Search grid for the tuning step: candidate neighbour counts 1 through 29.
knn_params = dict(n_neighbors=np.arange(1, 30))

In [36]:
# Fresh, unfitted estimator to hand to the grid search.
knn = KNeighborsRegressor()

In [85]:
# Exhaustive search over `knn_params` with 19-fold cross-validation on the training split.
# NOTE(review): no `scoring` argument is given, so candidates are presumably ranked by
# the estimator's default score rather than by the RMSE reported elsewhere — confirm
# this is intended.
knn_cv_model = GridSearchCV(estimator=knn, param_grid=knn_params, cv=19)
knn_cv_model.fit(X_train, y_train);  # trailing ';' keeps the cell output empty, as before

In [86]:
# Parameter combination selected by the cross-validated search.
knn_cv_model.best_params_

{'n_neighbors': 7}

In [87]:
# Final model: a k-NN regressor refitted on the training split with the neighbour
# count chosen by the grid search.
best_k = knn_cv_model.best_params_["n_neighbors"]
knn_tuned = KNeighborsRegressor(n_neighbors=best_k).fit(X_train, y_train)

In [88]:
# Predict the target for the held-out test features with the tuned model.
y_pred = knn_tuned.predict(X_test)

In [89]:
# Test-set RMSE of the tuned model.
# NOTE(review): the recorded value (≈279.87) is slightly higher than the RMSE
# recorded for the default-k model (≈276.90), so tuning did not help on this split.
np.sqrt(mean_squared_error(y_test, y_pred))

279.87291232310173