In [1]:
import pandas as pd 
import numpy as np 

**Step - 1: Load the data**


In [2]:
df=pd.read_csv(r"C:\Users\devav\Downloads\diamonds.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


**Step - 2: Identify input and output variables**

In [3]:
x=df.drop("price",axis=1)
x.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [4]:
y=df["price"]
y.head()

0    326
1    326
2    327
3    334
4    335
Name: price, dtype: int64

**Step - 3: Split the data - Test and Train (recommended 75:25 split)**

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)

In [7]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((40455, 9), (13485, 9), (40455,), (13485,))

**Step - 4: Data Preprocessing on X_train (You can use sklearn for data preprocessing)**

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [9]:
num_cols = ["carat", "depth", "table", "x", "y", "z"]
cat_cols = ["cut", "color", "clarity"] 
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

In [10]:
x_train_= preprocessor.fit_transform(x_train)
x_train_

array([[-1.15666465,  2.20783668,  0.24241403, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.08691672,  0.03851691, -0.65492279, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.52954737, -0.4513295 ,  0.24241403, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.98804345, -1.01115395,  0.24241403, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.21338262,  0.73829748,  0.69108244, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.71924623, -0.9411759 ,  0.24241403, ...,  0.        ,
         0.        ,  0.        ]])

**Step - 5: Data Preprocessing on X_test**

In [11]:
x_test_ = preprocessor.transform(x_test)
x_test_

array([[-1.1777423 ,  0.24845108, -0.65492279, ...,  0.        ,
         1.        ,  0.        ],
       [-0.46110219, -1.22108813, -0.20625438, ...,  0.        ,
         0.        ,  1.        ],
       [-0.8404999 ,  0.24845108, -1.1035912 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.86678978,  0.45838525, -1.55225961, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.44523677,  0.31842913,  0.69108244, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.50846972,  0.73829748,  0.69108244, ...,  0.        ,
         0.        ,  0.        ]])

**Step - 6: Build the model and predict on X_test (SCRATCH IMPLEMENTATION)**

In [12]:
def eu_dis(a,b): 
    return (np.sum(a-b)**2)**0.5

In [34]:
class knn_own:
    def __init__(self, k):
        self.k = k
    def fit(self, x, y):
        self.x_train = x
        self.y_train = y
    def predict(self, x_test):
        pred= []
        for test_point in x_test:
            dis = []
            for train_point in self.x_train:
                dis.append(eu_dis(test_point, train_point))
            k_id = np.argpartition(dis, self.k)[:self.k]
            k_val = []
            for idx in k_id:
                k_val.append(self.y_train[idx])
            pred.append(np.mean(k_val))
        return np.array(pred)


In [53]:
knn_scratch = knn_own(k=5)
knn_scratch.fit(x_train_[:1000], y_train[:1000].values)

In [54]:
y_pred_scratch = knn_scratch.predict(x_test_[:1000])

**Step - 7: Evaluate your model**

In [55]:
from sklearn.metrics import mean_squared_error, r2_score
r2_scratch = r2_score(y_test[:1000], y_pred_scratch[:1000])
print("Scratch KNN R2:", r2_scratch)

Scratch KNN R2: 0.758052986063712


**Step - 8: Train a model using sklearn KNN Algorithm and compare the results with your scratch implementation**

In [57]:
from sklearn.neighbors import KNeighborsRegressor

knn_sklearn = KNeighborsRegressor(n_neighbors=3)
knn_sklearn.fit(x_train_, y_train)
y_pred_sklearn = knn_sklearn.predict(x_test_)


In [58]:
r2_sklearn = r2_score(y_test, y_pred_sklearn)

print("Sklearn KNN R2:", r2_sklearn)


Sklearn KNN R2: 0.960019218726518


In [60]:
print("Scratch KNN R2:", r2_scratch)
print("Sklearn KNN R2:", r2_sklearn)


Scratch KNN R2: 0.758052986063712
Sklearn KNN R2: 0.960019218726518
