In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
import sklearn.metrics as mt

In [2]:
# Load the 'diamonds' dataset through seaborn and preview its first rows.
data = sns.load_dataset(name='diamonds')
data.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# Keep only the float64/int64 columns (drops cut, color and clarity) and
# discard any row that still contains a missing value, then preview the result.
data = (
    data
    .select_dtypes(include=['float64', 'int64'])
    .dropna()
)
data.head()


Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.2,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75


In [6]:
# Separate the regression target (price) from the predictor columns.
y = data['price']
X = data.drop(columns='price')



In [7]:
# Hold-out split: test_size=0.2 reserves a fifth of the rows for testing,
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [13]:
def score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and evaluate it on both splits.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.
        It is fitted in place by this function.
    X_train, X_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        Target values matching ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(r2_train, r2_test, rmse_train, rmse_test)``. The first two are
        whatever ``model.score`` returns for each split (R^2 for scikit-learn
        regressors); the last two are the square roots of the mean squared
        error of the model's predictions on each split.
    """
    model.fit(X_train, y_train)

    # Predict once per split and reuse the result for the RMSE values instead
    # of calling model.predict repeatedly on the same data.
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    r2_train = model.score(X_train, y_train)
    r2_test = model.score(X_test, y_test)
    rmse_train = np.sqrt(mt.mean_squared_error(y_train, y_pred_train))
    rmse_test = np.sqrt(mt.mean_squared_error(y_test, y_pred_test))
    return r2_train, r2_test, rmse_train, rmse_test


In [15]:
# Fit a plain linear regression on the hold-out split and report its
# train/test metrics, one per line.
model = LinearRegression()
r2_train, r2_test, rmse_train, rmse_test = score(
    model, X_train, X_test, y_train, y_test
)
print(
    f"R2 Train: {r2_train}",
    f"R2 Test: {r2_test}",
    f"RMSE Train: {rmse_train}",
    f"RMSE Test: {rmse_test}",
    sep="\n",
)


R2 Train: 0.8592729084866024
R2 Test: 0.8589542625888937
RMSE Train: 1496.7760457375086
RMSE Test: 1497.3906985438382


In [18]:
# K-fold cross-validation of the linear regression model.
# KFold is already imported in the notebook's import cell, so it is not
# re-imported here.

lr_cv = LinearRegression()
k = 5
r_2 = []    # test-fold score of every iteration
rmse = []   # test-fold RMSE of every iteration
cv = KFold(n_splits=k, shuffle=True, random_state=42)

# Fold-specific names are used inside the loop so that the hold-out split
# (X_train, X_test, y_train, y_test) and its metrics (r2_train, r2_test,
# rmse_train, rmse_test) defined in earlier cells are not overwritten by the
# values of the last fold.
for it, (train_index, test_index) in enumerate(cv.split(X), start=1):
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]
    fold_r2_train, fold_r2_test, fold_rmse_train, fold_rmse_test = score(
        lr_cv, X_fold_train, X_fold_test, y_fold_train, y_fold_test
    )
    r_2.append(fold_r2_test)
    rmse.append(fold_rmse_test)
    print(f"Iteration {it}")
    print(f"R2 Train: {fold_r2_train}")
    print(f"R2 Test: {fold_r2_test}")
    print(f"RMSE Train: {fold_rmse_train}")
    print(f"RMSE Test: {fold_rmse_test}")

# Average the per-fold test metrics.
print(f"Mean R2: {np.mean(r_2)}")
print(f"Mean RMSE: {np.mean(rmse)}")


Iteration 1
R2 Train: 0.8592729084866024
R2 Test: 0.8589542625888938
RMSE Train: 1496.7760457375086
RMSE Test: 1497.390698543838
Iteration 2
R2 Train: 0.8589737560254378
R2 Test: 0.8601416526102384
RMSE Train: 1499.8197350202572
RMSE Test: 1485.3074790298463
Iteration 3
R2 Train: 0.8609526696282784
R2 Test: 0.8520393263963629
RMSE Train: 1488.0607361720429
RMSE Test: 1532.6353236199832
Iteration 4
R2 Train: 0.8587754391431481
R2 Test: 0.8597284095916333
RMSE Train: 1494.9152623107125
RMSE Test: 1510.8822157789753
Iteration 5
R2 Train: 0.8582612135831421
R2 Test: 0.8630367094043155
RMSE Train: 1503.888062239284
RMSE Test: 1468.7192692972567
Mean R2: 0.8587800721182889
Mean RMSE: 1498.9869972539798
