# K-Fold Cross-Validation

## Importing required libraries

In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

## Load data 

In [2]:
# Load the cleaned house-price table; the first CSV column becomes the row index.
data_path = '../data/kc_house_price_data_cleaned.csv'
df = pd.read_csv(data_path, index_col=0)

In [3]:
# Preview the first ten rows of the loaded table.
df.head(n=10)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,1800,7503
5,1225000.0,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,4760,101930
6,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,2238,6819
7,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,1650,9711
8,229500.0,3,1.0,1780,7470,1.0,0,0,3,7,1050,730,1960,0,1780,8113
9,323000.0,3,2.5,1890,6560,2.0,0,0,3,7,1890,0,2003,0,2390,7570


## Machine Learning 

### Define Output and Inputs

#### Convert the inputs and the output to NumPy arrays so they can be indexed with the K-Fold split indices

In [4]:
# Separate the target from the predictors and convert both to NumPy arrays.
# y is reshaped into a single-column 2-D array of float64 values.
feature_frame = df.drop(columns='price')
X = np.array(feature_frame)
y = np.array(df['price'], dtype='float64').reshape(-1, 1)

In [5]:
# Show the shapes of X and y so their row counts can be compared.
tuple(arr.shape for arr in (X, y))

((21613, 15), (21613, 1))

### Split dataset for Cross Validation

In [6]:
# Number of folds used for the K-Fold splitter below.
N_SPLITS = 2
kf = KFold(n_splits=N_SPLITS)

#### `KFold.split` yields `n_splits` folds, each one a pair of train and test index arrays

In [7]:
# Materialise the split generator so individual folds can be indexed later.
indices_list = [fold for fold in kf.split(X)]
indices_list

[(array([10807, 10808, 10809, ..., 21610, 21611, 21612]),
  array([    0,     1,     2, ..., 10804, 10805, 10806])),
 (array([    0,     1,     2, ..., 10804, 10805, 10806]),
  array([10807, 10808, 10809, ..., 21610, 21611, 21612]))]

In [8]:
# Print the train/test index arrays of every fold, labelled with the fold number.
# enumerate() keeps the counter in step with the loop, so each fold gets its own
# number (Train1/Test1, Train2/Test2, ...).
# `train` and `test` stay bound to the last fold's indices after the loop ends.
for i, (train, test) in enumerate(kf.split(X), start=1):
    print('\nTrain' + str(i), train)
    print('Test' + str(i), test)


Train1 [10807 10808 10809 ... 21610 21611 21612]
Test1 [    0     1     2 ... 10804 10805 10806]

Train1 [    0     1     2 ... 10804 10805 10806]
Test1 [10807 10808 10809 ... 21610 21611 21612]


In [9]:
# Row counts: the full dataset, then the train and test index arrays left over
# from the last iteration of the fold loop.
tuple(len(part) for part in (X, train, test))

(21613, 10807, 10806)

In [10]:
# Inspect the first fold: a (train indices, test indices) pair.
first_fold = indices_list[0]
first_fold

(array([10807, 10808, 10809, ..., 21610, 21611, 21612]),
 array([    0,     1,     2, ..., 10804, 10805, 10806]))

In [11]:
# Unpack the (train, test) index pair of each fold, then slice X and y with them.
train_idx1, test_idx1 = indices_list[0]
train_idx2, test_idx2 = indices_list[1]

# Fold 1
X_train1, X_test1 = X[train_idx1], X[test_idx1]
y_train1, y_test1 = y[train_idx1], y[test_idx1]

# Fold 2
X_train2, X_test2 = X[train_idx2], X[test_idx2]
y_train2, y_test2 = y[train_idx2], y[test_idx2]

In [12]:
# Shapes of the first fold's training inputs and targets.
tuple(arr.shape for arr in (X_train1, y_train1))

((10806, 15), (10806, 1))

In [13]:
# Shapes of the first fold's held-out inputs and targets.
tuple(arr.shape for arr in (X_test1, y_test1))

((10807, 15), (10807, 1))

In [14]:
# Shapes of the second fold's training inputs and targets.
tuple(arr.shape for arr in (X_train2, y_train2))

((10807, 15), (10807, 1))

In [15]:
# Shapes of the second fold's held-out inputs and targets.
tuple(arr.shape for arr in (X_test2, y_test2))

((10806, 15), (10806, 1))

### Linear Regression Model

#### Manual split

##### First model

In [16]:
# Fit a linear regression on the first fold's training rows, then score it
# on that fold's held-out rows.
reg = LinearRegression()
reg.fit(X_train1, y_train1)
reg.score(X_test1, y_test1)

0.6455261843186866

##### Second model

In [17]:
# Fit a fresh linear regression on the second fold's training rows, then
# score it on that fold's held-out rows.
reg = LinearRegression()
reg.fit(X_train2, y_train2)
reg.score(X_test2, y_test2)

0.6442680760321458

#### Cross Validation Score

In [18]:
# Run the 2-fold evaluation in a single call; one score is returned per fold.
cross_val_score(estimator=reg, X=X, y=y, cv=2)

array([0.64552618, 0.64426808])