# Sprint 2

## Introduction to Sprint - Machine Learning Scratch

### [Problem 1] Scratch of train_test_split

In [1]:
import numpy as np
import pandas as pd
from math import ceil

In [2]:
def scratch_train_test_split(X, y, train_size=0.8):
    """Split the validation data.
    Parameters
    ----------
    X : ndarray
      Training data (n_samples, n_features)
    y : ndarray
      Correct value (n_samples,)
    train_size : float
      Specify what percentage to use as train (0 < train_size < 1)
    Returns
    -------
    X_train : ndarray
      Training data (n_samples, n_features)
    X_test : ndarray
      Validation data (n_samples, n_features)
    y_train : ndarray
      Correct value of training data (n_samples,)
    y_test : ndarray
      Correct value of verification data (n_samples,)
    """
    # TODO
    if isinstance(y, range) or isinstance(y, list):
        y = np.array(y)
    assert X.shape[0] > 0 and y.shape[0] > 0, 'At least one row required in X and y'
    assert X.shape[0] == y.shape[0], '2 array must have same n_samples'
    assert train_size > 0 and train_size < 1, 'train_size must be in [0, 1]'
    
    n_samples = X.shape[0]
    n_train = ceil(n_samples * train_size)
    
    train_idx = np.random.choice(X.shape[0], size=n_train, replace=False)
    test_idx = [i for i in range(n_samples) if i not in train_idx]
    
    X_train = X[train_idx, :]
    y_train = y[train_idx]
    X_test = X[test_idx, :]
    y_test = y[test_idx]
    return X_train, X_test, y_train, y_test

In [3]:
X, y = np.arange(20).reshape((10, 2)), np.arange(10)
print(X)
print(y)

[[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]]
[0 1 2 3 4 5 6 7 8 9]


In [4]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.6)
print('X_train\n', X_train)
print('y_train\n', y_train)
print('X_test\n', X_test)
print('y_test\n', y_test)

X_train
 [[10 11]
 [ 8  9]
 [14 15]
 [12 13]
 [ 6  7]
 [ 4  5]]
y_train
 [5 4 7 6 3 2]
X_test
 [[ 0  1]
 [ 2  3]
 [16 17]
 [18 19]]
y_test
 [0 1 8 9]


In [5]:
print(X.shape, y.shape)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(10, 2) (10,)
(6, 2) (6,) (4, 2) (4,)


In [6]:
# Simple data set 1 creation code
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, n_samples // 2)
f1 = np.random.multivariate_normal(f1, cov, n_samples // 2)
X1 = np.concatenate([f0, f1])
y1 = np.concatenate([
    np.full(n_samples // 2, 1),
    np.full(n_samples // 2, -1)
])

In [7]:
X1_train, X1_test, y1_train, y1_test = scratch_train_test_split(X1, y1)
print(X1.shape, y1.shape)
print(X1_train.shape, y1_train.shape, X1_test.shape, y1_test.shape)

(500, 2) (500,)
(400, 2) (400,) (100, 2) (100,)


In [8]:
# Simple data set 2 creation code
X2 = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [9]:
X2_train, X2_test, y2_train, y2_test = scratch_train_test_split(X2, y2)
print(X2.shape, y2.shape)
print(X2_train.shape, y2_train.shape, X2_test.shape, y2_test.shape)

(40, 2) (40,)
(32, 2) (32,) (8, 2) (8,)


### [Problem 2] Creating a code to solve the classification problem

In [10]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.datasets import load_iris

In [11]:
# Classification on iris dataset
X = pd.DataFrame(data=load_iris().data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
y = pd.DataFrame(load_iris().target, columns=[('Species')])
df = pd.concat([X, y], axis=1)
df = df[df.Species != 0]
X = df.drop(columns=['Species']).values
y = df['Species'].values

In [12]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.8)

In [13]:
from sklearn.preprocessing import StandardScaler

def scale(X_train, X_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

In [14]:
X_train_scaled, X_test_scaled = scale(X_train, X_test)

In [15]:
sgd = SGDClassifier(loss='log')
sgd.fit(X_train_scaled, y_train)
sgd.score(X_test_scaled, y_test)

0.9

In [16]:
sgd_predict = sgd.predict(X_test_scaled)
result_sgd = pd.DataFrame([sgd_predict, y_test], index=['Predict', 'Actual value'])
result_sgd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Predict,1,2,1,1,1,2,1,1,1,1,1,2,2,2,2,2,2,2,2,2
Actual value,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2


In [17]:
svc = SVC()
svc.fit(X_train_scaled, y_train)
svc.score(X_test_scaled, y_test)

0.9

In [18]:
svc_predict = svc.predict(X_test_scaled)
result_svc = pd.DataFrame([svc_predict, y_test], index=['Predict', 'Actual value'])
result_svc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Predict,1,2,1,1,1,2,1,1,1,1,1,2,2,2,2,2,2,2,2,2
Actual value,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2


In [19]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train_scaled, y_train)
dtc.score(X_test_scaled, y_test)

0.85

In [20]:
dtc_predict = dtc.predict(X_test_scaled)
result_dtc = pd.DataFrame([dtc_predict, y_test], index=['Predict', 'Actual value'])
result_dtc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Predict,1,2,1,1,1,2,1,1,1,1,1,2,2,2,2,2,2,2,1,2
Actual value,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2


In [21]:
# Classification on simple data set 1 
# (scratch_train_test_split on the dataset has already been done
# in problem 1)
X1_train_scaled, X1_test_scaled = scale(X1_train, X1_test)

In [22]:
sgd = SGDClassifier(loss='log')
sgd.fit(X1_train_scaled, y1_train)
sgd.score(X1_test_scaled, y1_test)

1.0

In [23]:
pd.set_option('display.max_columns', None)
sgd_predict = sgd.predict(X1_test_scaled)
result_sgd = pd.DataFrame([sgd_predict, y1_test], index=['Predict', 'Actual value'])
result_sgd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
Predict,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Actual value,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [24]:
svc = SVC()
svc.fit(X1_train_scaled, y1_train)
svc.score(X1_test_scaled, y1_test)

1.0

In [25]:
svc_predict = svc.predict(X1_test_scaled)
result_svc = pd.DataFrame([svc_predict, y1_test], index=['Predict', 'Actual value'])
result_svc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
Predict,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Actual value,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [26]:
dtc = DecisionTreeClassifier()
dtc.fit(X1_train_scaled, y1_train)
dtc.score(X1_test_scaled, y1_test)

1.0

In [27]:
dtc_predict = dtc.predict(X1_test_scaled)
result_dtc = pd.DataFrame([dtc_predict, y1_test], index=['Predict', 'Actual value'])
result_dtc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
Predict,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Actual value,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [28]:
# Classification on simple data set 2
# (scratch_train_test_split on the dataset has already been done
# in problem 1)
X2_train_scaled, X2_test_scaled = scale(X2_train, X2_test)

In [29]:
sgd = SGDClassifier(loss='log')
sgd.fit(X2_train_scaled, y2_train)
sgd.score(X2_test_scaled, y2_test)

0.75

In [30]:
sgd_predict = sgd.predict(X2_test_scaled)
result_sgd = pd.DataFrame([sgd_predict, y2_test], index=['Predict', 'Actual value'])
result_sgd

Unnamed: 0,0,1,2,3,4,5,6,7
Predict,1,0,1,0,1,1,1,1
Actual value,0,0,0,0,1,1,1,1


In [31]:
svc = SVC()
svc.fit(X2_train_scaled, y2_train)
svc.score(X2_test_scaled, y2_test)

0.625

In [32]:
svc_predict = svc.predict(X2_test_scaled)
result_svc = pd.DataFrame([svc_predict, y2_test], index=['Predict', 'Actual value'])
result_svc

Unnamed: 0,0,1,2,3,4,5,6,7
Predict,0,1,0,1,0,1,1,1
Actual value,0,0,0,0,1,1,1,1


In [33]:
dtc = DecisionTreeClassifier()
dtc.fit(X2_train_scaled, y2_train)
dtc.score(X2_test_scaled, y2_test)

0.375

In [34]:
dtc_predict = dtc.predict(X2_test_scaled)
result_dtc = pd.DataFrame([dtc_predict, y2_test], index=['Predict', 'Actual value'])
result_dtc

Unnamed: 0,0,1,2,3,4,5,6,7
Predict,0,1,1,1,1,0,0,1
Actual value,0,0,0,0,1,1,1,1


In [35]:
df_house = pd.read_csv('train.csv')
df_house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [36]:
X_house = df_house[['GrLivArea', 'YearBuilt']].values
y_house = df_house['SalePrice'].values

In [37]:
X_house_train, X_house_test, y_house_train, y_house_test = scratch_train_test_split(X_house, y_house, train_size=0.8)
X_house_train_scaled, X_house_test_scaled = scale(X_house_train, X_house_test)

In [38]:
from sklearn.linear_model import SGDRegressor
pd.set_option("precision", 1)
sgd_house = SGDRegressor()
sgd_house.fit(X_house_train_scaled, y_house_train)
# Get coefficient of determination R^2
sgd_house.score(X_house_test_scaled, y_house_test)

0.6875579107432963

In [39]:
sgd_house_predict = sgd_house.predict(X_house_test_scaled)
result_sgdh = pd.DataFrame([sgd_house_predict, y_house_test], index=['Predict', 'Actual value'])
result_sgdh

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291
Predict,233907.8,105473.7,216653.1,131719.6,39866.3,142362.7,156687.9,130815.4,133784.6,275498.9,109504.3,222879.3,149401.1,204905.2,80627.8,235086.2,111816.7,193220.9,224259.9,143977.0,234689.7,218070.5,62221.5,140890.1,194471.8,44865.6,87816.1,125764.8,204171.2,232433.4,109831.8,126083.0,174798.0,133779.9,195953.7,148405.9,241452.1,238321.1,127560.2,133752.3,144517.8,257431.1,324669.3,132881.3,235925.0,222192.2,251020.9,185731.9,188722.1,70317.6,119794.2,132248.5,131178.7,229064.1,147182.5,284673.1,235613.2,158913.8,183088.3,148464.7,235653.7,250227.1,144987.9,298856.6,224143.2,321057.2,130737.2,226233.8,256132.3,251596.7,332493.2,253250.5,59884.2,177032.2,151212.2,248833.6,253992.8,192140.1,226340.5,149601.6,108394.1,80233.2,102469.6,123437.6,198151.1,155508.7,120600.0,87994.6,259875.0,134506.6,315720.5,142521.0,96612.3,75177.0,96422.8,211191.3,125143.9,206730.2,106165.5,143688.1,229916.8,223210.4,225203.7,206930.7,132908.0,213052.9,160889.6,195460.7,193713.9,145406.4,105619.0,152826.5,269695.8,239494.8,236706.8,291601.2,260966.0,216842.6,109360.8,109686.5,277563.9,106688.8,157378.6,189543.6,222850.8,207936.9,254427.0,131566.9,249366.2,286204.6,160623.7,164559.6,119530.3,150889.4,130610.3,96286.6,226988.1,141126.5,137180.6,118161.6,120576.1,134743.0,139955.6,203077.6,42543.0,126106.9,319162.4,73322.7,183139.8,118455.9,215019.5,216444.3,135355.6,167812.1,300239.2,282427.0,132734.1,224883.5,218240.7,125577.1,235464.2,212003.4,150320.0,134275.8,143633.0,211983.2,212539.7,126236.6,99399.3,223916.8,114982.7,227276.0,205619.9,245182.0,173808.3,133318.2,167463.4,124615.0,203102.5,229368.6,120595.4,218762.3,152241.4,161520.6,126572.3,142361.8,285902.1,116073.7,155506.8,133595.1,112729.2,177308.1,187028.8,150510.4,214865.0,118846.8,214635.9,59611.0,118886.4,221920.9,260774.6,188007.4,134794.5,64124.6,127066.3,133199.5,163696.9,199623.7,68651.9,208368.4,127457.2,202182.6,140056.7,115575.1,232123.4,229660.1,163931.4,193688.2,99035.0,284163.6,156066.9,143266.0,234882.8,192182.4,24609.4,228803.8,220423.4,100001.7,112574.6,200539.8,115768.2,31610.2,163722.6,193198.9,406967.6,143231.0,332229.3,178044.1,228101.1,238534.5,163619.6,251612.4,160043.3,149594.3,225354.5,189026.6,218640.9,159555.9,215705.7,234363.2,121101.3,325730.0,261749.6,139322.7,142879.7,234788.1,156066.9,181658.9,235622.4,203199.9,239063.4,162720.0,71584.2,245170.0,126745.2,170472.1,208095.2,155163.9,285077.0,216362.5,116829.7,139800.1,260557.5,147772.2,113286.6,218148.8,92779.4,111500.3,248742.5,161402.9,224828.4,298667.2,145365.9,152241.4,151094.5,80679.3,248174.1,119525.6,139792.8,209655.2,153311.2,140593.9
Actual value,208500.0,118000.0,279500.0,149000.0,68500.0,153000.0,160000.0,130250.0,141000.0,239686.0,114500.0,385000.0,130000.0,202500.0,80000.0,225000.0,135750.0,153500.0,245000.0,98600.0,204750.0,214000.0,83000.0,128950.0,198900.0,100000.0,115000.0,115000.0,180000.0,217000.0,100000.0,136000.0,153900.0,128000.0,174000.0,143000.0,171000.0,260000.0,115000.0,122000.0,235000.0,254900.0,412500.0,127500.0,183500.0,239000.0,211000.0,104000.0,143900.0,98000.0,125000.0,94500.0,89500.0,235000.0,145000.0,230000.0,207500.0,151000.0,175500.0,148000.0,192000.0,239000.0,157000.0,267000.0,375000.0,342643.0,126175.0,284000.0,377426.0,202900.0,437154.0,318061.0,105900.0,177500.0,134000.0,280000.0,198500.0,190000.0,172400.0,125000.0,106500.0,100000.0,109000.0,123000.0,164990.0,115000.0,145000.0,115000.0,228000.0,142000.0,315000.0,135000.0,79900.0,119000.0,110000.0,190000.0,120000.0,188000.0,98000.0,124000.0,250000.0,187000.0,251000.0,208900.0,156000.0,172785.0,184000.0,289000.0,208300.0,164900.0,129900.0,134000.0,402861.0,158000.0,211000.0,315750.0,200624.0,179665.0,129500.0,108000.0,268000.0,120000.0,164500.0,131500.0,185900.0,194201.0,275000.0,141000.0,314813.0,305000.0,139900.0,153000.0,139400.0,93000.0,143250.0,84500.0,191000.0,88000.0,149000.0,103600.0,128500.0,143000.0,159434.0,194700.0,89000.0,123500.0,361919.0,102776.0,130500.0,124500.0,222000.0,236500.0,131400.0,175000.0,299800.0,236000.0,156000.0,215200.0,196500.0,142500.0,144000.0,175900.0,165500.0,139000.0,146800.0,189000.0,227680.0,135500.0,116000.0,224900.0,137000.0,183000.0,189000.0,237000.0,167000.0,100000.0,144000.0,130500.0,171000.0,213000.0,133500.0,187000.0,131500.0,164000.0,127000.0,147000.0,250000.0,189950.0,152000.0,132500.0,66500.0,157000.0,145000.0,165000.0,180000.0,125500.0,174000.0,35311.0,117500.0,242000.0,253000.0,143000.0,127500.0,37900.0,135000.0,99500.0,167500.0,165000.0,85500.0,126000.0,121600.0,136905.0,88000.0,119200.0,293077.0,315500.0,97000.0,115000.0,84900.0,335000.0,126000.0,83500.0,137500.0,197000.0,60000.0,181000.0,188000.0,129900.0,112000.0,163900.0,115000.0,94000.0,149900.0,230000.0,625000.0,163000.0,285000.0,186700.0,195000.0,189000.0,158000.0,219210.0,148000.0,140000.0,112000.0,143000.0,137900.0,130000.0,265900.0,244000.0,128900.0,381000.0,281000.0,180500.0,154000.0,130000.0,138000.0,179200.0,206900.0,208900.0,275000.0,156500.0,82500.0,147000.0,130500.0,167900.0,215000.0,122000.0,200000.0,170000.0,110000.0,149900.0,315000.0,104900.0,105000.0,136000.0,105000.0,133000.0,257000.0,207000.0,175900.0,340000.0,127500.0,136500.0,144000.0,119000.0,192140.0,120500.0,149700.0,197000.0,149300.0,157900.0


### [Problem 3] Creating a code to solve the regression problem

In [40]:
from sklearn.linear_model import LinearRegression

X_house_train, X_house_test, y_house_train, y_house_test = scratch_train_test_split(X_house, y_house, train_size=0.8)
lr = LinearRegression()
lr.fit(X_house_train_scaled, y_house_train)
lr.score(X_house_test_scaled, y_house_test)

0.004352933440801898

In [41]:
lr_predict = lr.predict(X_house_test_scaled)
result_lr = pd.DataFrame([lr_predict, y_house_test], index=['Predict', 'Actual value'])
result_lr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291
Predict,179767.7,181545.7,179312.8,180110.3,181216.1,181237.3,180808.0,179796.8,180801.3,180431.7,181909.1,180907.7,181289.9,179125.2,180790.1,182777.5,180630.0,179417.3,179374.4,180844.4,180139.1,179754.2,181991.7,180914.7,178779.2,183808.3,180725.4,183066.0,179472.4,182735.1,181615.3,180738.2,179422.2,179784.3,181135.8,180556.2,180785.5,179718.5,182077.8,179843.7,180912.8,180083.5,181216.7,181923.4,179740.1,180537.8,180938.2,184442.1,181498.9,181821.9,180099.4,179999.1,180041.9,179510.9,181134.8,180817.5,179794.9,180125.7,180272.4,180018.7,181350.9,180387.2,181339.1,181103.7,181885.0,180979.6,180991.9,179645.2,180002.9,180109.9,181222.0,179837.3,181475.9,179338.3,180062.6,180305.1,180088.4,180716.0,179826.4,180036.9,180395.9,181023.1,181437.9,179858.5,179077.2,182524.0,180830.1,181984.5,180780.5,180274.5,181373.0,181718.4,182540.8,181839.7,182537.8,181618.4,181321.4,180949.0,182932.6,179822.8,179524.5,180195.1,179808.2,179695.9,180428.4,179255.3,183267.9,181786.0,178767.1,181465.4,180411.5,179669.6,179980.2,181711.3,180111.5,181107.5,182951.4,179315.8,179753.4,182749.5,181122.7,180368.7,180759.2,181990.6,179531.4,179173.6,179975.7,182081.9,179775.2,180602.6,181170.0,181531.9,180872.8,181373.5,180032.8,179544.6,180076.0,180200.7,182530.5,182585.7,180470.8,179560.4,182514.9,179155.9,181617.8,181097.5,180949.4,183365.4,180572.3,179898.6,180363.5,179967.5,180706.7,181105.3,182441.8,180721.8,180186.3,179264.7,180414.9,179773.0,181347.9,180076.0,179928.7,183441.2,179941.6,179298.1,179127.5,180202.2,182704.9,179907.3,180800.2,179661.8,179435.7,179947.7,181141.2,179956.4,179185.5,181432.6,180950.9,180293.4,179813.1,181141.0,178463.9,182320.9,180506.7,179801.7,182691.5,182971.1,179652.6,180798.3,181541.9,178744.5,181651.3,181367.4,179463.7,180084.3,179340.4,180215.3,180204.5,182148.6,180077.0,181188.4,179860.3,182620.3,181292.2,179595.6,184210.0,179399.8,181914.9,180915.3,181477.9,180876.3,180243.4,181168.6,179500.0,179460.6,180624.6,181697.8,181024.2,180271.0,179063.4,180115.2,179723.5,178981.9,181391.3,179865.7,180031.0,180381.6,180642.1,179893.0,180752.9,184194.9,181279.3,181929.3,183009.2,181011.9,181995.4,181268.6,179734.8,180080.9,180679.4,179870.9,180981.2,180874.2,181126.7,182281.4,182634.6,184084.1,179297.7,181868.7,180778.3,185301.4,180032.8,180590.6,180946.5,179722.0,179063.4,179651.4,181828.9,179098.0,179969.7,181382.9,180944.8,179768.1,179313.1,180310.3,179654.7,186346.9,182618.5,181581.3,180111.9,180179.5,180133.4,183357.6,182807.0,184720.4,182778.7,179667.9,179884.9,183395.7,179383.5,181100.7,179909.5,178463.9,181137.5,181090.0,179875.8,179855.8,181016.9,180816.2,178421.2,180730.6
Actual value,200000.0,279500.0,149000.0,230000.0,129900.0,134800.0,179900.0,153000.0,160000.0,196500.0,438780.0,158000.0,202500.0,219500.0,225000.0,244000.0,129500.0,144900.0,126500.0,85000.0,128950.0,320000.0,180000.0,136000.0,155000.0,109500.0,412500.0,325624.0,172500.0,177000.0,127000.0,184000.0,235000.0,110000.0,143900.0,141000.0,210000.0,200000.0,106000.0,148000.0,403000.0,113000.0,120000.0,137000.0,180000.0,173000.0,231500.0,130000.0,120500.0,201000.0,244600.0,235000.0,167000.0,152000.0,149900.0,360000.0,270000.0,301000.0,87000.0,324000.0,145250.0,202500.0,202900.0,85000.0,198500.0,147000.0,159000.0,165000.0,132000.0,162000.0,219500.0,125000.0,215000.0,164990.0,258000.0,168000.0,115000.0,280000.0,215000.0,315000.0,139000.0,135000.0,119000.0,90350.0,118000.0,162900.0,110000.0,161000.0,62383.0,250000.0,190000.0,297000.0,132500.0,91300.0,184000.0,124500.0,96500.0,158000.0,159000.0,176000.0,132000.0,213250.0,179665.0,229000.0,263000.0,140000.0,170000.0,185000.0,268000.0,325000.0,135960.0,145000.0,131500.0,253293.0,152000.0,118500.0,261500.0,137500.0,183200.0,153000.0,124000.0,139400.0,93000.0,274000.0,226000.0,155000.0,230500.0,193500.0,168000.0,173500.0,165000.0,159434.0,335000.0,176000.0,312500.0,55000.0,275000.0,108000.0,93500.0,132000.0,179000.0,127500.0,270000.0,107000.0,162000.0,175900.0,128000.0,107900.0,160200.0,485000.0,200000.0,109900.0,118000.0,181000.0,224900.0,137000.0,139500.0,109500.0,189000.0,151000.0,139950.0,128000.0,153900.0,133500.0,131500.0,147000.0,109500.0,116000.0,157000.0,135500.0,268000.0,122900.0,118858.0,145000.0,135000.0,201000.0,145900.0,242000.0,239799.0,214000.0,124900.0,127500.0,145000.0,124000.0,185000.0,135000.0,178400.0,336000.0,126000.0,395192.0,195000.0,197000.0,121600.0,136500.0,91000.0,206000.0,100000.0,187500.0,176000.0,293077.0,197000.0,230000.0,315500.0,556581.0,145000.0,84900.0,255000.0,185850.0,248000.0,335000.0,194000.0,192000.0,197000.0,92900.0,106000.0,274725.0,188000.0,205000.0,134500.0,318000.0,140000.0,117500.0,102000.0,196000.0,80000.0,124000.0,185000.0,235000.0,186700.0,165000.0,148000.0,200000.0,290000.0,91500.0,134000.0,143000.0,145000.0,147000.0,367294.0,142000.0,195000.0,142500.0,224900.0,248328.0,244000.0,294000.0,127500.0,181000.0,181000.0,183900.0,122000.0,378500.0,139000.0,162000.0,68400.0,227000.0,169000.0,115000.0,225000.0,138000.0,111000.0,256000.0,176500.0,200000.0,155835.0,171000.0,134900.0,149900.0,315000.0,189000.0,144152.0,144000.0,274300.0,250000.0,91000.0,58500.0,237500.0,112000.0,105000.0,250000.0,131000.0,163000.0,275000.0,90000.0,207000.0,271000.0,64500.0,160000.0,174000.0,197000.0,179600.0,92000.0
