In [380]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

In [381]:
df = pd.read_csv('/content/train.csv')

In [382]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# **Preprocessing**

In [383]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [384]:
df.interpolate(method = 'linear', inplace = True)

In [385]:
df.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [386]:
df.dropna(axis=1, inplace = True)

# **Standardization**

In [387]:
numerical_columns = df.select_dtypes(include=['float64','int64']).columns
data = df[numerical_columns]

In [388]:
scaler = StandardScaler()

#transform the numerical columns
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

display(data.head(5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[numerical_columns] = scaler.fit_transform(data[numerical_columns])


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,-1.730865,0.073375,-0.225902,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511514,0.575425,...,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777,0.347273
1,-1.728492,-0.872563,0.425052,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.573359,1.171992,...,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.48911,-0.614439,0.007288
2,-1.72612,0.073375,-0.095711,0.07348,0.651479,-0.5172,0.984752,0.830215,0.323322,0.092907,...,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777,0.536154
3,-1.723747,0.309859,-0.442886,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.573359,-0.499274,...,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655,-0.515281
4,-1.721374,0.073375,0.59864,0.375148,1.374795,-0.5172,0.951632,0.733308,1.363915,0.463568,...,0.780197,0.56376,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777,0.869843


# **Seperate X and y**

In [389]:
X = data.drop(['SalePrice', 'Id'], axis=1)
y = data['SalePrice']

In [390]:
display(X.head(5))

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0.073375,-0.225902,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511514,0.575425,-0.288653,...,0.351,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777
1,-0.872563,0.425052,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.573359,1.171992,-0.288653,...,-0.060731,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.48911,-0.614439
2,0.073375,-0.095711,0.07348,0.651479,-0.5172,0.984752,0.830215,0.323322,0.092907,-0.288653,...,0.631726,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777
3,0.309859,-0.442886,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.573359,-0.499274,-0.288653,...,0.790804,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655
4,0.073375,0.59864,0.375148,1.374795,-0.5172,0.951632,0.733308,1.363915,0.463568,-0.288653,...,1.698485,0.780197,0.56376,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777


In [391]:
display(y.head(5))

0    0.347273
1    0.007288
2    0.536154
3   -0.515281
4    0.869843
Name: SalePrice, dtype: float64

# **PCA**

In [392]:
pca = PCA()
pca.fit(X)

In [393]:
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[1.97397381e-01 8.91661140e-02 7.09805377e-02 5.61929692e-02
 4.10224020e-02 3.31581311e-02 3.20522100e-02 3.14663423e-02
 3.07499011e-02 3.01711053e-02 2.91520875e-02 2.83524710e-02
 2.80394315e-02 2.65429442e-02 2.55609560e-02 2.49725479e-02
 2.35217603e-02 2.25725994e-02 2.17282965e-02 2.04488338e-02
 1.87259169e-02 1.74690492e-02 1.63736088e-02 1.52454762e-02
 1.21761279e-02 1.08149224e-02 8.65257675e-03 7.80851551e-03
 7.10126333e-03 6.83248963e-03 5.39107952e-03 3.96332838e-03
 3.55336240e-03 2.64326147e-03 3.89297691e-32 3.84189792e-33]


In [394]:
cumulative_variance = np.cumsum(explained_variance)

In [395]:
threshold = 0.9
num_components = np.argmax(cumulative_variance >= threshold) + 1
print(num_components)

23


In [396]:
pca = PCA(n_components=num_components)
X = pca.fit_transform(X)

In [397]:
X

array([[ 1.47244755,  0.35814056, -1.7020052 , ...,  0.05116864,
         0.48288077, -0.03668309],
       [-0.04379998, -1.04800885,  1.25229425, ..., -1.24113874,
        -0.8181452 , -0.26441637],
       [ 1.65736232,  0.23840966, -1.47714646, ...,  0.57343227,
        -0.22931713, -0.07340453],
       ...,
       [ 0.48682518,  3.34856564,  1.60845552, ...,  0.4863213 ,
        -1.51152217, -1.74745335],
       [-2.8223386 , -1.98808453,  1.98010747, ...,  0.53869814,
         0.60923056,  0.40467224],
       [-0.86685454, -1.48619011,  1.59580724, ..., -1.76546933,
         0.64276472,  1.63473991]])

# **Linear Regression**

In [398]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [399]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [400]:
model = LinearRegression()
model.fit(X_train, y_train)

In [401]:
y_pred = model.predict(X_test)
print(y_pred)

[ 5.84594628e-01  7.52130892e-01  1.04786024e-01 -6.86433037e-01
 -6.33747798e-01  7.61007389e-01 -3.75260497e-02  5.18421003e-01
  3.78983432e-01  1.04662275e+00  2.77389193e+00  5.02078528e-01
  1.21064520e-01  8.93009674e-01  9.90308476e-01  9.58290924e-01
  8.92247839e-01 -1.40918236e+00 -3.82937126e-01 -6.12230728e-01
 -8.88696533e-01  1.60723434e+00 -6.85982122e-01 -1.54097432e+00
 -6.21495011e-01  7.50917183e-01  4.58965243e-01 -9.70735279e-02
 -4.56120572e-01  4.73746926e-01 -4.86085463e-01 -3.17439448e-01
  2.34598687e-01 -1.12089477e+00 -1.06438921e+00 -1.54024500e+00
  8.25088013e-01  2.21957063e-01  2.01150429e-01 -6.45615248e-01
  3.83246360e-01 -3.46577186e-02  7.42255040e-02 -5.88597431e-01
  1.62037445e+00  2.30068000e-01 -2.14467465e+00 -9.81508076e-01
  7.01484958e-01  6.78377722e-01 -4.57927887e-01  5.21008900e-01
 -1.93579802e+00  2.00154538e-01  4.14049224e+00 -4.99675697e-01
 -6.37578386e-01  1.99815016e+00  5.99879652e-01  1.66252018e+00
  1.42652539e+00 -3.55820

In [402]:
mse = mean_squared_error(y_test, y_pred)
print(mse)

0.2165012795905229


In [403]:
mae = mean_absolute_error(y_test, y_pred)
print(mae)

0.2887806651941394
