In [5]:
import pandas as pd

# Load the dataset; 'train.csv' is resolved relative to the working directory.
data = pd.read_csv('train.csv')

# Preview the first 5 rows.
print(data.head())

# Summarise the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
data.info()
print(data.describe())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [6]:
# Tally the nulls in every column, order the tallies from largest to smallest,
# and report only the columns that actually contain missing entries.
missing_values = (
    data.isnull()
    .sum()
    .sort_values(ascending=False)
)
print(missing_values.loc[missing_values > 0])

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageQual        81
GarageFinish      81
GarageType        81
GarageYrBlt       81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtCond          37
BsmtQual          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64


In [7]:
# Drop columns with many missing values (optional; they could be imputed instead).
# errors='ignore' keeps this cell re-runnable: `data` is rebound below, so on a
# second execution these columns are already gone and a plain drop would raise
# KeyError.
data = data.drop(
    columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'],
    errors='ignore',
)

# Fill missing LotFrontage values with the column median.
data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].median())

# Fill missing values in these garage columns with the string "None"
# (a placeholder label, not Python's None).
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    data[col] = data[col].fillna('None')

# Fill missing Electrical values with the mode; .mode() returns a Series, and
# [0] takes its first entry.
data['Electrical'] = data['Electrical'].fillna(data['Electrical'].mode()[0])

# Re-check: the 10 columns with the most remaining missing values.
print(data.isnull().sum().sort_values(ascending=False).head(10))


MasVnrType      872
GarageYrBlt      81
BsmtExposure     38
BsmtFinType2     38
BsmtFinType1     37
BsmtCond         37
BsmtQual         37
MasVnrArea        8
Id                0
MSSubClass        0
dtype: int64


In [8]:
# Basement descriptor columns: label missing entries with the string 'None'.
basement_cols = [
    'BsmtExposure',
    'BsmtFinType2',
    'BsmtFinType1',
    'BsmtCond',
    'BsmtQual',
]
for col in basement_cols:
    data[col] = data[col].fillna('None')

# Masonry veneer: missing type becomes the label 'None', missing area becomes 0
# (treated here as "no masonry veneer").
data['MasVnrType'] = data['MasVnrType'].fillna('None')
data['MasVnrArea'] = data['MasVnrArea'].fillna(0)

# Missing garage build year falls back to the row's YearBuilt value
# (a simplifying assumption).
data['GarageYrBlt'] = data['GarageYrBlt'].fillna(data['YearBuilt'])

# Re-check: the 10 columns with the most remaining missing values.
remaining_nulls = data.isnull().sum().sort_values(ascending=False)
print(remaining_nulls.head(10))

Id             0
MSSubClass     0
MSZoning       0
LotFrontage    0
LotArea        0
Street         0
LotShape       0
LandContour    0
Utilities      0
LotConfig      0
dtype: int64


In [9]:
# Remove the 'Id' column so it is not used as a feature.
# errors='ignore' makes the cell safe to re-run: `data` is rebound here, so on a
# second execution 'Id' is already gone and a plain drop would raise KeyError.
data = data.drop(columns='Id', errors='ignore')

In [10]:
# One-hot encode the frame with pandas' default settings, replacing each
# non-numeric column with one indicator column per category.
data = pd.get_dummies(data)

In [11]:
# Separate the prediction target from the predictor columns.
y = data['SalePrice']               # Target
X = data.drop(columns='SalePrice')  # Features: every column except the target

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; fit() returns the estimator
# itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so the notebook
# displays it.
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [14]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Generate predictions for the held-out rows.
y_pred = model.predict(X_test)

# RMSE: square root of the mean squared error between actual and predicted values.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# R^2 on the same held-out split, as reported by the estimator's score().
print("R^2 Score:", model.score(X_test, y_test))

Root Mean Squared Error (RMSE): 29525.962005885798
R^2 Score: 0.8863434997645669
