Step 1: Importing Necessary Libraries and Loading the Dataset


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Provenance note: `url` is the Kaggle competition data page. It is kept for
# reference only -- nothing below reads from it; the data comes from the local
# file 'train.csv' (presumably downloaded from that page -- TODO confirm).
url = "https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data"
df = pd.read_csv('train.csv')

# Show a preview of the leading rows, followed by the full list of column names
for preview in (df.head(), df.columns):
    print(preview)


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

Step 2: Data Preprocessing and Feature Selection


In [2]:
# Choose the predictor columns and the column to be predicted
feature_columns = ['GrLivArea', 'BedroomAbvGr', 'FullBath']
X = df[feature_columns]
y = df['SalePrice']

# Report how many values are missing in each chosen predictor column
missing_per_feature = X.isnull().sum()
print(missing_per_feature)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle (and therefore the split) the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


GrLivArea       0
BedroomAbvGr    0
FullBath        0
dtype: int64


Step 3: Building and Training the Linear Regression Model


In [3]:
# Create the linear regression estimator and fit it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Predict on both splits with the fitted model (training split first, then
# testing split) so the two sets of predictions can be scored side by side
y_train_pred, y_test_pred = (model.predict(split) for split in (X_train, X_test))


Step 4: Evaluating the Model Performance


In [4]:
# Report RMSE and R-squared for each split. The testing header carries a
# leading newline so a blank line separates the two reports.
for header, actual, predicted in (
    ("Training Set:", y_train, y_train_pred),
    ("\nTesting Set:", y_test, y_test_pred),
):
    print(header)
    # RMSE is the square root of the mean squared error
    print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(actual, predicted)))
    print("R-squared Score:", r2_score(actual, predicted))


Training Set:
Root Mean Squared Error (RMSE): 50924.807621834334
R-squared Score: 0.5652084622981051

Testing Set:
Root Mean Squared Error (RMSE): 52975.71771338122
R-squared Score: 0.6341189942328371


Step 5: Making Predictions


In [5]:
# Example: price a single unseen house described by the same three columns the
# model was trained on (GrLivArea=2000, BedroomAbvGr=4, FullBath=2)
example_house = {'GrLivArea': 2000, 'BedroomAbvGr': 4, 'FullBath': 2}
new_data = pd.DataFrame([example_house])

# predict() returns one value per input row; this frame has exactly one row
predicted_price = model.predict(new_data)
print("Predicted Price for the new house:", predicted_price[0])


Predicted Price for the new house: 213722.34944002156
