<a href="https://colab.research.google.com/github/dewshishir/problem/blob/main/Assignment_7_Data_Preprocessing_and_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#Load the Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# TODO: Load the housing data
df = pd.read_csv('Housing.csv')

# TODO: How many houses are there?
print(f"Number of houses: {df.shape[0]}")

# TODO: Show first 3 houses
df.head(3)

Number of houses: 545


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished


In [5]:
# Look at Columns
print("Columns:")
print(df.columns)

# Show data types of columns
print("\nData types:")
print(df.dtypes)


Columns:
Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

Data types:
price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object


In [11]:
#Convert Yes/No to 1/0
# Check what's in our data
print("Current mainroad values:", df['mainroad'].head())
# Shows: yes, no, yes, yes, no
binary_cols = [
    'mainroad', 'guestroom', 'basement',
    'hotwaterheating', 'airconditioning',
    'prefarea'
]

for col in binary_cols:
    df[col] = df[col].map({'yes': 1, 'no': 0})
# Models need numbers, not text!
# We need: yes → 1, no → 0
print("Converted mainroad values:")
print(df['mainroad'].head())

Current mainroad values: 0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: mainroad, dtype: float64
Converted mainroad values:
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: mainroad, dtype: float64


In [13]:
# Split into Train and Test
# TODO: Split - 80% train, 20% test
from sklearn.model_selection import train_test_split

X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (436, 12)
Test size: (109, 12)


In [21]:
#Apply StandardScaler
# Area is huge (1650-13300), Bedrooms is small (1-6)!
from sklearn.preprocessing import StandardScaler
import pandas as pd # Import pandas for get_dummies and fillna

# Handle categorical columns first, specifically 'furnishingstatus'
# Get dummy variables for 'furnishingstatus'
# drop_first=True avoids multicollinearity
X_train_processed = pd.get_dummies(X_train, columns=['furnishingstatus'], drop_first=True)
X_test_processed = pd.get_dummies(X_test, columns=['furnishingstatus'], drop_first=True)

# Address NaNs in the binary columns (e.g., 'mainroad', 'guestroom')
# These NaNs arose because the previous conversion of 'yes'/'no' to 1/0
# might not have been applied correctly or the DataFrame was reloaded.
# Filling NaNs with 0 (assuming 'no' for binary features where data is missing).
binary_cols_with_nans = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
for col in binary_cols_with_nans:
    if col in X_train_processed.columns:
        X_train_processed[col] = X_train_processed[col].fillna(0).astype(int) # Ensure type is int
    if col in X_test_processed.columns:
        X_test_processed[col] = X_test_processed[col].fillna(0).astype(int) # Ensure type is int

scaler = StandardScaler()

# Apply scaling to the processed (numerical) dataframes
X_train_scaled = scaler.fit_transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)

print("Standard scaling applied.")


Standard scaling applied.


In [22]:
#train and test model
#your code here
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


R2 Score: 0.4729396125282729
MSE: 1772114213033.276


In [23]:
# Predict a House Price
# New house details:
# area=5000, bedrooms=3, bathrooms=2, stories=2,
# mainroad=yes, guestroom=no, basement=yes,
# hotwaterheating=no, airconditioning=yes,
# parking=2, prefarea=yes, furnishingstatus=furnished
#your code here
# New house details
new_house = pd.DataFrame([{
    'area': 5000,
    'bedrooms': 3,
    'bathrooms': 2,
    'stories': 2,
    'mainroad': 1,
    'guestroom': 0,
    'basement': 1,
    'hotwaterheating': 0,
    'airconditioning': 1,
    'parking': 2,
    'prefarea': 1,
    'furnishingstatus_semi-furnished': 0,
    'furnishingstatus_unfurnished': 0
}])

# Scale and predict
new_house_scaled = scaler.transform(new_house)
predicted_price = model.predict(new_house_scaled)

print("Predicted House Price:", int(predicted_price[0]))


Predicted House Price: 6324499
