# Predicting House Prices in Bengaluru Using Linear Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression



# Reading Data

In [None]:
# Load the housing dataset CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; this cell only
# runs where the file exists at exactly this location.
data=pd.read_csv("/Users/jaypanchal/aiml/data/Bengaluru_House_Data.csv")

In [None]:
# Show the frame's dimensions, then its column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (doing so would emit an extra "None" line).
print(data.shape)
data.info()

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
# Count missing values per column to decide how each column should be filled.
data.isnull().sum()

In [None]:
# Drop the 'availability' and 'society' columns from the frame.
data=data.drop(columns=['availability','society'])

In [None]:
# Print the frequency of each distinct value for the columns that will be
# filled, so a sensible fill value can be chosen for each.
for col in ("location", "size", "bath", "balcony"):
    print(f"{col} : {data[col].value_counts()}")

In [None]:

def fill_na(data, data_cols):
    """Fill missing values in the selected columns of ``data``.

    Each recognised column gets its own fill value:

    * ``location`` -> the literal ``'Whitefield'``
    * ``size``     -> the literal ``'2 BHK'``
    * ``bath``     -> the column median
    * ``balcony``  -> the column's first mode

    Column names other than these four are left untouched.

    Args:
        data: DataFrame to fill; the processed columns are reassigned on
            this same object.
        data_cols: Iterable of column names to process.

    Returns:
        The same ``data`` object that was passed in.
    """
    for col in data_cols:
        if col == "location":
            fill_value = 'Whitefield'
        elif col == "size":
            fill_value = '2 BHK'
        elif col == "bath":
            # median must be *called*; passing the bound method itself would
            # hand fillna a function object instead of a number.
            fill_value = data[col].median()
        elif col == "balcony":
            fill_value = data[col].mode()[0]
        else:
            continue
        data[col] = data[col].fillna(fill_value)
    return data
            
# Columns whose missing values are filled by fill_na; the result is bound
# back to `data`.
data_cols=["location","size","bath","balcony"]
data=fill_na(data,data_cols)

In [None]:
# Re-check the per-column missing-value counts after filling.
data.isnull().sum()

In [None]:
# Print column dtypes and non-null counts.
data.info()

In [None]:
# Show descriptive statistics for the frame.
data.describe()

In [None]:
# Preview the last five rows.
data.tail(5)

In [None]:
# Print the distinct values of every column to spot irregular entries
# that need cleaning.
for i in data.columns:
    print(data[i].unique())

In [None]:
def convert_sqft(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Accepted forms:

    * a plain number as text, e.g. ``"1200"`` -> ``1200.0``
    * a range ``"low - high"`` -> the midpoint of the first two
      dash-separated parts
    * a value that is already numeric -> returned as ``float``, so applying
      this function a second time to a converted column is harmless

    Anything else (text that does not parse as a number, ``None``, ...)
    yields ``np.nan``.

    Args:
        x: Raw cell value.

    Returns:
        The parsed area as ``float``, or ``np.nan`` when it cannot be parsed.
    """
    if not isinstance(x, str):
        # Non-text input: pass numbers through unchanged instead of failing
        # on the substring test below.
        try:
            return float(x)
        except (TypeError, ValueError):
            return np.nan

    try:
        if '-' in x:
            tokens = x.split('-')
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except ValueError:
        # Only parse failures are treated as "missing"; unrelated errors
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        return np.nan

# Convert every 'total_sqft' entry with convert_sqft, then drop the rows
# where it returned NaN.
data['total_sqft'] = data['total_sqft'].apply(convert_sqft)
data = data.dropna(subset=['total_sqft'])

In [None]:
# Take the token before the first space of 'size' as an integer and store
# it in a new 'bhk' column, then drop the original 'size' column.
data['bhk'] = data['size'].apply(lambda x: int(x.split(' ')[0]))
data = data.drop('size', axis=1)

In [None]:
# Coerce 'bath' to numeric (non-numeric entries become NaN), drop the rows
# left without a value, then cast the column to int.
data['bath'] = pd.to_numeric(data['bath'], errors='coerce')
data = data.dropna(subset=['bath'])
data['bath'] = data['bath'].astype(int)


In [None]:
# Fill any missing 'balcony' values with the column median, then cast the
# column to int.
data['balcony'] = data['balcony'].fillna(data['balcony'].median())
data['balcony'] = data['balcony'].astype(int)


In [None]:
# Strip surrounding whitespace from location names so equal names compare
# equal, then count rows per location.
data['location'] = data['location'].str.strip()
location_counts = data['location'].value_counts()
# Locations with 10 or fewer rows are merged into a single 'other' bucket.
data['location'] = data['location'].apply(lambda x: 'other' if location_counts[x] <= 10 else x)
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each.
data = pd.get_dummies(data, columns=['area_type', 'location'], drop_first=True)

In [None]:
# Keep only rows whose area lies in the inclusive range [300, 10000].
data = data[data['total_sqft'].between(300, 10000)]

In [None]:
# Keep only rows whose price lies in the inclusive range [10, 500].
data = data[data['price'].between(10, 500)]


In [None]:
# Preview the first rows of the cleaned, encoded frame.
data.head()

In [None]:
# Show descriptive statistics after cleaning and filtering.
data.describe()

In [None]:
# Target vector: the 'price' column.
y = data['price']
# Feature matrix: every remaining column.
X = data.drop(columns=['price'])

In [None]:

# Hold out 20% of the rows for testing; random_state=42 fixes the split so
# reruns give the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# Fit a linear regression model on the training split.
model = LinearRegression()
model.fit(X_train, y_train)


In [None]:

# Report the model's score on the held-out test split.
print("R2:", model.score(X_test, y_test))