# real estate house-price predictor

In [None]:
import pandas as pd

In [None]:
import vpython as vs

In [None]:
# The CSV is read without a header row, so the column names are supplied here in file order.
col_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
housing = pd.read_csv('housing.csv', header=None, names=col_names)
# Subset of columns used for the pair plot further down.
features = ['MEDV', 'RM', 'ZN', 'LSTAT']

In [None]:
housing.head()

In [None]:
housing.info()

In [None]:
housing.shape

In [None]:
#Checking null values in dataset
housing.isnull().sum()

In [None]:
import seaborn as sns
sns.heatmap(data=housing.isnull(),yticklabels=False,cmap='viridis')

# All features are numerical; they split into discrete and continuous variables

In [None]:
#Discrete features: columns with fewer than 25 distinct values
discrete_feature = []
for column in housing.columns:
    distinct_count = len(housing[column].unique())
    if distinct_count < 25:
        discrete_feature.append(column)
print(f"Discrete feature count {len(discrete_feature)}")

In [None]:
housing[discrete_feature].head()

# Finding the relationship between the median house value (MEDV) and each discrete variable

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the median MEDV for every level of each discrete feature.
for feature in discrete_feature:
    # groupby does not modify `housing`, so no defensive copy is needed.
    housing.groupby(feature)['MEDV'].median().plot.bar()
    plt.xlabel(feature)
    # The plotted quantity is the per-group median of MEDV (the old label said 'SalePrice').
    plt.ylabel('Median MEDV')
    plt.title(feature)
    plt.show()

In [None]:
#Continuous features: every column that was not classified as discrete
continuous_feature = []
for column in housing.columns:
    if column in discrete_feature:
        continue
    continuous_feature.append(column)
print(f"Continuous variable count {len(continuous_feature)}")

In [None]:
housing[continuous_feature].head()

In [None]:
# One 30-bin histogram per continuous feature.
for feature in continuous_feature:
    column_values = housing[feature]  # plotting only reads the column
    column_values.hist(bins=30)
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel("Count")
    plt.show()

In [None]:
housing.describe()

In [None]:
%matplotlib inline

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
housing.hist(bins=30 , figsize=(20,15))

## finding correlations 

In [None]:
corr=housing.corr()

In [None]:
corr['MEDV'].sort_values(ascending=False)

In [None]:
sns.pairplot(housing[features], height=2.5)
plt.tight_layout()

In [None]:
housing.plot(kind ='scatter', x ='RM', y ='MEDV', alpha = 0.8)

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(housing.corr(),annot=True,cmap=plt.cm.CMRmap_r)

In [None]:
## TAX and RAD are highly correlated (about 0.91), so one of them can be removed

In [None]:
from pandas.plotting import scatter_matrix
attributes = ["MEDV", "RM", "ZN", "LSTAT"]
scatter_matrix(housing[attributes], figsize = (12,10))

In [None]:
housing.plot(kind="scatter", x="RM", y="MEDV", alpha=0.8)
## Several points with RM around 5, 6 and 7 have a MEDV value of exactly 50; we treat these as outliers and remove them below

# Outliers

In [None]:
# One box plot per column to make outliers visible.
for feature in housing:
    housing.boxplot(column=feature)  # boxplot only reads the frame
    plt.show()

In [None]:
## applying log transformation
import numpy as np
# Scatter of log(feature) against log(MEDV) for every strictly positive continuous feature.
# The target is transformed once, outside the loop. Previously, when the loop reached
# 'MEDV' itself, the column was logged twice (log(log(MEDV))).
log_medv = np.log(housing['MEDV'])
for feature in continuous_feature:
    if feature == 'MEDV':
        # Plotting the target against itself carries no information.
        continue
    if (housing[feature] <= 0).any():
        # log is undefined for zero and negative values; skip such columns.
        continue
    plt.scatter(np.log(housing[feature]), log_medv)
    plt.xlabel(feature)
    plt.ylabel('MEDV')
    plt.title(feature)
    plt.show()

In [None]:
#after applying the log transformation, the boxplots show that outliers have disappeared for some of the features

# Box plot of the log-transformed values for every continuous feature that contains no zero.
for feature in continuous_feature:
    contains_zero = (housing[feature] == 0).any()
    if not contains_zero:
        # Transform only the column being plotted; `housing` itself is left untouched.
        logged = np.log(housing[[feature]])
        logged.boxplot(column=feature)
        plt.title(feature)
        plt.ylabel(feature)
        plt.show()

In [None]:
# 0.1% and 99.9% quantiles of MEDV, used below as exclusive lower/upper bounds.
mins,maxs=housing.MEDV.quantile([0.001,0.999])
# Bare expression in the middle of the cell: by default Jupyter only displays the last expression.
mins,maxs
# Strict inequalities: rows whose MEDV equals either bound are dropped as well.
# NOTE(review): `housing` is overwritten, so re-running this cell trims the data again.
housing=housing[(housing.MEDV < maxs) & (housing.MEDV > mins)]

In [None]:
## after removing the outlier
housing.plot(kind="scatter", x="RM", y="MEDV", alpha=0.8)

In [None]:
housing.shape

In [None]:
housing.head()

## train-test splitting

In [None]:
import numpy as np

In [None]:
X=housing.iloc[:,:-1]
y=housing.iloc[:,-1]

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
X_train.shape, X_test.shape

In [None]:
X_train=X_train.copy()

In [None]:
## most of the data is skewed so for that we apply log transformation
import numpy as np
# Columns that receive a natural-log transform.
num_features = ['CRIM', 'NOX', 'DIS', 'TAX', 'LSTAT', 'B', 'RM', 'PTRATIO', 'INDUS']
# Replace each listed column by its log; all other columns and the column order are unchanged.
X_train = X_train.assign(**{name: np.log(X_train[name]) for name in num_features})

In [None]:
X_train.head()

In [None]:
import seaborn as sns
#Using Pearson Correlation
plt.figure(figsize=(12,10))
cor = X_train.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)
plt.show()

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The correlation matrix from ``dataset.corr()`` is scanned below its diagonal.
    Whenever the absolute coefficient between column ``i`` and an earlier column
    ``j`` exceeds ``threshold``, the name of column ``i`` (the later one) is collected.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute correlation above which a column is reported.

    Returns
    -------
    set
        Names of the columns considered redundant.
    """
    corr_matrix = dataset.corr()
    columns = corr_matrix.columns
    col_corr = set()  # names of correlated columns
    for i in range(len(columns)):
        for j in range(i):
            # abs(): a strong negative correlation is as redundant as a strong positive one.
            if abs(corr_matrix.iloc[i, j]) > threshold:
                col_corr.add(columns[i])
    return col_corr

In [None]:
# Columns of the training set whose correlation with an earlier column exceeds 0.85.
corr_features = correlation(dataset=X_train, threshold=0.85)
print(len(corr_features))  # the function already returns a set
corr_features

In [None]:
X_train.drop('RAD',axis=1,inplace=True) #drop feature RAD

In [None]:
X_train.head()

Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
X_train_SC=scaler.fit_transform(X_train)

In [None]:
X_train_SC

In [None]:
# Wrap the scaled array back into a DataFrame. The original row labels are kept so that
# X_train stays index-aligned with y_train (which still carries the labels from the split).
X_train=pd.DataFrame(X_train_SC,columns=X_train.columns,index=X_train.index)

In [None]:
X_train.head()

Feature Selection

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
# Lasso with a small penalty as the base estimator; the seed is fixed via random_state.
lasso_for_selection = Lasso(alpha=0.005, random_state=42)
feature_sel_model = SelectFromModel(lasso_for_selection)
feature_sel_model.fit(X_train, y_train)

In [None]:
# Boolean mask over the training columns: True where the selector kept the feature.
support_mask = feature_sel_model.get_support()
selected_feat = X_train.columns[support_mask]
selected_feat

Applying the same log transformation and the fitted standard scaler to the test data

In [None]:
#### Prepare the test data with the same steps that were applied to the training data
#### 1) log transform of the same columns
X_test=X_test.copy()
import numpy as np
num_features=['CRIM','NOX','DIS','TAX','LSTAT','B','RM','PTRATIO','INDUS']
for feature in num_features:
    X_test[feature]=np.log(X_test[feature])

#### 2) drop RAD, as was done for the training set
X_test=X_test.drop(columns='RAD')

#### 3) scale with the scaler fitted on the training data (transform only, no refit)
X_test_SC=scaler.transform(X_test)
# Keep the original row labels so X_test stays index-aligned with y_test.
X_test=pd.DataFrame(X_test_SC,columns=X_test.columns,index=X_test.index)

In [None]:
X_test.head()

In [None]:
X_test.shape

# Selecting desired model

Using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression fitted on the scaled training features.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

Using Metrics

In [None]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
# Linear regression: metrics on the training data.
predictions1 = model.predict(X_train)
r2 = r2_score(y_true=y_train, y_pred=predictions1)
mse = mean_squared_error(y_true=y_train, y_pred=predictions1)
mae = mean_absolute_error(y_true=y_train, y_pred=predictions1)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

Using Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
# Lasso regression with the library's default settings.
modellasso = Lasso()
modellasso.fit(X=X_train, y=y_train)

Using Metrics

In [None]:
# Lasso: metrics on the training data.
predictions2 = modellasso.predict(X_train)
r2 = r2_score(y_true=y_train, y_pred=predictions2)
mse = mean_squared_error(y_true=y_train, y_pred=predictions2)
mae = mean_absolute_error(y_true=y_train, y_pred=predictions2)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

Using K Nearest Neighbour

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor using 5 neighbours.
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelKNN.fit(X=X_train, y=y_train)

Using Metrics

In [None]:
# KNN: metrics on the training data.
predictions3 = modelKNN.predict(X_train)
r2 = r2_score(y_true=y_train, y_pred=predictions3)
mse = mean_squared_error(y_true=y_train, y_pred=predictions3)
mae = mean_absolute_error(y_true=y_train, y_pred=predictions3)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

Using Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fixed seed: tree construction permutes features at each split, so without
# random_state the fitted tree (and every metric below) can change between runs.
modelDT=DecisionTreeRegressor(random_state=42)
modelDT.fit(X_train,y_train)

Using Metrics

In [None]:
# Decision tree: metrics on the training data.
predictions4 = modelDT.predict(X_train)
r2 = r2_score(y_true=y_train, y_pred=predictions4)
mse = mean_squared_error(y_true=y_train, y_pred=predictions4)
mae = mean_absolute_error(y_true=y_train, y_pred=predictions4)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

# The decision tree overfits the training data, so we evaluate it with cross-validation

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree. The scorer returns negated MSE,
# so the sign is flipped before taking the square root.
scores = cross_val_score(
    estimator=modelDT,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(-scores)
print(rmse_scores.mean())

Using Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fixed seed: bootstrap sampling and feature sub-sampling are random, so without
# random_state the forest and its metrics differ on every run.
modelRF=RandomForestRegressor(random_state=42)
modelRF.fit(X_train,y_train)

Using Metrics

In [None]:
# Random forest: metrics on the training data.
predictions5 = modelRF.predict(X_train)
r2 = r2_score(y_true=y_train, y_pred=predictions5)
mse = mean_squared_error(y_true=y_train, y_pred=predictions5)
mae = mean_absolute_error(y_true=y_train, y_pred=predictions5)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

Testing on Test data

In [None]:
# KNN: metrics on the held-out test data.
predictionsKNN = modelKNN.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=predictionsKNN)
mse = mean_squared_error(y_true=y_test, y_pred=predictionsKNN)
mae = mean_absolute_error(y_true=y_test, y_pred=predictionsKNN)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

In [None]:
# Decision tree: metrics on the held-out test data.
predictionsDT = modelDT.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=predictionsDT)
mse = mean_squared_error(y_true=y_test, y_pred=predictionsDT)
mae = mean_absolute_error(y_true=y_test, y_pred=predictionsDT)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

In [None]:
# Random forest: metrics on the held-out test data.
predictionsRF = modelRF.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=predictionsRF)
mse = mean_squared_error(y_true=y_test, y_pred=predictionsRF)
mae = mean_absolute_error(y_true=y_test, y_pred=predictionsRF)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

# The Random Forest Regressor shows a high R² and a low RMSE on both the train and the test data.

Hyperparameter tuning


In [None]:
### Random forest with the chosen hyperparameters
# NOTE(review): the search that produced these values is not shown in this notebook.
# random_state is fixed so the forest, its metrics and the pickled model are reproducible.
model1=RandomForestRegressor(
    n_estimators=1100,
    max_depth=900,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)
model1.fit(X_train,y_train)

Using Metrics

In [None]:
# Tuned random forest: metrics on the training data.
predictionsRF1 = model1.predict(X_train)
r2 = r2_score(y_true=y_train, y_pred=predictionsRF1)
mse = mean_squared_error(y_true=y_train, y_pred=predictionsRF1)
mae = mean_absolute_error(y_true=y_train, y_pred=predictionsRF1)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

In [None]:
# Tuned random forest: metrics on the held-out test data.
predictionsRF2 = model1.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=predictionsRF2)
mse = mean_squared_error(y_true=y_test, y_pred=predictionsRF2)
mae = mean_absolute_error(y_true=y_test, y_pred=predictionsRF2)
rmse = np.sqrt(mse)
print(
    f"R Squared = {r2} \n"
    f"Mean Squared Error = {mse} \n"
    f"Mean Absolute Error = {mae} \n"
    f"Root Mean Squared Error = {rmse}"
)

In [None]:
sns.histplot(y_test-predictionsRF2)

In [None]:
plt.scatter(y_test,predictionsRF2)

In [None]:
sns.regplot(x=y_test,y=predictionsRF2,scatter=True,marker='*')

In [None]:
import pickle
# Persist the tuned random forest. The context manager closes (and flushes) the
# file even if pickling fails; previously the handle was left open.
with open('RFHouseModel.pkl','wb') as file:
    pickle.dump(model1,file)

In [None]:
# Persist the fitted scaler next to the model; the context manager guarantees the
# file is flushed and closed.
with open('Scaler.pkl','wb') as scaler_file:
    pickle.dump(scaler,scaler_file)

In [None]:
#loaded_model=pickle.load(open('RFHouseModel.pkl', 'rb'))

In [None]:
#input_data =(0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,296,15.3,396.90,4.98)