In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw training listings from the CSV file next to the notebook
TRAIN_CSV_PATH = 'India_housing_data_train.csv'
dataset_original = pd.read_csv(TRAIN_CSV_PATH)

dataset_original.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


In [3]:
# Build the working frame without the coordinate columns.
# `drop` returns a new frame here, so `dataset_original` is left as loaded.
coordinate_columns = ['LONGITUDE', 'LATITUDE']
dataset_cleaned = dataset_original.drop(columns=coordinate_columns)

dataset_cleaned.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",60.5


In [4]:
# Replace "ADDRESS" with a "City" column holding the text after the last comma
# of each address (e.g. "Jigani,Bangalore" -> "Bangalore").
#
# The guard makes this cell safe to re-run: once "ADDRESS" has been swapped for
# "City", executing the cell again is a no-op instead of raising a KeyError.
if 'ADDRESS' in dataset_cleaned.columns:
    # Vectorised split from the right; n=1 is enough because only the last
    # comma-separated token is kept.
    city = dataset_cleaned['ADDRESS'].str.rsplit(',', n=1).str[-1]

    # Put "City" exactly where "ADDRESS" was so positional indexing in later
    # cells keeps seeing the same column layout.
    address_pos = dataset_cleaned.columns.get_loc('ADDRESS')
    dataset_cleaned = dataset_cleaned.drop(columns='ADDRESS')
    dataset_cleaned.insert(address_pos, 'City', city)

dataset_cleaned.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,City,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,Bangalore,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,Mysore,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,Bangalore,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,Ghaziabad,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,Kolkata,60.5


# Encode Categorical Variables

- `POSTED_BY`
- `BHK_OR_RK`

In [9]:
# Number of distinct cities; dropna=False counts a missing city as its own value
dataset_cleaned['City'].nunique(dropna=False)

256

In [13]:
# Histogram of listing counts per city, with the most frequent cities first.
# `px` comes from the notebook's import cell.
fig = px.histogram(dataset_cleaned, x="City", title="Number of listings per city")
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [52]:
# Split the frame positionally: every column except the last is a feature,
# and the last column is the prediction target.
feature_columns = dataset_cleaned.iloc[:, :-1]
target_column = dataset_cleaned.iloc[:, -1]
X = feature_columns.values
y = target_column.values

# Quick look at the first feature row and the target vector
print(list(X[0]))
print(y)

['Owner', 0, 0, 2, 'BHK', 1300.236407, 1, 1, 'Ksfc Layout,Bangalore']
[55.  51.  43.  ... 27.1 67.  27.8]


In [54]:

# One-hot encode the 'BHK_OR_RK' feature; all other columns pass through
# unchanged and are placed after the encoded columns.
#
# The features are re-derived from `dataset_cleaned` rather than read from `X`,
# so re-running this cell yields the same result instead of encoding an
# already-encoded array a second time.
features_raw = dataset_cleaned.iloc[:, :-1].values

# Look the column position up by name instead of hard-coding the index.
bhk_type_col = dataset_cleaned.columns.get_loc('BHK_OR_RK')

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [bhk_type_col])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array so np.array() gets a 2-D table
)
X = np.array(ct.fit_transform(features_raw))

# NOTE(review): only 'BHK_OR_RK' is encoded here; the other string-valued
# columns pass through as raw strings and still need encoding before a
# regressor can be fitted.
print(X[0])

[1.0 0.0 'Owner' 0 0 2 1300.236407 1 1 'Ksfc Layout,Bangalore']
