In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import re

In [8]:
# Load the raw training data from the CSV file into a DataFrame.
dataset_original = pd.read_csv(filepath_or_buffer='India_housing_data_train.csv')

# Preview the first five rows.
dataset_original.head(n=5)

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


In [9]:
# Build the working frame without the coordinate columns, which this
# notebook treats as unwanted. `drop` returns a new frame, so
# `dataset_original` itself is left untouched.
dataset_cleaned = dataset_original.drop(columns=['LONGITUDE', 'LATITUDE'])

dataset_cleaned.head(n=5)

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",60.5


In [10]:
# Replace the free-text "ADDRESS" column with a "City" column.
#
# The city is the part of the address after the last comma (e.g.
# "Ksfc Layout,Bangalore" -> "Bangalore"). The guard makes the cell safe to
# re-run: once "ADDRESS" has been replaced there is nothing left to do, so a
# second execution no longer raises.
if 'ADDRESS' in dataset_cleaned.columns:
    # Vectorised split from the right (one split only), then strip so that
    # "Area, City" and "Area,City" produce the same city label.
    city_lst = (
        dataset_cleaned['ADDRESS']
        .str.rsplit(',', n=1)
        .str[-1]
        .str.strip()
        .tolist()
    )

    # Put "City" exactly where "ADDRESS" was instead of relying on a
    # hard-coded column position.
    address_pos = dataset_cleaned.columns.get_loc('ADDRESS')

    # Non-inplace drop: work on a fresh frame rather than mutating the one
    # an earlier cell already displayed.
    dataset_cleaned = dataset_cleaned.drop(columns='ADDRESS')
    dataset_cleaned.insert(address_pos, "City", city_lst)

dataset_cleaned.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,City,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,Bangalore,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,Mysore,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,Bangalore,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,Ghaziabad,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,Kolkata,60.5


In [11]:
# Convert the whole DataFrame into a NumPy array (object dtype, since the
# columns mix strings and numbers).
# NOTE: this rebinds `dataset_cleaned` from a DataFrame to an ndarray. Cells
# further down that use `dataset_cleaned['City']` or `dataset_cleaned.iloc`
# expect a DataFrame and will not work on this array.
dataset_cleaned = dataset_cleaned.values

dataset_cleaned

array([['Owner', 0, 0, ..., 1, 'Bangalore', 55.0],
       ['Dealer', 0, 0, ..., 1, 'Mysore', 51.0],
       ['Owner', 0, 0, ..., 1, 'Bangalore', 43.0],
       ...,
       ['Dealer', 0, 0, ..., 1, 'Jaipur', 27.1],
       ['Owner', 0, 0, ..., 1, 'Chennai', 67.0],
       ['Dealer', 0, 1, ..., 1, 'Jaipur', 27.8]], dtype=object)

In [12]:
# Replace the string labels in column 0 (the 'Owner' / 'Dealer' values)
# with integer codes, writing the result back into the same column.
# NOTE(review): a later cell one-hot encodes this same column, so this
# integer step may be redundant - confirm before removing.
dataset_cleaned[:, 0] = LabelEncoder().fit(dataset_cleaned[:, 0]).transform(dataset_cleaned[:, 0])

In [21]:
# Display the array to check that column 0 now holds integer codes.
dataset_cleaned

array([[2, 0, 0, ..., 1, 'Bangalore', 55.0],
       [1, 0, 0, ..., 1, 'Mysore', 51.0],
       [2, 0, 0, ..., 1, 'Bangalore', 43.0],
       ...,
       [1, 0, 0, ..., 1, 'Jaipur', 27.1],
       [2, 0, 0, ..., 1, 'Chennai', 67.0],
       [1, 0, 1, ..., 1, 'Jaipur', 27.8]], dtype=object)

In [23]:
# One-hot encode column 0; every other column is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('encode', OneHotEncoder(), [0]),
    ],
    remainder='passthrough',
)

In [24]:
# Fit the transformer on the array and apply it in one step.
# NOTE: despite its name, `y` holds the full transformed table here
# (encoded columns plus all passthrough columns, price included).
y = ct.fit_transform(X=dataset_cleaned)

In [26]:
# Wrap the transformed array in a DataFrame so it renders as a table.
y = pd.DataFrame(data=y)

y

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,0.0,1.0,0,0,2,BHK,1300.236407,1,1,Bangalore,55.0
1,0.0,1.0,0.0,0,0,2,BHK,1275.0,1,1,Mysore,51.0
2,0.0,0.0,1.0,0,0,2,BHK,933.159722,1,1,Bangalore,43.0
3,0.0,0.0,1.0,0,1,2,BHK,929.921143,1,1,Ghaziabad,62.5
4,0.0,1.0,0.0,1,0,2,BHK,999.009247,0,1,Kolkata,60.5
...,...,...,...,...,...,...,...,...,...,...,...,...
29446,0.0,0.0,1.0,0,0,3,BHK,2500.0,1,1,Agra,45.0
29447,0.0,0.0,1.0,0,0,2,BHK,769.230769,1,1,Vapi,16.0
29448,0.0,1.0,0.0,0,0,2,BHK,1022.641509,1,1,Jaipur,27.1
29449,0.0,0.0,1.0,0,0,2,BHK,927.079009,1,1,Chennai,67.0


# Encode Categorical Variables

Columns to encode:

- `POSTED_BY`
- `BHK_OR_RK`

In [None]:
# Number of distinct values in the "City" column.
len({city for city in dataset_cleaned['City']})

In [None]:
import plotly.express as px

# Histogram of the number of listings per city, with the most frequent
# cities first. The unused `px.data.tips()` sample dataset that was loaded
# here has been removed; it played no part in the figure.
fig = px.histogram(dataset_cleaned['City'], x="City", title="Listings per city")
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [None]:
# Split the frame into features (every column except the last) and the
# target (the last column), both as NumPy arrays.
X, y = dataset_cleaned.iloc[:, :-1].values, dataset_cleaned.iloc[:, -1].values

# Show the first feature row and the target vector as a quick check.
print(list(X[0]))
print(y)

In [None]:

# One-hot encode both categorical columns this section sets out to encode:
# index 0 ('POSTED_BY') and index 4 ('BHK_OR_RK'). Previously only index 4
# was encoded, leaving 'POSTED_BY' as raw strings in X. All other columns
# are passed through unchanged.
# NOTE(review): 'City' (index 8) is still a raw string after this step -
# decide how it should be encoded before fitting a model on X.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 4])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))
print(X[0])