In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.cluster import DBSCAN

In [2]:
from collections import Counter

In [3]:
df = pd.read_csv('Bengaluru_House_Data.csv')
df[:5]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


# Data cleaning and Processing

In [4]:
df1 = df.drop(['area_type','availability','society','balcony'],axis=1)
df1[:10]

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0
5,Whitefield,2 BHK,1170,2.0,38.0
6,Old Airport Road,4 BHK,2732,4.0,204.0
7,Rajaji Nagar,4 BHK,3300,4.0,600.0
8,Marathahalli,3 BHK,1310,3.0,63.25
9,Gandhi Bazar,6 Bedroom,1020,6.0,370.0


In [5]:
df1.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [6]:
df1.shape

(13320, 5)

In [7]:
df2 = df1.dropna()

In [8]:
df2.shape

(13246, 5)

In [9]:
df2.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [10]:
df2[:5]

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [12]:
df2['BHK']=df2["size"].apply(lambda x: x.split(" ")[0])
df2[:3]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['BHK']=df2["size"].apply(lambda x: x.split(" ")[0])


Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3


In [14]:
df3 = df2.drop(['size'],axis=1)

In [15]:
df3[:3]

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.0,4
2,Uttarahalli,1440,2.0,62.0,3


In [39]:
value_counts=df3['location'].value_counts()

In [40]:
to_rename = value_counts[value_counts < 11].index.tolist()

In [41]:
to_rename

['Dairy Circle',
 'Kalkere',
 'Dodsworth Layout',
 'Sadashiva Nagar',
 'Naganathapura',
 'Nagadevanahalli',
 '1st Block Koramangala',
 'Gunjur Palya',
 'Nagappa Reddy Layout',
 'BTM 1st Stage',
 'Ganga Nagar',
 'Basapura',
 'Chennammana Kere',
 'Vishwanatha Nagenahalli',
 'Jakkur Plantation',
 '2nd Phase JP Nagar',
 'B Narayanapura',
 'Chandra Layout',
 'Lingarajapuram',
 'Kamakshipalya',
 'Gollahalli',
 'Vignana Nagar',
 '4th Block Koramangala',
 'Sector 1 HSR Layout',
 'Banagiri Nagar',
 'KUDLU MAIN ROAD',
 'Mathikere',
 'Volagerekallahalli',
 'Kaverappa Layout',
 'Richmond Town',
 'Yemlur',
 'Peenya',
 'Medahalli',
 'Ejipura',
 'Sathya Sai Layout',
 'Hongasandra',
 'Basavanapura',
 'Outer Ring Road East',
 'Nelamangala',
 'Seetharampalya',
 'Jalahalli West',
 'Nallurhalli',
 'Kodbisanhalli',
 'Hoysalanagar',
 'Akshayanagara East',
 'Shikaripalya',
 'Shanti Nagar',
 'Huskur',
 'Vasanth nagar',
 'Dodda Nekkundi Extension',
 'Mahalakshmi Puram',
 'Reliaable Tranquil Layout',
 'Kattigen

In [43]:
df3["location"].replace(to_rename, "Other", inplace=True)

In [44]:
df3['location'].value_counts()

Other                 2896
Whitefield             534
Sarjapur  Road         392
Electronic City        302
Kanakpura Road         266
                      ... 
Marsur                  11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: location, Length: 242, dtype: int64

In [46]:
df3[:15]

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.0,4
2,Uttarahalli,1440,2.0,62.0,3
3,Lingadheeranahalli,1521,3.0,95.0,3
4,Kothanur,1200,2.0,51.0,2
5,Whitefield,1170,2.0,38.0,2
6,Old Airport Road,2732,4.0,204.0,4
7,Rajaji Nagar,3300,4.0,600.0,4
8,Marathahalli,1310,3.0,63.25,3
9,Other,1020,6.0,370.0,6


In [48]:
dummies = pd.get_dummies(df3.location)

In [49]:
dummies

Unnamed: 0,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13316,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13317,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
dummies.drop(['Other'],axis=1,inplace=True)

In [52]:
df4 = pd.concat([df3,dummies],axis=1)

In [53]:
df4.head()

Unnamed: 0,location,total_sqft,bath,price,BHK,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,Electronic City Phase II,1056,2.0,39.07,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Chikka Tirupathi,2600,5.0,120.0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Uttarahalli,1440,2.0,62.0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Lingadheeranahalli,1521,3.0,95.0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Kothanur,1200,2.0,51.0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
df4.drop(['location'],axis=1,inplace=True)

In [55]:
df4[:10]

Unnamed: 0,total_sqft,bath,price,BHK,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1056,2.0,39.07,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2600,5.0,120.0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440,2.0,62.0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1521,3.0,95.0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200,2.0,51.0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1170,2.0,38.0,2,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,2732,4.0,204.0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3300,4.0,600.0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1310,3.0,63.25,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1020,6.0,370.0,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
df4.shape

(13246, 245)

In [67]:
df4.dtypes

total_sqft               object
bath                    float64
price                   float64
BHK                      object
 Devarachikkanahalli      uint8
                         ...   
Yelachenahalli            uint8
Yelahanka                 uint8
Yelahanka New Town        uint8
Yelenahalli               uint8
Yeshwanthpur              uint8
Length: 245, dtype: object

In [69]:
df4['BHK'] = df4['BHK'].astype(float)

In [77]:
def isfloat(x):
    try:
        float(x)
    except:
        return False
    return True

In [78]:
df4[~df4['total_sqft'].apply(isfloat)].head(10)

Unnamed: 0,total_sqft,bath,price,BHK,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
30,2100 - 2850,4.0,186.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
122,3067 - 8156,4.0,477.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
137,1042 - 1105,2.0,54.005,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
165,1145 - 1340,2.0,43.49,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188,1015 - 1540,2.0,56.8,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
410,34.46Sq. Meter,1.0,18.5,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
549,1195 - 1440,2.0,63.77,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
648,4125Perch,9.0,265.0,9.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
661,1120 - 1145,2.0,48.13,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
672,3090 - 5002,4.0,445.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
def convert(x):
    tokens = x.split('-')
    if len(tokens)==2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None
        

In [95]:
df5 = df4.copy()

In [96]:
df5['total_sqft']=df5['total_sqft'].apply(convert)

In [97]:
df5.shape

(13246, 245)

In [99]:
df5.isna().sum()

total_sqft              46
bath                     0
price                    0
BHK                      0
 Devarachikkanahalli     0
                        ..
Yelachenahalli           0
Yelahanka                0
Yelahanka New Town       0
Yelenahalli              0
Yeshwanthpur             0
Length: 245, dtype: int64

In [100]:
df5[:10]

Unnamed: 0,total_sqft,bath,price,BHK,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1056.0,2.0,39.07,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2600.0,5.0,120.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440.0,2.0,62.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1521.0,3.0,95.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200.0,2.0,51.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1170.0,2.0,38.0,2.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,2732.0,4.0,204.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3300.0,4.0,600.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1310.0,3.0,63.25,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1020.0,6.0,370.0,6.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
df6 = df5.dropna()

In [103]:
df6.shape

(13200, 245)

# NOW We apply  dbscan for outlier Detection

In [114]:
df6[:5]

Unnamed: 0,total_sqft,bath,price,BHK,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1056.0,2.0,39.07,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2600.0,5.0,120.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440.0,2.0,62.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1521.0,3.0,95.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200.0,2.0,51.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [115]:
df6.dtypes

total_sqft              float64
bath                    float64
price                   float64
BHK                     float64
 Devarachikkanahalli      uint8
                         ...   
Yelachenahalli            uint8
Yelahanka                 uint8
Yelahanka New Town        uint8
Yelenahalli               uint8
Yeshwanthpur              uint8
Length: 245, dtype: object

In [116]:
from sklearn.preprocessing import StandardScaler

In [117]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df6)

In [152]:
epsilon = .5
min_samples = 20

In [153]:
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples)
labels = dbscan.fit_predict(X_scaled)

In [154]:
outliers = df6[labels==-1]

In [155]:
df7= df6[labels != -1]

In [156]:
df7.shape

(6839, 245)

In [157]:
outliers.shape

(6361, 245)

In [158]:
df6.shape

(13200, 245)

In [159]:
df7.head()

Unnamed: 0,total_sqft,bath,price,BHK,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1056.0,2.0,39.07,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440.0,2.0,62.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200.0,2.0,51.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1170.0,2.0,38.0,2.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,1310.0,3.0,63.25,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
df7.describe()

Unnamed: 0,total_sqft,bath,price,BHK,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
count,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0,...,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0,6839.0
mean,1293.038938,2.274017,72.943116,2.389969,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.004971,0.058196,0.0,0.02281,0.0,0.0,0.006872
std,374.219324,0.614785,41.684329,0.632658,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.070338,0.23413,0.0,0.149309,0.0,0.0,0.08262
min,11.0,1.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1080.0,2.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1210.0,2.0,61.95,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1464.5,3.0,87.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4111.0,5.0,365.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0


In [171]:
df8 = df7[~(df7.total_sqft<280)]

In [172]:
df8.shape

(6837, 245)

In [173]:
df8.describe()

Unnamed: 0,total_sqft,bath,price,BHK,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
count,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0,...,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0,6837.0
mean,1293.413382,2.274097,72.949242,2.390083,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.004973,0.058213,0.0,0.022817,0.0,0.0,0.006874
std,373.632919,0.614619,41.687188,0.632484,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.070349,0.234162,0.0,0.149331,0.0,0.0,0.082632
min,296.0,1.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1081.0,2.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1210.0,2.0,61.95,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1465.0,3.0,87.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4111.0,5.0,365.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0


# Model building

In [181]:
x=df8.drop(['price'],axis=1)

In [182]:
y=df8.price

In [183]:
x.head()

Unnamed: 0,total_sqft,bath,BHK,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1056.0,2.0,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440.0,2.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200.0,2.0,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1170.0,2.0,2.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,1310.0,3.0,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [184]:
y.head()

0    39.07
2    62.00
4    51.00
5    38.00
8    63.25
Name: price, dtype: float64

In [284]:
from sklearn.model_selection import train_test_split

In [285]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2)

In [286]:
from sklearn.preprocessing import MinMaxScaler

In [287]:
scaler1 = MinMaxScaler()

In [288]:
scaler1.fit(x_train)

MinMaxScaler()

In [289]:
X_train_scaled = scaler1.transform(x_train)
X_test_scaled = scaler1.transform(x_test)

In [290]:
from sklearn.linear_model import LinearRegression

In [291]:
lr = LinearRegression()

In [292]:
lr.fit(X_train_scaled,y_train)

LinearRegression()

In [293]:
lr.score(X_test_scaled,y_test)

0.5964150712932944

In [294]:
lr.score(X_train_scaled,y_train)

0.5932331215596001

In [295]:
from sklearn import linear_model
lasso = linear_model.Lasso()

In [296]:
lasso.fit(X_train_scaled,y_train)

Lasso()

In [297]:
lasso.score(X_test_scaled,y_test)

0.5031968398205324

In [298]:
from sklearn.linear_model import Ridge

In [299]:
rid = Ridge()

In [300]:
rid.fit(X_train_scaled,y_train)

Ridge()

In [301]:
rid.score(X_test_scaled,y_test)

0.5973189189554162

In [302]:
from sklearn import svm

In [303]:
clf = svm.SVR()

In [304]:
clf.fit(X_train_scaled,y_train)

SVR()

In [305]:
clf.score(X_test_scaled,y_test)

0.4864977197296042

In [306]:
from sklearn.ensemble import RandomForestRegressor

In [307]:
regressor = RandomForestRegressor(n_estimators=100,max_depth=30,random_state=42)

In [308]:
regressor.fit(X_train_scaled,y_train)

RandomForestRegressor(max_depth=30, random_state=42)

In [309]:
regressor.score(X_test_scaled,y_test)

0.582696357023762