In [1]:
# Initial imports.
import numpy as np
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the cleaned housing dataset from disk.
house_data = pd.read_csv(
    "Raw Data/clean_house_data.csv",
    low_memory=False,
)
# Wrap the loaded data in the working frame used by the cells that follow.
imp_df = pd.DataFrame(house_data)

# Preview the first rows.
imp_df.head()

Unnamed: 0,BATHRM,HF_BATHRM,AC,ROOMS,BEDRM,SALEDATE,KITCHENS,FIREPLACES,LANDAREA,LONGITUDE,LATITUDE,QUADRANT,PRICE,ZIPCODE,WARD
0,3,1,Y,9,5,2016.0,2.0,4,1680,-77.040678,38.914684,NW,2100000.0,20009.0,2
1,3,1,Y,8,5,2006.0,2.0,3,1680,-77.040629,38.914683,NW,1602000.0,20009.0,2
2,3,2,Y,10,5,2010.0,1.0,4,2196,-77.039715,38.914331,NW,1950000.0,20009.0,2
3,3,1,Y,8,4,2011.0,2.0,1,1627,-77.040129,38.915408,NW,1050000.0,20009.0,2
4,3,1,Y,7,3,2018.0,2.0,1,1424,-77.039903,38.915017,NW,1430000.0,20009.0,2


In [3]:
# Remove the coordinate columns, then list the dtype of every remaining column.
imp_df = imp_df.drop(labels=['LONGITUDE', 'LATITUDE'], axis="columns")
imp_df.dtypes

BATHRM          int64
HF_BATHRM       int64
AC             object
ROOMS           int64
BEDRM           int64
SALEDATE      float64
KITCHENS      float64
FIREPLACES      int64
LANDAREA        int64
QUADRANT       object
PRICE         float64
ZIPCODE       float64
WARD            int64
dtype: object

In [4]:
# Names of the object-dtype columns, i.e. the ones that need encoding.
imp_cat = [
    column
    for column, dtype in imp_df.dtypes.items()
    if dtype == "object"
]


In [5]:
# Create a OneHotEncoder instance. The default (sparse) output is requested and
# densified below, because the keyword that selects dense output was renamed
# between scikit-learn releases (sparse -> sparse_output).
enc = OneHotEncoder()

# Fit and transform the categorical columns. The result keeps imp_df's row
# index so the encoded frame lines up with imp_df when both are merged on index.
encoded_values = enc.fit_transform(imp_df[imp_cat]).toarray()
encode_df = pd.DataFrame(encoded_values, index=imp_df.index)

# Add the encoded variable names to the dataframe. get_feature_names_out
# replaced get_feature_names (removed in scikit-learn 1.2); fall back to the
# old method on installs that predate the new one.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(imp_cat)
else:
    encode_df.columns = enc.get_feature_names(imp_cat)
encode_df.head()



Unnamed: 0,AC_0,AC_N,AC_Y,QUADRANT_NE,QUADRANT_NW,QUADRANT_SE,QUADRANT_SW
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [6]:
# Drop the AC_0 indicator column.
# NOTE(review): AC_0 presumably marks a placeholder/unknown AC value - confirm
# against the data-cleaning step.
encode_df = encode_df.drop(columns='AC_0')
# Merge one-hot encoded features on the shared row index and drop the originals.
imp_df = imp_df.merge(encode_df, left_index=True, right_index=True)
# Name the axis explicitly: the positional form drop(labels, 1) is deprecated
# and rejected by pandas 2.x.
imp_df = imp_df.drop(columns=imp_cat)
imp_df.head()

  after removing the cwd from sys.path.


Unnamed: 0,BATHRM,HF_BATHRM,ROOMS,BEDRM,SALEDATE,KITCHENS,FIREPLACES,LANDAREA,PRICE,ZIPCODE,WARD,AC_N,AC_Y,QUADRANT_NE,QUADRANT_NW,QUADRANT_SE,QUADRANT_SW
0,3,1,9,5,2016.0,2.0,4,1680,2100000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
1,3,1,8,5,2006.0,2.0,3,1680,1602000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
2,3,2,10,5,2010.0,1.0,4,2196,1950000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
3,3,1,8,4,2011.0,2.0,1,1627,1050000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
4,3,1,7,3,2018.0,2.0,1,1424,1430000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0


In [7]:
# Keep only the rows whose SALEDATE is 2000 or later.
sold_2000_onward = imp_df["SALEDATE"].ge(2000)
new_df = imp_df[sold_2000_onward]
new_df

Unnamed: 0,BATHRM,HF_BATHRM,ROOMS,BEDRM,SALEDATE,KITCHENS,FIREPLACES,LANDAREA,PRICE,ZIPCODE,WARD,AC_N,AC_Y,QUADRANT_NE,QUADRANT_NW,QUADRANT_SE,QUADRANT_SW
0,3,1,9,5,2016.0,2.0,4,1680,2100000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
1,3,1,8,5,2006.0,2.0,3,1680,1602000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
2,3,2,10,5,2010.0,1.0,4,2196,1950000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
3,3,1,8,4,2011.0,2.0,1,1627,1050000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
4,3,1,7,3,2018.0,2.0,1,1424,1430000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55023,2,0,10,4,2004.0,2.0,0,5837,95000.0,20032.0,8,1.0,0.0,0.0,0.0,0.0,1.0
55024,2,0,10,4,2016.0,2.0,0,5302,185000.0,20032.0,8,0.0,1.0,0.0,0.0,0.0,1.0
55025,2,0,10,5,2012.0,2.0,0,5348,100000.0,20032.0,8,1.0,0.0,0.0,0.0,0.0,1.0
55026,2,0,10,4,2017.0,2.0,0,3046,215000.0,20032.0,8,1.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# Randomly discard a fixed number of rows to shrink the dataset.
remove_n = 35000
# Use a dedicated, seeded generator so the same rows are dropped on every run;
# without a seed every downstream split, fit and score changes between runs.
sample_rng = np.random.default_rng(78)
drop_indices = sample_rng.choice(new_df.index.to_numpy(), remove_n, replace=False)
new_new_df = new_df.drop(drop_indices)
new_new_df

Unnamed: 0,BATHRM,HF_BATHRM,ROOMS,BEDRM,SALEDATE,KITCHENS,FIREPLACES,LANDAREA,PRICE,ZIPCODE,WARD,AC_N,AC_Y,QUADRANT_NE,QUADRANT_NW,QUADRANT_SE,QUADRANT_SW
0,3,1,9,5,2016.0,2.0,4,1680,2100000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
8,3,1,14,5,2016.0,1.0,3,2090,33232.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
9,1,0,6,3,2006.0,1.0,0,2090,907400.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
10,2,1,5,3,2012.0,1.0,1,2090,1065000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
17,3,1,9,4,2016.0,2.0,2,1853,1550000.0,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55000,2,1,6,3,2017.0,1.0,0,1834,350000.0,20032.0,8,0.0,1.0,0.0,0.0,0.0,1.0
55012,2,0,7,5,2016.0,1.0,2,4480,336000.0,20032.0,8,0.0,1.0,0.0,0.0,1.0,0.0
55021,1,1,7,3,2012.0,1.0,0,1919,215000.0,20032.0,8,0.0,1.0,0.0,0.0,1.0,0.0
55023,2,0,10,4,2004.0,2.0,0,5837,95000.0,20032.0,8,1.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Feature matrix: every column except the PRICE target.
# drop() returns a new frame, so new_new_df itself is left untouched.
X = new_new_df.drop(columns="PRICE")
X.head()

Unnamed: 0,BATHRM,HF_BATHRM,ROOMS,BEDRM,SALEDATE,KITCHENS,FIREPLACES,LANDAREA,ZIPCODE,WARD,AC_N,AC_Y,QUADRANT_NE,QUADRANT_NW,QUADRANT_SE,QUADRANT_SW
0,3,1,9,5,2016.0,2.0,4,1680,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
8,3,1,14,5,2016.0,1.0,3,2090,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
9,1,0,6,3,2006.0,1.0,0,2090,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
10,2,1,5,3,2012.0,1.0,1,2090,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0
17,3,1,9,4,2016.0,2.0,2,1853,20009.0,2,0.0,1.0,0.0,1.0,0.0,0.0


In [10]:
# Define target set: the PRICE column as a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas; to_numpy() is the supported
# way to get the same array.
y = new_new_df['PRICE'].to_numpy()
y[:5]

array([2100000.,   33232.,  907400., 1065000., 1550000.])

In [11]:
# Hold out a test set; the fixed random_state makes the split repeatable for a
# given X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [12]:
# Build the scaler and fit it on the training features only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both the training and the test features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [17]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators = 100, random_state = 1)

In [18]:
#Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [19]:
predictions = rf_model.predict(X_test_scaled)

In [20]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

ValueError: Shape of passed values is (2157, 2157), indices imply (2, 2)

In [21]:
acc_score = accuracy_score(y_test, predictions)

In [23]:
# Displaying results
print("Confusion Matrix")
#display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix
Accuracy Score : 0.011841326228537596
Classification Report
              precision    recall  f1-score   support

      5115.0       0.00      0.00      0.00         1
      7716.0       0.00      0.00      0.00         1
     10000.0       0.00      0.00      0.00         2
     18500.0       0.00      0.00      0.00         1
     20000.0       0.00      0.00      0.00         0
     21666.0       0.00      0.00      0.00         1
     24294.0       0.00      0.00      0.00         1
     26000.0       0.00      0.00      0.00         1
     30000.0       0.00      0.00      0.00         1
     33000.0       0.00      0.00      0.00         0
     33232.0       0.00      0.00      0.00         1
     34500.0       0.00      0.00      0.00         0
     38000.0       0.00      0.00      0.00         0
     40000.0       0.00      0.00      0.00         0
     43000.0       0.00      0.00      0.00         1
     46000.0       0.00      0.00      0.00         1
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Read the per-feature importance scores from the fitted forest and display
# them; the values appear in the same order as the columns of X.
importancies = rf_model.feature_importances_
importancies

array([0.05541234, 0.05734224, 0.11439157, 0.0762582 , 0.18886745,
       0.02382584, 0.04164621, 0.29015321, 0.06632515, 0.03840452,
       0.01151181, 0.01166551, 0.00956689, 0.00518419, 0.00783079,
       0.00161408])

In [25]:
# Pair each importance score with its column name and list the pairs from the
# highest score to the lowest.
scored_features = zip(rf_model.feature_importances_, X.columns)
sorted(scored_features, reverse=True)

[(0.2901532096773202, 'LANDAREA'),
 (0.18886744649106174, 'SALEDATE'),
 (0.1143915718285885, 'ROOMS'),
 (0.07625819781276409, 'BEDRM'),
 (0.0663251464160761, 'ZIPCODE'),
 (0.05734223796410527, 'HF_BATHRM'),
 (0.055412337211251, 'BATHRM'),
 (0.04164620680134112, 'FIREPLACES'),
 (0.038404524217277865, 'WARD'),
 (0.023825840919252123, 'KITCHENS'),
 (0.011665513936489444, 'AC_Y'),
 (0.011511810007529297, 'AC_N'),
 (0.009566890449560085, 'QUADRANT_NE'),
 (0.007830789930238974, 'QUADRANT_SE'),
 (0.0051841947920676975, 'QUADRANT_NW'),
 (0.001614081545076518, 'QUADRANT_SW')]