In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Loading data
file_path = Path("Resources/AHM_cleaned.csv")
AHM_cleaned_df = pd.read_csv(file_path)
AHM_cleaned_df.head()

Unnamed: 0,zpid,zipcode,latitude,longitude,latestPrice,livingAreaSqFt,latest_salemonth,latest_saleyear
0,111373431,78660,30.430632,-97.663078,305000.0,2601.0,9,2019
1,120900430,78660,30.432673,-97.661697,295000.0,1768.0,10,2020
2,2084491383,78660,30.409748,-97.639771,256125.0,1478.0,7,2019
3,120901374,78660,30.432112,-97.661659,240000.0,1678.0,8,2018
4,60134862,78660,30.437368,-97.65686,239900.0,2132.0,10,2018


In [3]:
# Group large dimensional categories into smaller categories (livingAreaSqFt, latestPrice)
# Find the median, maximum and minimum of these categories
AHM_cleaned_df["livingAreaSqFt"].mean()

2208.3233142179156

In [4]:
# Find the max value
AHM_cleaned_df["livingAreaSqFt"].max()

109292.0

In [5]:
# Find the minimum value
AHM_cleaned_df["livingAreaSqFt"].min()

300.0

In [6]:
# Sort by living AreaSqFt
sorted_df = AHM_cleaned_df.sort_values('livingAreaSqFt')

In [7]:
sorted_df.head()

Unnamed: 0,zpid,zipcode,latitude,longitude,latestPrice,livingAreaSqFt,latest_salemonth,latest_saleyear
1170,94641643,78734,30.381195,-97.915985,850000.0,300.0,7,2018
3518,29420819,78753,30.35033,-97.697739,139000.0,306.0,7,2019
13314,29330022,78703,30.284014,-97.762421,175000.0,378.0,10,2019
10082,29383447,78702,30.254911,-97.709785,300000.0,396.0,7,2018
10118,29385318,78702,30.264668,-97.71759,499000.0,450.0,10,2019


In [8]:
# Create groups small, medium, large homes
AHM_cleaned_df.loc[AHM_cleaned_df['livingAreaSqFt'] < 1800, 'sizeOfHome'] = 0
AHM_cleaned_df.loc[AHM_cleaned_df['livingAreaSqFt'] >= 1800, 'sizeOfHome'] = 1
AHM_cleaned_df.head()

Unnamed: 0,zpid,zipcode,latitude,longitude,latestPrice,livingAreaSqFt,latest_salemonth,latest_saleyear,sizeOfHome
0,111373431,78660,30.430632,-97.663078,305000.0,2601.0,9,2019,1.0
1,120900430,78660,30.432673,-97.661697,295000.0,1768.0,10,2020,0.0
2,2084491383,78660,30.409748,-97.639771,256125.0,1478.0,7,2019,0.0
3,120901374,78660,30.432112,-97.661659,240000.0,1678.0,8,2018,0.0
4,60134862,78660,30.437368,-97.65686,239900.0,2132.0,10,2018,1.0


In [25]:
AHM_cleaned_df["latestPrice"].mean()

512767.7437215741

In [26]:
AHM_cleaned_df["latestPrice"].max()

13500000.0

In [27]:
AHM_cleaned_df["latestPrice"].min()

5500.0

In [28]:
# Create groups small, medium, expensive = 1, inexpensive = 0
AHM_cleaned_df.loc[AHM_cleaned_df['latestPrice'] < 512768, 'priceOfHome'] = 0
AHM_cleaned_df.loc[AHM_cleaned_df['latestPrice'] >= 512768, 'priceOfHome'] = 1
AHM_cleaned_df.head()

Unnamed: 0,zipcode,latitude,longitude,latestPrice,latest_salemonth,latest_saleyear,sizeOfHome,priceOfHome
0,78660,30.430632,-97.663078,305000.0,9,2019,1.0,0.0
1,78660,30.432673,-97.661697,295000.0,10,2020,0.0,0.0
2,78660,30.409748,-97.639771,256125.0,7,2019,0.0,0.0
3,78660,30.432112,-97.661659,240000.0,8,2018,0.0,0.0
4,78660,30.437368,-97.65686,239900.0,10,2018,1.0,0.0


In [29]:
# Drop zpid column since it has no impact on the target
AHM_cleaned_df = AHM_cleaned_df.drop('zpid', axis = 1)
AHM_cleaned_df.head()

KeyError: "['zpid'] not found in axis"

In [11]:
# Drop livingAreaSqFt
AHM_cleaned_df = AHM_cleaned_df.drop('livingAreaSqFt', axis = 1)

In [30]:
# Drop livingAreaSqFt
AHM_cleaned_df = AHM_cleaned_df.drop('latestPrice', axis = 1)

In [33]:
# Define features set (delete the target column in this case "bad") 
# X is the input
X = AHM_cleaned_df.copy()
X = X.drop("priceOfHome", axis=1)
X.head()

Unnamed: 0,zipcode,latitude,longitude,latest_salemonth,latest_saleyear,sizeOfHome
0,78660,30.430632,-97.663078,9,2019,1.0
1,78660,30.432673,-97.661697,10,2020,0.0
2,78660,30.409748,-97.639771,7,2019,0.0
3,78660,30.432112,-97.661659,8,2018,0.0
4,78660,30.437368,-97.65686,10,2018,1.0


In [35]:
# Define target vector
y = AHM_cleaned_df["priceOfHome"].values.reshape(-1, 1)
y[:5]

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [36]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [37]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(11378, 6)
(3793, 6)
(11378, 1)
(3793, 1)


In [38]:
# Splitting into Train and Test sets into an 80/20 split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [39]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(12136, 6)
(3035, 6)
(12136, 1)
(3035, 1)


In [40]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [41]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [42]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [43]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [44]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [45]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)
predictions

array([0., 0., 0., ..., 1., 0., 0.])

In [46]:
# Calculating the accuracy score
#(True Positives (TP) + True Negatives (TN)) / Total = (51 + 20)/125 = 0.568
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8436593725283417