In [1]:
import glob
import pandas as pd

# Pretend that this mimics an interface that gets the data from a generic
# source. In this case, we are getting the data from a csv file for the test
# case, being in a controlled environment. In a real-life scenario, we would
# get the data from a database, a web service, blob storage, etc.
csv_path = glob.glob('../data/*.csv')

# Dynamically load the data from the csv file.
df = pd.read_csv(csv_path[0])
df.head()

Unnamed: 0,uid,city,description,homeType,latitude,longitude,garageSpaces,hasSpa,yearBuilt,numOfPatioAndPorchFeatures,lotSizeSqFt,avgSchoolRating,MedianStudentsPerTeacher,numOfBathrooms,numOfBedrooms,priceRange
0,1748,austin,MULTIPLE OFFERS submit best & final to Agent b...,Single Family,30.380089,-97.800621,0,False,1988,0,102366.0,7.0,17,4.0,4,650000+
1,13380,austin,"4644 Hoffman Dr, Austin, TX 78749 is a single ...",Single Family,30.199486,-97.859947,0,False,1997,0,6534.0,6.666667,16,3.0,4,350000-450000
2,4115,austin,"6804 Canal St, Austin, TX 78741 is a single fa...",Single Family,30.227398,-97.696083,0,False,1952,0,5619.0,3.333333,11,1.0,2,0-250000
3,6926,austin,Beautiful large lot with established trees. Lo...,Single Family,30.205469,-97.792351,4,False,1976,0,6416.0,4.0,14,2.0,4,0-250000
4,14480,austin,Stunning NW Hills designer remodel by Cedar an...,Single Family,30.345106,-97.767426,2,False,1984,0,10759.0,7.0,16,3.0,5,650000+


In [2]:
# Here we can see that the data does not present lots of missing values. Actually,
# it presents only one missing value in the column 'description'. We'll investigate
# further to see if we can fill this value. Otherwise, we'll drop the row. This
# decision will have little impact in the final result, since we have 10000 rows
# in the dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   uid                         10000 non-null  int64  
 1   city                        10000 non-null  object 
 2   description                 9999 non-null   object 
 3   homeType                    10000 non-null  object 
 4   latitude                    10000 non-null  float64
 5   longitude                   10000 non-null  float64
 6   garageSpaces                10000 non-null  int64  
 7   hasSpa                      10000 non-null  bool   
 8   yearBuilt                   10000 non-null  int64  
 9   numOfPatioAndPorchFeatures  10000 non-null  int64  
 10  lotSizeSqFt                 10000 non-null  float64
 11  avgSchoolRating             10000 non-null  float64
 12  MedianStudentsPerTeacher    10000 non-null  int64  
 13  numOfBathrooms              1000

In [3]:
# Check in which record we'll have the missing value on the description column.
df[df.isna()['description']]

Unnamed: 0,uid,city,description,homeType,latitude,longitude,garageSpaces,hasSpa,yearBuilt,numOfPatioAndPorchFeatures,lotSizeSqFt,avgSchoolRating,MedianStudentsPerTeacher,numOfBathrooms,numOfBedrooms,priceRange
7016,13491,austin,,Single Family,30.189535,-97.897896,2,False,2002,1,10890.0,8.666667,18,3.0,4,450000-650000


In [4]:
# As we can see, the description is a string that contains various information
# about the product. From this perspective, it's unlike that we'll be able to
# fill the information on our own. Therefore, we'll drop the row.

# Other thing interesting analyzing the "uniqueness" of the values is that, even
# after dropping the row with the missing value, we still have 9983 unique values
# for the description column. This means that we have 17 duplicated descriptions
# that we need to check further.
len(df['description'].unique())

9983

In [5]:
# We can see that the duplicated descriptions aren't actually full duplicated
# rows. Knowing that, we don't need to drop them since they are actually
# different rows with different information for the model.
df.duplicated().sum()

# Drop the row with the missing value.
df.dropna(inplace=True)

In [6]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets. We'll use 20% of the data for testing
# and 80% for training.
train_and_val: pd.DataFrame = pd.DataFrame()
test: pd.DataFrame = pd.DataFrame()
train_and_val, test = train_test_split(df, test_size=0.2, random_state=42)

# Then, we'll split the train set into train and validation sets. We'll use 20%
# of the data for validation (representing 16% of the original data) and 80% for
# training (representing 64% of the original data).
train: pd.DataFrame = pd.DataFrame()
val: pd.DataFrame = pd.DataFrame()
train, val = train_test_split(train_and_val, test_size=0.2, random_state=42)

# From now on, we'll use the train set for all of the operations. The validation
# set will be used only to evaluate the model performance, and the test set will
# be used only once, after the model is trained, to evaluate the model performance
# in a real-life scenario.
print('Train set size: ', len(train))
print('Validation set size: ', len(val))
print('Test set size: ', len(test))

Train set size:  6399
Validation set size:  1600
Test set size:  2000


In [7]:
train.iloc[0].to_dict()

{'uid': 14249,
 'city': 'austin',
 'description': 'Beautiful well maintained home on huge lot with mature trees in highly sought after North Oaks Hillside subdivision.  Spacious, light abd bright family room with huge fireplace and high ceilings.  Gorgeous wood floors, plantation shutters, silestone countertops, crown molding and much more.  Conveniently located four bedroom home gently lived in on 1/4(+ or -) acre lot with trees.  Don miss it!',
 'homeType': 'Single Family',
 'latitude': 30.38235092163086,
 'longitude': -97.6693344116211,
 'garageSpaces': 0,
 'hasSpa': False,
 'yearBuilt': 1972,
 'numOfPatioAndPorchFeatures': 1,
 'lotSizeSqFt': 11325.6,
 'avgSchoolRating': 5.0,
 'MedianStudentsPerTeacher': 15,
 'numOfBathrooms': 2.0,
 'numOfBedrooms': 4,
 'priceRange': '250000-350000'}

In [8]:
# Get the categorical columns.
cat_cols = train.select_dtypes(include=['object']).columns.tolist()

for col in cat_cols:
    print(f"df['{col}']: {train[col].nunique()}")

# We can see that one of the categorical columns, 'description', has 6389 unique values,
# which is a lot. This means that we'll have a lot of columns after applying the
# OneHotEncoder. This could be a problem for the model performance, since we'll
# have a lot of columns with few values. Therefore, we'll drop this column later on
# the pipeline.
cat_cols.remove('description')

# Also, "priceRange" is our target column, so we'll remove it from the list of
# categorical columns.
cat_cols.remove('priceRange')

print(cat_cols)

df['city']: 6
df['description']: 6389
df['homeType']: 10
df['priceRange']: 5
['city', 'homeType']


In [9]:
# Get the numerical columns.
num_cols = train.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()

# We'll remove the "uid" column, since it's just an identifier for the product.
num_cols.remove('uid')

num_cols

['latitude',
 'longitude',
 'garageSpaces',
 'hasSpa',
 'yearBuilt',
 'numOfPatioAndPorchFeatures',
 'lotSizeSqFt',
 'avgSchoolRating',
 'MedianStudentsPerTeacher',
 'numOfBathrooms',
 'numOfBedrooms']

In [10]:
# Now, we'll separate the train set into X and y. X will be the input data and y
# will be the target data. Our goal is to predict the price range of a product
# based on the existing features.
X = train.drop('priceRange', axis=1)
y = train['priceRange']

# Create a LabelEncoder for the target column.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

classes_map = {}
for i, cls in enumerate(label_encoder.classes_):
    classes_map[i] = cls

import json

with open('../data/classes_map.json', 'w') as f:
    json.dump(classes_map, f)

classes_map

{0: '0-250000',
 1: '250000-350000',
 2: '350000-450000',
 3: '450000-650000',
 4: '650000+'}

In [11]:
# Now, we'll create a pipeline to transform the data. We'll use the ColumnTransformer
# to apply different transformations to different columns. We'll use the OneHotEncoder
# to transform the categorical columns and the StandardScaler to transform the
# numerical columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

categorical_transformer = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('StandardScaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('drop_columns', 'drop', ['uid', 'description']),
    ('categorical_transformer', categorical_transformer, cat_cols),
    ('numerical_transformer', numerical_transformer, num_cols)
])

pipeline = Pipeline(steps=[
    ('transform', preprocessor),
    ('model', None)
])

X_transformed = pd.DataFrame(pipeline.fit_transform(X), columns=pipeline.named_steps['transform'].get_feature_names_out())
X_transformed

Unnamed: 0,categorical_transformer__city_austin,categorical_transformer__city_del valle,categorical_transformer__city_driftwood,categorical_transformer__city_dripping springs,categorical_transformer__city_manchaca,categorical_transformer__city_pflugerville,categorical_transformer__homeType_Apartment,categorical_transformer__homeType_Condo,categorical_transformer__homeType_Mobile / Manufactured,categorical_transformer__homeType_MultiFamily,...,numerical_transformer__longitude,numerical_transformer__garageSpaces,numerical_transformer__hasSpa,numerical_transformer__yearBuilt,numerical_transformer__numOfPatioAndPorchFeatures,numerical_transformer__lotSizeSqFt,numerical_transformer__avgSchoolRating,numerical_transformer__MedianStudentsPerTeacher,numerical_transformer__numOfBathrooms,numerical_transformer__numOfBedrooms
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.295243,-0.921800,-0.306953,-0.783921,0.319932,-0.024207,-0.419570,0.082920,-0.711646,0.668988
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030187,2.074312,3.257826,-0.597087,2.326053,-0.027970,1.730313,0.654692,-0.711646,-0.552717
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.666307,0.576256,-0.306953,0.150250,-0.683128,-0.037058,0.476215,0.654692,0.300540,-0.552717
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.754902,0.576256,-0.306953,-0.316835,1.322993,-0.031259,-0.598727,-0.488852,-0.711646,-0.552717
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.941295,-0.921800,-0.306953,-0.223418,-0.683128,-0.024991,0.834529,1.226463,-0.711646,0.668988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6394,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.658999,0.576256,3.257826,1.224546,-0.683128,-0.032826,-0.419570,0.082920,0.300540,-0.552717
6395,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.655317,-0.921800,-0.306953,1.317963,-0.683128,-0.021857,0.655372,0.654692,0.300540,0.668988
6396,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.723325,-0.921800,-0.306953,0.850878,-0.683128,-0.034395,-1.136197,0.082920,0.300540,0.668988
6397,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.089625,0.576256,-0.306953,0.850878,-0.683128,-0.034708,-0.240413,0.082920,0.300540,1.890692


In [12]:
# Now, we'll train a model using the transformed data. We'll use the RandomForestClassifier
# since it's a good model for classification problems and it's easy to train.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)

pipeline = Pipeline(steps=[
    ('transform', preprocessor),
    ('model', model)
])

pipeline.fit(X, y)

# Now, we'll evaluate the model performance using the validation set. We'll use
# the accuracy score as the metric to evaluate the model performance.
from sklearn.metrics import accuracy_score

y_pred = pipeline.predict(val)

accuracy_score(y_pred, label_encoder.transform(val['priceRange']))

0.60875

In [13]:
# Pickle the model.
import pickle

with open('../model/model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

# Now, we'll evaluate the model performance using the test set. We'll use
# the accuracy score as the metric to evaluate the model performance.

y_pred = pipeline.predict(test)

accuracy_score(y_pred, label_encoder.transform(test['priceRange']))

0.618

In [19]:
# Define most relevant features for the model.
feature_importances = pipeline.named_steps['model'].feature_importances_

# Get the names of the features.
feature_names = pipeline.named_steps['transform'].get_feature_names_out()

# Create a dataframe with the feature importances.
feature_importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})

# Sort the dataframe by the importance.
feature_importances_df.sort_values('importance', ascending=False, inplace=True)

# Get the top 10 most relevant features.
feature_importances_df.head(10)

Unnamed: 0,feature,importance
16,numerical_transformer__latitude,0.177464
17,numerical_transformer__longitude,0.172139
22,numerical_transformer__lotSizeSqFt,0.148809
20,numerical_transformer__yearBuilt,0.120172
23,numerical_transformer__avgSchoolRating,0.09996
25,numerical_transformer__numOfBathrooms,0.071484
24,numerical_transformer__MedianStudentsPerTeacher,0.054042
18,numerical_transformer__garageSpaces,0.043405
21,numerical_transformer__numOfPatioAndPorchFeatures,0.040984
26,numerical_transformer__numOfBedrooms,0.040766


In [14]:
test.iloc[0]

uid                                                                        8481
city                                                                     austin
description                   Spectacular Property in the highly sought-afte...
homeType                                                          Single Family
latitude                                                              30.413195
longitude                                                            -97.765175
garageSpaces                                                                  0
hasSpa                                                                    False
yearBuilt                                                                  1990
numOfPatioAndPorchFeatures                                                    0
lotSizeSqFt                                                             11325.6
avgSchoolRating                                                             7.0
MedianStudentsPerTeacher                

In [15]:
{
  "uid": 8481,
  "city": 'austin',
  "description": 'Spectacular Property in the highly sought-after Arboretum Area. Beautiful pool & spa w/ tree-top views & fully screened-in porch. Two story entry & multiple living areas, including oversized playroom/flex space & home theatre system. Upstairs features a dramatic master suite w/ spa bathroom & extended walk-in closet.  Super soaker tub, custom walk-in shower w/numerous heads and chandelier. Equipped with Home Automation System. Garage has built-in shelving & elevated storage. Numerous recent upgrades.',
  "homeType": 'Single Family',
  "latitude": 30.41319465637207,
  "longitude": -97.76517486572266,
  "garage_spaces": 0,
  "has_spa": false,
  "year_built": 1990,
  "num_of_patios_and_porch": 0,
  "lot_size_sq_ft": 11325.6,
  "avg_school_rating": 7.0,
  "median_students_per_teacher": 17,
  "num_of_bathrooms": 3,
  "num_of_bedrooms": 4,
}

NameError: name 'false' is not defined