In [60]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pickle

# Read the raw CSV into a DataFrame
file_path = 'dataset.csv'
dataset = pd.read_csv(file_path)

# Collect the columns to discard: the one named 'ID' and any column whose
# name contains 'Unnamed', then remove them in a single drop call
unwanted_columns = [
    name for name in dataset.columns
    if name == 'ID' or 'Unnamed' in name
]
dataset = dataset.drop(columns=unwanted_columns)

# Preview the first rows of the trimmed frame
dataset.head()


Unnamed: 0,Price,Square Footage,Bedrooms,Bathrooms,Floors,Garage,Pool,Central Air,Heating Type,Distance to City Center,Crime Rate,Property Tax,Previous Sale Price
0,16394478,5331,4,1,1,1,No,Yes,Central,49.41,Low,29271.4,5938702
1,13615092,3268,3,1,3,0,Yes,Yes,Electric,19.77,Medium,8456.56,1102914
2,2534489,3686,4,4,2,2,Yes,Yes,Gas,36.76,High,9655.51,3797859
3,14886186,3073,3,2,1,1,No,Yes,,34.39,High,14982.71,6814329
4,9928519,3099,3,2,3,1,No,Yes,Central,11.0,Medium,47837.63,15000939


In [61]:
# Summarise each column's dtype and non-null count to spot missing data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Price                    500 non-null    int64  
 1   Square Footage           500 non-null    int64  
 2   Bedrooms                 500 non-null    int64  
 3   Bathrooms                500 non-null    int64  
 4   Floors                   500 non-null    int64  
 5   Garage                   500 non-null    int64  
 6   Pool                     500 non-null    object 
 7   Central Air              500 non-null    object 
 8   Heating Type             377 non-null    object 
 9   Distance to City Center  500 non-null    float64
 10  Crime Rate               500 non-null    object 
 11  Property Tax             500 non-null    float64
 12  Previous Sale Price      500 non-null    int64  
dtypes: float64(2), int64(7), object(4)
memory usage: 50.9+ KB


In [62]:
# Count missing values per column
dataset.isna().sum()


Price                        0
Square Footage               0
Bedrooms                     0
Bathrooms                    0
Floors                       0
Garage                       0
Pool                         0
Central Air                  0
Heating Type               123
Distance to City Center      0
Crime Rate                   0
Property Tax                 0
Previous Sale Price          0
dtype: int64

In [63]:
# Remove every row that contains at least one missing value, modifying
# `dataset` in place. Per the isna() summary only 'Heating Type' has gaps
# (123 of 500 rows), so this discards those 123 rows.
# NOTE(review): recent pandas versions parse the literal string 'None' as a
# missing value by default in read_csv — confirm these rows are genuinely
# unknown rather than homes recorded as having no heating. TODO confirm
dataset.dropna(inplace=True)

In [64]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 377 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Price                    377 non-null    int64  
 1   Square Footage           377 non-null    int64  
 2   Bedrooms                 377 non-null    int64  
 3   Bathrooms                377 non-null    int64  
 4   Floors                   377 non-null    int64  
 5   Garage                   377 non-null    int64  
 6   Pool                     377 non-null    object 
 7   Central Air              377 non-null    object 
 8   Heating Type             377 non-null    object 
 9   Distance to City Center  377 non-null    float64
 10  Crime Rate               377 non-null    object 
 11  Property Tax             377 non-null    float64
 12  Previous Sale Price      377 non-null    int64  
dtypes: float64(2), int64(7), object(4)
memory usage: 41.2+ KB


In [65]:
from sklearn.preprocessing import LabelEncoder

# Text-valued columns that are converted to integer codes
categorical_columns = ['Pool', 'Central Air', 'Heating Type', 'Crime Rate']

# Fit a separate encoder for each column and keep every fitted instance.
# Re-fitting one shared encoder replaces its learned classes on each
# iteration, so only the last column's label-to-code mapping would survive,
# making it impossible to encode new rows consistently or decode the others.
label_encoders = {}
for col in categorical_columns:
    labelencoder = LabelEncoder()
    dataset[col] = labelencoder.fit_transform(dataset[col])
    label_encoders[col] = labelencoder

# NOTE(review): LabelEncoder assigns codes in sorted label order, so
# 'Crime Rate' becomes High=0, Low=1, Medium=2 rather than a Low < Medium < High
# ranking — confirm this is acceptable for the model.

# Display the encoded dataset to confirm the changes
dataset.head()

Unnamed: 0,Price,Square Footage,Bedrooms,Bathrooms,Floors,Garage,Pool,Central Air,Heating Type,Distance to City Center,Crime Rate,Property Tax,Previous Sale Price
0,16394478,5331,4,1,1,1,0,1,0,49.41,1,29271.4,5938702
1,13615092,3268,3,1,3,0,1,1,1,19.77,2,8456.56,1102914
2,2534489,3686,4,4,2,2,1,1,2,36.76,0,9655.51,3797859
4,9928519,3099,3,2,3,1,0,1,0,11.0,2,47837.63,15000939
5,10826850,4533,4,2,1,0,1,1,2,12.33,1,36971.32,12109537


In [66]:
# Columns to standardise with StandardScaler.
# NOTE(review): the target 'Price' is part of this list, so the fitted scaler
# expects it as an input column and the model is trained on a standardised
# price — confirm this is intended before applying the scaler to
# feature-only data.
numerical_columns = ['Price', 'Square Footage', 'Bedrooms', 'Bathrooms', 'Floors', 'Garage', 'Distance to City Center', 'Property Tax', 'Previous Sale Price']

# Determine which row positions end up in the training split so the scaler's
# statistics come from those rows only; fitting on every row lets test-set
# statistics leak into preprocessing. test_size and random_state must stay
# equal to the values used by the train_test_split call that builds
# X_train / X_test from this dataset, otherwise the two partitions diverge.
row_positions = list(range(len(dataset)))
train_positions, held_out_positions = train_test_split(
    row_positions, test_size=0.2, random_state=42
)

scaler = StandardScaler()
scaler.fit(dataset[numerical_columns].iloc[train_positions])

# Apply the train-derived mean and variance to all rows
dataset[numerical_columns] = scaler.transform(dataset[numerical_columns])

In [67]:
# Target is the 'Price' column; every remaining column is a predictor
y = dataset['Price']
X = dataset.drop(columns='Price')

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [68]:
# Hyper-parameters for the forest: 100 trees and a fixed seed
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)

# Train on the training split; the fitted estimator is the cell's output
model.fit(X_train, y_train)

In [69]:
# Score the fitted model on the training split first, then on the held-out split
train_score, test_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

In [70]:
# Report both scores, one per line. In the recorded run the train score is
# about 0.85 while the test score is slightly negative, i.e. the model does
# not generalise to the held-out rows.
print(
    f'Train Score: {train_score}',
    f'Test Score: {test_score}',
    sep='\n',
)

Train Score: 0.8526114378640736
Test Score: -0.02001537488348748


In [73]:
# Bundle the trained model, the fitted scaler and the list of categorical
# column names into one tuple and serialise it to disk.
# NOTE(review): only the column *names* are stored here; no fitted label
# encoder is part of the tuple, and the scaler was fitted on a column list
# that includes 'Price' — confirm the consumer of this file can rebuild the
# same preprocessing.
artifacts = (model, scaler, categorical_columns)
with open('model_house.pkl', 'wb') as file:
    pickle.dump(artifacts, file)