In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import seaborn as sns

# Load Dataset

In [None]:
df = pd.read_csv('./dataset/housing.csv')

# Data Preprocessing

In [None]:
label_encoder = LabelEncoder()

In [None]:
df['state_number'] = label_encoder.fit_transform(df['state'])
df['region_numer'] = label_encoder.fit_transform(df['region'])
df['type_number'] = label_encoder.fit_transform(df['type'])

In [None]:
df_train = df[['price', 'type_number', 'sqfeet', 'lat', 'long', 'beds', 'baths', 'state_number']]
df_train = df_train[(df_train['baths'] < 8) & (df_train['beds'] < 9) & (df_train['price'] <= 10000)]
df_train.dropna(inplace=True)

In [None]:
X = df_train.drop(['price'], axis=1)
y = df_train['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model Training

In [None]:
model = GradientBoostingRegressor(n_estimators=500)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
feature_importances = model.feature_importances_

In [None]:
print("\nResults:")
print(f'MSE: {mse}')
print(f'R2: {r2}')

In [None]:
results = pd.DataFrame({'Features': feature_importances,}, index=X.columns)
print(results)

In [None]:
x_axis = range(len(y_test))
plt.figure(figsize=(10,6))
sns.scatterplot(x=x_axis, y=y_test, color='blue' ,label='Test')
sns.scatterplot(x=x_axis, y=y_pred, color='red', label='Predicted')
plt.legend()
plt.show()

In [None]:
joblib.dump(model, 'rent-prediction.pkl')
model = joblib.load('rent-prediction.pkl')

# Model Testing

In [95]:
new_data = {
    'state': ['fl', 'tx'],
    'region': ['jacksonville', 'austin'],
    'type': ['apartment', 'house'],
    'sqfeet': [1200, 2500],
    'lat': [37.7749, 40.7128],
    'long': [-122.4194, -74.0060],
    'beds': [5, 5],
    'baths': [2, 3]
}

In [96]:
new_data_df = pd.DataFrame(new_data)

In [97]:
new_data_df['state_number'] = label_encoder.fit_transform(new_data_df['state'])
new_data_df['region_numer'] = label_encoder.fit_transform(new_data_df['region'])
new_data_df['type_number'] = label_encoder.fit_transform(new_data_df['type'])

In [98]:
new_data_df = new_data_df.drop(['state', 'region', 'type'], axis=1)
X_new = new_data_df[['type_number', 'sqfeet', 'lat', 'long', 'beds', 'baths', 'state_number']]

In [99]:
predictions = model.predict(X_new)
regions = ['Florida', 'Texas']

for region, price in zip(regions, predictions):
    print(f"{region} Rent Price: ${price:,.2f}")

Florida Rent Price: $3,671.16
Texas Rent Price: $2,650.43
