In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the raw listings; the sale-date column is not used as a feature.
data = pd.read_csv('./sample_data/usa_real_estate_dataset.csv')
data = data.drop(['prev_sold_date'], axis=1)

# Integer-encode the categorical columns (pandas gives missing values the code -1).
categorical_columns = ['status', 'city', 'state']
for column in categorical_columns:
    data[column] = data[column].astype('category').cat.codes

# Keep only rows with a usable target: present, finite and at least 2000.
data = data.dropna(subset=['price'])
data = data[np.isfinite(data['price'])]
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
data = data[(data['price'] >= 2000)].copy()
data['price'] += 1

scale_factor = 1

# The regression target is the (scaled) natural log of the price.
data['log_price'] = np.log(data['price']) * scale_factor
features_to_drop = ['price', 'log_price']

# Drop every row that has a NaN/inf in any feature column, using one mask
# instead of re-filtering the frame once per column.
feature_columns = [name for name in data.columns if name not in features_to_drop]
data = data[np.isfinite(data[feature_columns]).all(axis=1)]

X = data.drop(features_to_drop, axis=1)
y = data['log_price']

# Fixed seed so the 80/20 split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Baseline: an unconstrained decision tree on the log-price target.
# random_state is fixed so the baseline (and the grid search that reuses this
# estimator) gives the same result on every run, matching the seed used for
# the other trees in this notebook.
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

# Hold-out metrics, all measured on the log-price scale.
r2 = r2_score(y_test,y_pred)
print(f'r2 Score: {r2}')
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

In [None]:
import matplotlib.pyplot as plt
# Scatter the held-out targets against the model's predictions.
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title('Actual vs Predicted')

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameters for the decision tree.
param_grid = {
    'max_depth': [10,15,17,20],
    'min_samples_split': [120,130, 200, 300],
    'min_samples_leaf': [120,130, 200, 300]
}

# Exhaustive 5-fold search, scored by R^2.
grid_search = GridSearchCV(regressor, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

cv_results = grid_search.cv_results_

# One row per evaluated combination with its mean cross-validated score.
# Selecting rows by parameter value avoids relying on the internal ordering
# of the grid (scikit-learn iterates the parameter names in sorted order, so
# positional slices of `mean_test_score` are easy to get wrong).
results_table = pd.DataFrame(cv_results['params'])
results_table['mean_test_score'] = cv_results['mean_test_score']

fig, axs = plt.subplots(1, 3, figsize=(15, 5))

panels = [
    ('max_depth', 'Max Depth'),
    ('min_samples_split', 'Min Samples Split'),
    ('min_samples_leaf', 'Min Samples Leaf'),
]
for ax, (param_name, label) in zip(axs, panels):
    # Vary one parameter while the other two stay at their first grid value.
    baseline = pd.Series(True, index=results_table.index)
    for other_name, other_values in param_grid.items():
        if other_name != param_name:
            baseline &= results_table[other_name] == other_values[0]
    curve = (
        results_table[baseline]
        .set_index(param_name)['mean_test_score']
        .reindex(param_grid[param_name])
    )
    ax.plot(curve.index, curve.to_numpy(), marker='o')
    ax.set_xlabel(label)
    ax.set_title(f'Effect of {label}')

# The search is scored with R^2, so label the axis accordingly.
axs[0].set_ylabel('Mean Test Score (R2)')

plt.tight_layout()
plt.show()

In [None]:
# Keep the refit winner of the grid search and report its settings and CV score.
best_model = grid_search.best_estimator_

print(
    f'Best parameters: {grid_search.best_params_}',
    f'Best score (r2): {grid_search.best_score_:.2f}',
    sep='\n',
)

In [None]:
# Candidate pruning strengths come from the tuned tree's cost-complexity path.
ccp_path = best_model.cost_complexity_pruning_path(X_train, y_train)
alphas = ccp_path['ccp_alphas']
impurities = ccp_path['impurities']

# For every alpha: fit a default-parameter tree pruned with that alpha and
# record its mean squared error on the held-out split.
trees = []
errors = []
for alpha in alphas:
    tree = DecisionTreeRegressor(random_state=42, ccp_alpha=alpha)
    tree.fit(X_train, y_train)
    trees.append(tree)

    y_pred = tree.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    errors.append(mse)

In [None]:
# Re-train a tree with the hyper-parameters chosen by the grid search.
# The parameters live on the fitted search object (`grid_search.best_params_`);
# no bare `best_params_` name exists in the notebook.
tree = DecisionTreeRegressor(**grid_search.best_params_, random_state=42)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

# Hold-out metrics, all measured on the log-price scale.
r2 = r2_score(y_test,y_pred)
print(f'r2 Score: {r2}')
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

In [None]:
import matplotlib.pyplot as plt
# Scatter the held-out targets against the model's predictions.
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title('Actual vs Predicted')

In [None]:
import matplotlib.pyplot as plt


# Hold-out error as a function of the pruning strength.
fig, ax = plt.subplots()
ax.plot(alphas, errors)
ax.set_xlabel('Alpha')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Alpha vs. Mean Squared Error')
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score, KFold
# Shuffled 5-fold splitter with a fixed seed, reused for every CV run below.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
# Scores come back as negated MSE (larger is better).
scores = cross_val_score(
    best_model,
    X,
    y,
    cv=kf,
    scoring='neg_mean_squared_error',
)

In [None]:
# cross_val_score returned negated MSE values; flip the sign back.
mean_squared_error_scores = -scores
print(f'mean_squared_error scores: {mean_squared_error_scores}')
mean_mean_squared_error = mean_squared_error_scores.mean()
std_mean_squared_error = mean_squared_error_scores.std()

print(f'Mean MSE: {mean_mean_squared_error:.4f}')
print(f'Standard Deviation of MSE: {std_mean_squared_error:.4f}')

# From here on the array holds per-fold RMSE values (the name is kept).
mean_squared_error_scores = np.sqrt(mean_squared_error_scores)
print(f'Mean RMSE: {mean_squared_error_scores.mean():.4f}')
print(f'Standard Deviation of RMSE: {mean_squared_error_scores.std():.4f}')


# Same folds, scored with R^2 this time.
scores = cross_val_score(best_model, X, y, cv=kf, scoring='r2')

r2_scores = scores
print(f'R2 scores: {r2_scores}')
mean_r2 = r2_scores.mean()
std_r2 = r2_scores.std()

print(f'Mean r2: {mean_r2:.2f}')
print(f'Standard Deviation of r2: {std_r2:.4f}')

In [None]:
import os
import pandas as pd

folder_path = "sample_data/added_data"
counts_output_path = "./sample_data/added_data/type_zipcode_counts.csv"
counts_output_name = os.path.basename(counts_output_path)

# Number of rows per zipcode for every per-type CSV found in the folder.
counts_by_type = {}
# sorted() makes the column order independent of the filesystem's listing order.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):
        continue
    # The summary written below lives in this same folder; skip it so that
    # re-running the cell does not treat its own output as an input file.
    if filename == counts_output_name:
        continue

    # A dedicated name is used here so the global `data` frame is not overwritten.
    type_data = pd.read_csv(os.path.join(folder_path, filename))
    type_data = type_data.dropna(subset=['zipcode'])
    if type_data.empty:
        # Nothing to count, and no `type` value to read a label from.
        continue

    # The first value of `type` labels the file's count column
    # (assumes each file holds a single type -- TODO confirm).
    file_type = type_data['type'].unique()[0]
    counts = type_data.groupby('zipcode').size()

    if file_type in counts_by_type:
        # Several files of the same type: add their counts together.
        counts = counts_by_type[file_type].add(counts, fill_value=0)
    counts_by_type[file_type] = counts

# Align all types on the union of zipcodes; a zipcode missing for a type counts as 0.
type_zipcode_counts = (
    pd.DataFrame(counts_by_type)
    .fillna(0)
    .rename_axis('zipcode')
    .reset_index()
)

type_zipcode_counts.to_csv(counts_output_path, index=False)

# NOTE(review): the first cell reads 'usa_real_estate_dataset.csv'; confirm
# that this different file name is intentional.
florida_data = pd.read_csv("./sample_data/usa_real_estate_data.csv")
# Left join: listings whose zipcode has no counts keep NaN in the count columns.
merged_data = pd.merge(florida_data, type_zipcode_counts, on='zipcode', how='left')
merged_data.to_csv("sample_data/merged.csv", index=False)

In [None]:
# Same preparation as for the original listings, applied to the merged frame.
# Every step reads from `merged_data` itself: indexing it with masks or
# columns taken from the unrelated `data` frame mixes up rows and columns.

# The first preprocessing cell drops this column from the listings; do the
# same here (ignored if the column is absent).
merged_data = merged_data.drop(columns=['prev_sold_date'], errors='ignore')

# Integer-encode the categorical columns (pandas gives missing values the code -1).
categorical_columns = ['status', 'city', 'state']
for column in categorical_columns:
    merged_data[column] = merged_data[column].astype('category').cat.codes

# Keep only rows with a usable target: present, finite and at least 2000.
merged_data = merged_data.dropna(subset=['price'])
merged_data = merged_data[np.isfinite(merged_data['price'])]
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
merged_data = merged_data[(merged_data['price'] >= 2000)].copy()
merged_data['price'] += 1

scale_factor = 1

# The regression target is the (scaled) natural log of the price.
merged_data['log_price'] = np.log(merged_data['price']) * scale_factor
features_to_drop = ['price', 'log_price']

# Drop every row that has a NaN/inf in any feature column.
feature_columns = [name for name in merged_data.columns if name not in features_to_drop]
merged_data = merged_data[np.isfinite(merged_data[feature_columns]).all(axis=1)]

X = merged_data.drop(features_to_drop, axis=1)
y = merged_data['log_price']

# Fixed seed so the 80/20 split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Refit the tuned tree on the current training split and score the hold-out set.
y_pred = best_model.fit(X_train, y_train).predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(
    f'r2 Score: {r2}',
    f'Mean Squared Error: {mse}',
    f'Root Mean Squared Error: {rmse}',
    f'Mean Absolute Error: {mae}',
    sep='\n',
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 100 fully grown trees with a fixed seed, trained on the current split.
rf_model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(
    f'r2 Score: {r2}',
    f'Mean Squared Error: {mse}',
    f'Root Mean Squared Error: {rmse}',
    f'Mean Absolute Error: {mae}',
    sep='\n',
)

In [None]:
# Pair each feature column with the importance the forest assigned to it.
feature_importances = rf_model.feature_importances_
feature_names = X.columns
for feature, importance in pd.Series(feature_importances, index=feature_names).items():
    print(f'{feature}: {importance}')

In [None]:
# Bar chart of the forest's per-feature importances.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(feature_names, feature_importances)
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.set_title('Feature Importance using Random Forest')
# Tilt the tick labels so long feature names stay readable.
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Scatter the held-out targets against the model's predictions.
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title('Actual vs Predicted')

In [None]:
rf_trees = rf_model.estimators_
# Candidate cost-complexity pruning strengths.
alphas = np.linspace(0, 0.1, num=10)

# Mean cross-validated MSE per alpha, averaged over the forest's tree configurations.
meaned_mean_squared_error = []

for alpha in alphas:
    neg_mean_squared_error_scores = []
    for tree in rf_trees:
        # Build a fresh, unfitted tree with this member's settings plus the
        # candidate alpha. The forest's own fitted trees are left untouched.
        pruned_tree = DecisionTreeRegressor(**{**tree.get_params(), 'ccp_alpha': alpha})
        # cross_val_score fits its own clones, so no separate fit is needed.
        # Only the training split is used, so the test set does not leak
        # into the choice of alpha.
        cv_score = cross_val_score(pruned_tree, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        neg_mean_squared_error_scores.append(cv_score.mean())

    meaned_neg_mean_squared_error_score = np.mean(neg_mean_squared_error_scores)
    meaned_mean_squared_error_score = -meaned_neg_mean_squared_error_score
    meaned_mean_squared_error.append(meaned_mean_squared_error_score)

# The best alpha is the one with the LOWEST error, hence argmin.
optimal_alpha = alphas[np.argmin(meaned_mean_squared_error)]

print(f'Optimal alpha value: {optimal_alpha:.4f}')


In [None]:
# Configure pruning on the forest itself rather than refitting its member
# trees by hand: hand-fitted members are thrown away as soon as the forest is
# fitted again (which the next cell does), and fitting each member on the full
# training set would also drop the bootstrap sampling that makes the trees
# differ. With the parameter set here, the next `rf_model.fit(...)` grows
# every tree with this pruning strength.
rf_model.set_params(ccp_alpha=optimal_alpha)

In [None]:
# Retrain the forest on the training split and score the hold-out set.
y_pred = rf_model.fit(X_train, y_train).predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(
    f'r2 Score: {r2}',
    f'Mean Squared Error: {mse}',
    f'Root Mean Squared Error: {rmse}',
    f'Mean Absolute Error: {mae}',
    sep='\n',
)

In [None]:
# Hyper-parameter candidates for the random forest.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 means "consider all features", which is what 'auto' meant for a
    # forest regressor. The string 'auto' was removed from scikit-learn and
    # is rejected by current versions.
    'max_features': [1.0, 'sqrt', 'log2']
}
# Exhaustive 5-fold search; the score is negated MSE (larger is better).
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Hold-out metrics for the winning forest.
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R2 Score: {r2}')
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')