In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split



In [2]:
# Load the data
train_data = pd.read_csv(r"D:\kaggle\house prediction\data.csv")
test_data = pd.read_csv(r"D:\kaggle\house prediction\output.csv")




In [3]:
# Select the features and target variable
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 
            'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 
            'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country']



In [4]:
X = train_data[features]
y = train_data['price']

# Encode categorical features
X = pd.get_dummies(X, columns=['street', 'city', 'statezip', 'country'], drop_first=True)



In [5]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Decision Tree Regressor model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)



In [6]:
# Prepare test data
X_test_data = test_data[features]
X_test_data = pd.get_dummies(X_test_data, columns=['street', 'city', 'statezip', 'country'], drop_first=True)

# Align the test data columns with the training data columns
X_test_data = X_test_data.reindex(columns=X.columns, fill_value=0)

# Predict prices for the test data
test_data['predicted_price'] = dt_model.predict(X_test_data)

# Save the test DataFrame with predictions to a new CSV file
test_data.to_csv(r'D:\kaggle\house prediction\predicted_output_dt.csv', index=False)



In [7]:
# Calculate metrics
test_mae = mean_absolute_error(test_data['price'], test_data['predicted_price'])
test_mse = mean_squared_error(test_data['price'], test_data['predicted_price'])
test_rmse = test_mse ** 0.5
test_r2 = r2_score(test_data['price'], test_data['predicted_price'])

# Print metrics
print(f"Decision Tree Regressor Model Test Accuracy:")
print(f"Test Mean Absolute Error (MAE): {test_mae}")
print(f"Test Mean Squared Error (MSE): {test_mse}")
print(f"Test Root Mean Squared Error (RMSE): {test_rmse}")
print(f"Test R-squared (R²): {test_r2}")



Decision Tree Regressor Model Test Accuracy:
Test Mean Absolute Error (MAE): 37313.69638963456
Test Mean Squared Error (MSE): 198830223860.35092
Test Root Mean Squared Error (RMSE): 445903.82803957956
Test R-squared (R²): 0.37443388037411174


In [8]:
# Print the first few rows of predictions
print(test_data[['date', 'predicted_price']].head())

                  date  predicted_price
0  2014-05-02 00:00:00         313000.0
1  2014-05-02 00:00:00        2384000.0
2  2014-05-02 00:00:00         342000.0
3  2014-05-02 00:00:00         420000.0
4  2014-05-02 00:00:00         550000.0
