In [53]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR



In [54]:
df = pd.read_csv(r"E:\ny_housing_project\notebooks\price_house_clean.csv")
df.head()

Unnamed: 0,BROKERTITLE,TYPE,PRICE,BEDS,BATH,FORMATTED_ADDRESS,LATITUDE,LONGITUDE,PROPERTYM2,BOROUGH,price_outlier,BEDS_GROUP,BATH_GROUP,NEEDS_MANUAL_CHECK,PRICE_PER_SQM,Price_Segment
0,douglas elliman,condo,315000,2,2,"Regis Residence, 2 E 55th St #803, New York, N...",40.761255,-73.974483,130.0642,Manhattan,False,2,2,True,2421.880887,Low
1,serhant,condo,195000000,7,10,"217 W 57th St, New York, NY 10019, USA",40.766393,-73.980991,1629.983135,Manhattan,True,7,10,True,119633.13964,High
2,sowae corp,house,260000,4,2,"620 Sinclair Ave, Staten Island, NY 10312, USA",40.541805,-74.196109,187.199545,Staten Island,False,4,2,True,1388.892265,Low
3,compass,condo,69000,3,1,"2 E 55th St, New York, NY 10022, USA",40.761398,-73.974613,41.341835,Manhattan,False,3,1,True,1669.011547,Low
4,sotheby's,townhouse,55000000,7,2,"5 E 64th St, New York, NY 10065, USA",40.767224,-73.969856,1316.900025,Manhattan,True,7,2,True,41764.749758,High


In [55]:
lat0, lon0 = 40.7580, -73.9855
R = 6371  

def haversine(lat1, lon1, lat2, lon2):
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)

    a = np.sin(dphi/2)**2 + np.cos(phi1)*np.cos(phi2)*np.sin(dlambda/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

df["DIST_CENTER"] = haversine(df["LATITUDE"], df["LONGITUDE"], lat0, lon0)
df["DIST_CENTER"].describe()

count    4231.000000
mean       12.129578
std         7.189177
min         0.260374
25%         5.707670
50%        12.745015
75%        16.953205
max        36.130012
Name: DIST_CENTER, dtype: float64

In [56]:
df1 = df.drop(columns =['Price_Segment','NEEDS_MANUAL_CHECK','BATH_GROUP','BEDS_GROUP','BROKERTITLE','PRICE_PER_SQM','FORMATTED_ADDRESS'])
df1

Unnamed: 0,TYPE,PRICE,BEDS,BATH,LATITUDE,LONGITUDE,PROPERTYM2,BOROUGH,price_outlier,DIST_CENTER
0,condo,315000,2,2,40.761255,-73.974483,130.064200,Manhattan,False,0.995968
1,condo,195000000,7,10,40.766393,-73.980991,1629.983135,Manhattan,True,1.007620
2,house,260000,4,2,40.541805,-74.196109,187.199545,Staten Island,False,29.893211
3,condo,69000,3,1,40.761398,-73.974613,41.341835,Manhattan,False,0.991766
4,townhouse,55000000,7,2,40.767224,-73.969856,1316.900025,Manhattan,True,1.669674
...,...,...,...,...,...,...,...,...,...,...
4226,multi family home,2300000,3,3,40.598830,-73.985964,218.786565,Manhattan,False,17.698962
4227,multi family home,799000,6,2,40.902780,-73.854059,185.806000,Bronx,False,19.531267
4228,multi family home,1380000,12,3,40.712448,-73.800851,249.072943,Queens,False,16.361568
4229,multi family home,1700000,3,7,40.651222,-74.005780,729.660162,Brooklyn,False,11.995599


In [57]:
df['PRICE'] = np.log(df['PRICE'])

In [58]:
df_outlier= df.query("price_outlier == True")
df_normal = df.query("price_outlier == False")

df_outlier.describe()

Unnamed: 0,PRICE,BEDS,BATH,LATITUDE,LONGITUDE,PROPERTYM2,BEDS_GROUP,BATH_GROUP,PRICE_PER_SQM,DIST_CENTER
count,488.0,488.0,488.0,488.0,488.0,488.0,488.0,488.0,488.0,488.0
mean,15.814309,5.153689,4.442623,40.745275,-73.976363,440.939266,4.620902,4.161885,29134.925577,4.806505
std,0.698448,4.829638,3.813516,0.048809,0.037969,523.447179,2.073033,2.199594,29785.845854,4.375019
min,14.94659,1.0,0.0,40.525422,-74.166603,119.194549,1.0,0.0,1134.20132,0.261531
25%,15.224968,3.0,2.0,40.721347,-73.997818,202.919463,3.0,2.0,15672.362335,2.171959
50%,15.655663,4.0,4.0,40.751647,-73.978614,282.889635,4.0,4.0,21327.318296,3.379883
75%,16.230862,6.0,5.0,40.775161,-73.963525,485.72011,6.0,5.0,32037.091266,5.272349
max,19.08851,50.0,50.0,40.898508,-73.740176,6088.398105,10.0,10.0,295683.810276,30.038445


In [59]:
df_normal.describe()

Unnamed: 0,PRICE,BEDS,BATH,LATITUDE,LONGITUDE,PROPERTYM2,BEDS_GROUP,BATH_GROUP,PRICE_PER_SQM,DIST_CENTER
count,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0,3743.0
mean,13.48901,3.143735,2.093775,40.713224,-73.932936,172.450591,3.091638,2.080417,5861.193476,13.084337
std,0.715892,2.164079,1.436718,0.090065,0.102478,98.436547,1.91408,1.284783,3900.992179,6.930222
min,10.809728,1.0,1.0,40.499798,-74.253033,23.22575,1.0,1.0,369.604763,0.260374
25%,13.014778,2.0,1.0,40.634777,-73.981494,102.1933,2.0,1.0,3326.442866,8.212134
50%,13.526494,3.0,2.0,40.724424,-73.936992,200.67048,3.0,2.0,4858.208218,13.708729
75%,13.997832,4.0,2.0,40.772306,-73.858593,202.919463,4.0,2.0,7365.239752,17.393143
max,14.914123,24.0,32.0,40.912729,-73.70245,1950.963,10.0,10.0,32035.461462,36.130012


In [60]:
numeric_features = ['BEDS', 'BATH', 'PROPERTYM2','DIST_CENTER']
categorical_features = ['TYPE', 'BOROUGH']

In [61]:


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])
pipelines = {
    "Linear": Pipeline([("preprocessor", preprocessor), ("regressor", LinearRegression())]),
    "Lasso": Pipeline([("preprocessor", preprocessor), ("regressor", Lasso(alpha=0.1))]),
    "Ridge": Pipeline([("preprocessor", preprocessor), ("regressor", Ridge(alpha=1.0))]),
    "RandomForest": Pipeline([("preprocessor", preprocessor), ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))]),
    "SVR": Pipeline([("preprocessor", preprocessor), ("regressor", SVR(kernel='rbf'))])
}



In [62]:
X_normal = df_normal[numeric_features + categorical_features]
y_normal = df_normal['PRICE']

Xn_train, Xn_test, yn_train, yn_test = train_test_split( X_normal, y_normal, test_size=0.2, random_state=42)


meo = []

for name, pipe in pipelines.items():
    pipe.fit(Xn_train, yn_train)     # Train model
    y_pred_train = pipe.predict(Xn_train)     # Dự đoán
    y_pred_test = pipe.predict(Xn_test)

    rmse_train = np.sqrt(mean_squared_error(yn_train, y_pred_train)) # Tính metric cho train
    r2_train = r2_score(yn_train, y_pred_train)
    
    rmse_test = np.sqrt(mean_squared_error(yn_test, y_pred_test))     # Tính metric cho test
    r2_test = r2_score(yn_test, y_pred_test)
    
    meo.append((name, rmse_train, r2_train, rmse_test, r2_test))

# In kết quả
for name, rmse_tr, r2_tr, rmse_te, r2_te in meo:
    print(f"{name:<15} Train RMSE: {rmse_tr:.3f}, R²: {r2_tr:.3f} | Test RMSE: {rmse_te:.3f}, R²: {r2_te:.3f}")


Linear          Train RMSE: 0.468, R²: 0.576 | Test RMSE: 0.460, R²: 0.575
Lasso           Train RMSE: 0.599, R²: 0.304 | Test RMSE: 0.581, R²: 0.320
Ridge           Train RMSE: 0.468, R²: 0.576 | Test RMSE: 0.459, R²: 0.575
RandomForest    Train RMSE: 0.137, R²: 0.964 | Test RMSE: 0.355, R²: 0.746
SVR             Train RMSE: 0.341, R²: 0.775 | Test RMSE: 0.371, R²: 0.723


In [None]:
best_model = pipelines["RandomForest"]
best_model.fit(Xn_train, yn_train)

y_pred_test = best_model.predict(Xn_test)
y_pred_train = best_model.predict(Xn_train)

rmse_test = np.sqrt(mean_squared_error(yn_test, y_pred_test))
r2_test   = r2_score(yn_test, y_pred_test)

rmse_train = np.sqrt(mean_squared_error(yn_train, y_pred_train))
r2_train   = r2_score(yn_train, y_pred_train)

print(f"RandomForest: Train RMSE={rmse_train:.3f}, R²={r2_train:.3f}")
print(f"RandomForest: Test  RMSE={rmse_test:.3f}, R²={r2_test:.3f}")



RandomForest: Train RMSE=0.137, R²=0.964
RandomForest: Test  RMSE=0.355, R²=0.746


In [71]:
data = {
    'BEDS': int(input("Số phòng ngủ: ")),
    'BATH': int(input("Số phòng tắm: ")),
    'PROPERTYM2': float(input("Diện tích (m2): ")),
    'DIST_CENTER': float(input("Khoảng cách đến trung tâm (km): ")),
    'TYPE': input("Loại nhà:  "),
    'BOROUGH': input("Khu vực:  ")
}
  
new_data = pd.DataFrame([data])
predicted_price = best_model.predict(new_data)

print(f"Giá nhà dự đoán: {predicted_price[0]:,.2f}")


ValueError: Found unknown categories ['House'] in column 0 during transform

### Borough

In [64]:
# results_by_borough = {}

# for borough, df_sub in df_normal.groupby("BOROUGH"):
#     print(f"\n===== {borough} =====")
#     X_sub = df_sub[numeric_features + categorical_features]
#     y_sub = df_sub['PRICE']
    
#     X_train, X_test, y_train, y_test = train_test_split(
#         X_sub, y_sub, test_size=0.2, random_state=42
#     )
    
#     results = []
#     for name, pipe in pipelines.items():
#         pipe.fit(X_train, y_train)
#         y_pred = pipe.predict(X_test)
#         rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#         r2 = r2_score(y_test, y_pred)
#         results.append((name, rmse, r2))
#         print(f"{name:<15} RMSE: {rmse:.3f}, R²: {r2:.3f}")
    
#     results_by_borough[borough] = results


### luxury

In [65]:
X_normal = df[numeric_features + categorical_features]
y_normal = np.log(df['PRICE'])

Xn_train, Xn_test, yn_train, yn_test = train_test_split( X_normal, y_normal, test_size=0.2, random_state=42)

results = []

for name, pipe in pipelines.items():
    pipe.fit(Xn_train, yn_train)
    y_pred = pipe.predict(Xn_test)
    rmse = np.sqrt(mean_squared_error(yn_test, y_pred))
    r2 = r2_score(yn_test, y_pred)
    results.append((name, rmse, r2))

for name, rmse, r2 in results:
    print(f"{name:<15} RMSE: {rmse:.3f}, R²: {r2:.3f}")


Linear          RMSE: 0.048, R²: 0.600
Lasso           RMSE: 0.076, R²: -0.000
Ridge           RMSE: 0.048, R²: 0.601
RandomForest    RMSE: 0.034, R²: 0.795
SVR             RMSE: 0.045, R²: 0.646


In [66]:
numeric_features = [ 'PROPERTYM2','DIST_CENTER']
categorical_features = ['BEDS', 'BATH','TYPE', 'BOROUGH']

In [67]:
X_normal = df_normal[numeric_features + categorical_features]
y_normal = df_normal['PRICE']

Xn_train, Xn_test, yn_train, yn_test = train_test_split( X_normal, y_normal, test_size=0.2, random_state=42)


results = []

for name, pipe in pipelines.items():
    pipe.fit(Xn_train, yn_train)     # Train model
    y_pred_train = pipe.predict(Xn_train)     # Dự đoán
    y_pred_test = pipe.predict(Xn_test)

    rmse_train = np.sqrt(mean_squared_error(yn_train, y_pred_train)) # Tính metric
    r2_train = r2_score(yn_train, y_pred_train)
    
    rmse_test = np.sqrt(mean_squared_error(yn_test, y_pred_test))
    r2_test = r2_score(yn_test, y_pred_test)
    
    results.append((name, rmse_train, r2_train, rmse_test, r2_test))

# In kết quả
for name, rmse_tr, r2_tr, rmse_te, r2_te in results:
    print(f"{name:<15} Train RMSE: {rmse_tr:.3f}, R²: {r2_tr:.3f} | Test RMSE: {rmse_te:.3f}, R²: {r2_te:.3f}")


Linear          Train RMSE: 0.468, R²: 0.576 | Test RMSE: 0.460, R²: 0.575
Lasso           Train RMSE: 0.599, R²: 0.304 | Test RMSE: 0.581, R²: 0.320
Ridge           Train RMSE: 0.468, R²: 0.576 | Test RMSE: 0.459, R²: 0.575
RandomForest    Train RMSE: 0.137, R²: 0.964 | Test RMSE: 0.355, R²: 0.746
SVR             Train RMSE: 0.341, R²: 0.775 | Test RMSE: 0.371, R²: 0.723


In [68]:
rf_model = pipelines["RandomForest"]
rf_model.fit(Xn_train, yn_train)

y_pred = rf_model.predict(Xn_test)
rmse = np.sqrt(mean_squared_error(yn_test, y_pred))
r2   = r2_score(yn_test, y_pred)

print(f"RandomForest: RMSE={rmse:.3f}, R²={r2:.3f}")


RandomForest: RMSE=0.355, R²=0.746
