In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [47]:
df = pd.read_csv('./vietnam_housing_dataset.csv')
df.head(5)

Unnamed: 0,Address,Area,Frontage,Access Road,House direction,Balcony direction,Floors,Bedrooms,Bathrooms,Legal status,Furniture state,Price
0,"Dự án The Empire - Vinhomes Ocean Park 2, Xã L...",84.0,,,,,4.0,,,Have certificate,,8.6
1,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",60.0,,,,,5.0,,,,,7.5
2,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",90.0,6.0,13.0,Đông - Bắc,Đông - Bắc,5.0,,,Sale contract,,8.9
3,"Đường Nguyễn Văn Khối, Phường 11, Gò Vấp, Hồ C...",54.0,,3.5,Tây - Nam,Tây - Nam,2.0,2.0,3.0,Have certificate,Full,5.35
4,"Đường Quang Trung, Phường 8, Gò Vấp, Hồ Chí Minh",92.0,,,Đông - Nam,Đông - Nam,2.0,4.0,4.0,Have certificate,Full,6.9


In [48]:
def split_address(address):
    parts = address.split(",")
    if len(parts) >= 3:
        ward_commune = parts[-3].strip()  
        district = parts[-2].strip()      
        province = parts[-1].strip()     
        return ward_commune, district, province
    return None, None, None

df[["Ward/Commune", "District", "Province"]] = df["Address"].apply(split_address).apply(pd.Series)


In [13]:
df.head(3)

Unnamed: 0,Address,Area,Frontage,Access Road,House direction,Balcony direction,Floors,Bedrooms,Bathrooms,Legal status,Furniture state,Price,Ward/Commune,District,Province
0,"Dự án The Empire - Vinhomes Ocean Park 2, Xã L...",84.0,,,,,4.0,,,Have certificate,,8.6,Xã Long Hưng,Văn Giang,Hưng Yên
1,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",60.0,,,,,5.0,,,,,7.5,Xã Nghĩa Trụ,Văn Giang,Hưng Yên
2,"Dự án The Crown - Vinhomes Ocean Park 3, Xã Ng...",90.0,6.0,13.0,Đông - Bắc,Đông - Bắc,5.0,,,Sale contract,,8.9,Xã Nghĩa Trụ,Văn Giang,Hưng Yên


In [49]:
df.isnull().sum()

Address                  0
Area                     0
Frontage             11564
Access Road          13297
House direction      21239
Balcony direction    24983
Floors                3603
Bedrooms              5162
Bathrooms             7074
Legal status          4506
Furniture state      14119
Price                    0
Ward/Commune             5
District                 5
Province                 5
dtype: int64

In [50]:
num_rows = df.shape[0]  
print(f"Số dòng của DataFrame: {num_rows}")

Số dòng của DataFrame: 30229


In [51]:
df = df.dropna(subset=["Ward/Commune", "District", "Province"])

In [52]:
num_rows = df.shape[0]  
print(f"Số dòng của DataFrame: {num_rows}")

Số dòng của DataFrame: 30224


In [53]:
def contains_prefix(text):
    """Checks if a string contains 'Xã', 'Phường', or 'Thị trấn'."""
    prefixes = ["Xã ", "Phường ", "Thị trấn "]
    for prefix in prefixes:
        if prefix in text:
            return True
    return False

df= df[df['Ward/Commune'].apply(contains_prefix)]

In [54]:
df['Ward/Commune'] = df['Ward/Commune'].str.replace(r"^(Xã |Phường |Thị trấn )", "", regex=True)

In [55]:
num_rows = df.shape[0] 
print(f"Số dòng của DataFrame: {num_rows}")

Số dòng của DataFrame: 30019


In [59]:
file_name = "vietnam_housing_dataset2.csv"
df.to_csv(file_name, index=False, encoding='utf-8-sig')

In [40]:
num_rows = df.shape[0]  # Số dòng của DataFrame
print(f"Số dòng của DataFrame: {num_rows}")

Số dòng của DataFrame: 30019


In [41]:
df.isnull().sum()

Address                  0
Area                     0
Frontage             11479
Access Road          13194
House direction      21114
Balcony direction    24833
Floors                3556
Bedrooms              5095
Bathrooms             7002
Legal status          4480
Furniture state      13988
Price                    0
Ward/Commune             0
District                 0
Province                 0
dtype: int64

In [56]:
columns_to_drop = ['Address', 'House direction', 'Balcony direction','Furniture state','Legal status']

df = df.drop(columns_to_drop, axis=1)

In [43]:
df = df.dropna()

In [57]:
num_rows = df.shape[0]  
print(f"Số dòng của DataFrame: {num_rows}")

Số dòng của DataFrame: 30019


In [53]:
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [54]:
categorical_columns = df.select_dtypes(include=[object]).columns
for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [58]:
df.head(5)

Unnamed: 0,Area,Frontage,Access Road,Floors,Bedrooms,Bathrooms,Price,Ward/Commune,District,Province
0,84.0,,,4.0,,,8.6,Long Hưng,Văn Giang,Hưng Yên
1,60.0,,,5.0,,,7.5,Nghĩa Trụ,Văn Giang,Hưng Yên
2,90.0,6.0,13.0,5.0,,,8.9,Nghĩa Trụ,Văn Giang,Hưng Yên
3,54.0,,3.5,2.0,2.0,3.0,5.35,11,Gò Vấp,Hồ Chí Minh
4,92.0,,,2.0,4.0,4.0,6.9,8,Gò Vấp,Hồ Chí Minh


In [60]:
file_name = "vietnam_housing_dataset3.csv"
df.to_csv(file_name, index=False, encoding='utf-8-sig')

In [55]:
from sklearn.preprocessing import OneHotEncoder

s = (df.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)
print('No. of. categorical features: ', 
      len(object_cols))

Categorical variables:
['Address', 'House direction', 'Balcony direction', 'Legal status', 'Furniture state', 'Ward/Commune', 'District', 'Province']
No. of. categorical features:  8


In [None]:

from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
OH_cols = pd.DataFrame(OH_encoder.fit_transform(df[object_cols]))
OH_cols.index = df.index
OH_cols.columns = OH_encoder.get_feature_names_out()
df_final = df.drop(object_cols, axis=1)
df_final = pd.concat([df_final, OH_cols], axis=1)

In [49]:
num_rows = df_cleaned.shape[0] 
print(f"Số dòng của DataFrame: {num_rows}")

Số dòng của DataFrame: 2629


In [None]:
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    df[col].fillna(df[col].median(), inplace=True)

In [None]:
categorical_columns = df.select_dtypes(include=[object]).columns
for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [None]:
df.isnull().sum()