In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df = pd.read_csv('./vietnam_housing_dataset.csv')

def split_address(address):
    parts = address.split(",")
    if len(parts) >= 3:
        ward_commune = parts[-3].strip()  
        district = parts[-2].strip()      
        province = parts[-1].strip()     
        return ward_commune, district, province
    return None, None, None

df[["Ward/Commune", "District", "Province"]] = df["Address"].apply(split_address).apply(pd.Series)

In [6]:
df = df.dropna(subset=["Ward/Commune", "District", "Province"])

In [7]:
def contains_prefix(text):
    """Checks if a string contains 'Xã', 'Phường', or 'Thị trấn'."""
    prefixes = ["Xã ", "Phường ", "Thị trấn "]
    for prefix in prefixes:
        if prefix in text:
            return True
    return False

df= df[df['Ward/Commune'].apply(contains_prefix)]

In [8]:
df['Ward/Commune'] = df['Ward/Commune'].str.replace(r"^(Xã |Phường |Thị trấn )", "", regex=True)

In [9]:
columns_to_drop = ['Address', 'House direction', 'Balcony direction','Furniture state','Legal status']

df = df.drop(columns_to_drop, axis=1)

In [10]:
num_rows = df.shape[0] 
print(f"Số dòng của DataFrame: {num_rows}")

Số dòng của DataFrame: 30019


In [11]:
df.isnull().sum()

Area                0
Frontage        11479
Access Road     13194
Floors           3556
Bedrooms         5095
Bathrooms        7002
Price               0
Ward/Commune        0
District            0
Province            0
dtype: int64

In [12]:
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [13]:
categorical_columns = df.select_dtypes(include=[object]).columns
for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [60]:
file_name = "vietnam_housing_dataset2.csv"
df.to_csv(file_name, index=False, encoding='utf-8-sig')

In [14]:
num_rows = df.shape[0] 
print(f"Số dòng của DataFrame: {num_rows}")

Số dòng của DataFrame: 30019


In [15]:
df.isnull().sum()

Area            0
Frontage        0
Access Road     0
Floors          0
Bedrooms        0
Bathrooms       0
Price           0
Ward/Commune    0
District        0
Province        0
dtype: int64