In [62]:
spark.stop()

In [66]:
# Notebook ph√¢n t√≠ch & x√¢y d·ª±ng mapping cho d·ªØ li·ªáu b·∫•t ƒë·ªông s·∫£n
import os
import sys
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import (
    col, to_timestamp, current_timestamp, lit, regexp_replace, trim,
    when, upper, lower, split, element_at, round as spark_round,
    avg, count, percentile_approx, stddev, min as spark_min, max as spark_max,
    udf, length, expr
)
from pyspark.sql.types import StringType, DoubleType, BooleanType
from pyspark.sql.window import Window

# Th√™m th∆∞ m·ª•c g·ªëc v√†o sys.path
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(project_root)

# T·∫°o Spark Session
spark = SparkSession.builder \
    .appName("BatDongSan Mapping Analysis") \
    .config("spark.ui.port", "4050") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .getOrCreate()

print("Spark session created successfully")

Spark session created successfully


In [85]:
parquet_file = "/data/realestate/processed/silver/batdongsan/house/2025/06/12/*.parquet"
df = spark.read.parquet(parquet_file)

df_filtered = df.select(
    'location', 
    'street', 
    'street_id', 
    'ward', 
    'ward_id', 
    'district', 
    'district_id', 
    'province', 
    'province_id'
)

df_filtered.show(100, truncate=False)
df.printSchema()


+------------------------------------------------------------------------------+---------------------------------+---------+--------------------------+-------+-----------------------+-----------+--------------+-----------+
|location                                                                      |street                           |street_id|ward                      |ward_id|district               |district_id|province      |province_id|
+------------------------------------------------------------------------------+---------------------------------+---------+--------------------------+-------+-----------------------+-----------+--------------+-----------+
|S·ªë 7, Ph·ªë H√†ng Gi·∫•y, Ph∆∞·ªùng ƒê√¥ÃÄng Xu√¢n, Ho√†n Ki·∫øm, H√† N·ªôi                     |S·ªë 7                             |-1       |Ph·ªë H√†ng Gi·∫•y             |-1     |Ph∆∞·ªùng ƒê√¥ÃÄng Xu√¢n      |-1         |Ho√†n Ki·∫øm     |-1         |
|S·ªë 435+437, Ph·ªë V≈© H·ªØu, Ph∆∞·ªùng Thanh Xu√¢n B·∫Øc, Thanh Xu√¢

In [93]:
# Th·ªëng k√™ c∆° b·∫£n v·ªÅ c√°c c·ªôt s·ªë
df.describe().show()

# Ki·ªÉm tra ph√¢n ph·ªëi gi√° tr·ªã c·ªßa c√°c c·ªôt d·∫°ng chu·ªói (ƒë·∫øm t·∫ßn su·∫•t c√°c gi√° tr·ªã)
from pyspark.sql.functions import col

# L·∫∑p qua c√°c c·ªôt ƒë·ªÉ ki·ªÉm tra ph√¢n ph·ªëi gi√° tr·ªã c·ªßa c√°c c·ªôt d·∫°ng chu·ªói
for column in df.columns:
    if dict(df.dtypes)[column] == 'string':  # Ki·ªÉm tra xem c·ªôt c√≥ ph·∫£i l√† ki·ªÉu string kh√¥ng
        print(f"Ph√¢n ph·ªëi gi√° tr·ªã cho c·ªôt: {column}")
        df.groupBy(column).count().show()


                                                                                

+-------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+---------+---------+---------+----------------+------------------+------------------+--------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+--------------------+-------------+-------------------------+-------------------+-------------+
|summary|                  id|                 url|    source|               title|         description|            location|data_type| province| district|            ward|          latitude|         longitude|               price|              area|       price_per_m2|           bedroom|          bathroom|      floor_count|  house_direction|      legal_status|          interior|        

In [94]:
# Ki·ªÉm tra c√°c gi√° tr·ªã min v√† max cho c√°c c·ªôt s·ªë
for column in df.columns:
    if dict(df.dtypes)[column] in ['double', 'int']:  # Ki·ªÉm tra ch·ªâ v·ªõi c√°c c·ªôt s·ªë
        min_value = df.selectExpr(f"min({column})").collect()[0][0]
        max_value = df.selectExpr(f"max({column})").collect()[0][0]
        print(f"C·ªôt: {column}")
        print(f"  Min: {min_value}")
        print(f"  Max: {max_value}")
        print("-" * 50)


C·ªôt: latitude
  Min: 9.156794
  Max: 21.4526750773141
--------------------------------------------------
C·ªôt: longitude
  Min: 103.9028
  Max: 109.372375
--------------------------------------------------
C·ªôt: price
  Min: 2100000.0
  Max: 2100000000000.0
--------------------------------------------------
C·ªôt: area
  Min: 10.0
  Max: 30000.0
--------------------------------------------------
C·ªôt: price_per_m2
  Min: 100.0
  Max: 317780000000.0
--------------------------------------------------
C·ªôt: bedroom
  Min: 1.0
  Max: 155.0
--------------------------------------------------
C·ªôt: bathroom
  Min: 1.0
  Max: 100.0
--------------------------------------------------
C·ªôt: floor_count
  Min: 1.0
  Max: 77.0
--------------------------------------------------
C·ªôt: width
  Min: 1.0
  Max: 43000.0
--------------------------------------------------
C·ªôt: length
  Min: 1.0
  Max: 14000.0
--------------------------------------------------
C·ªôt: living_size
  Min: 2.0
  Max:

In [103]:
# L·ªçc ra c·ªôt url v√† price
price_url_df = df.select("url", "price", "area","price_per_m2")

# Truy xu·∫•t c√°c b·∫£n ghi c√≥ gi√° tr·ªã price b·∫±ng 1 (c√≥ th·ªÉ l√† gi√° tr·ªã m·∫∑c ƒë·ªãnh ho·∫∑c l·ªói)
low_price_url_df = price_url_df.filter(col("price") < 50_000_000)
low_price_url_df.show(5, truncate=False)

# Truy xu·∫•t c√°c b·∫£n ghi c√≥ gi√° tr·ªã price qu√° cao (> 200 t·ª∑)
high_price_url_df = price_url_df.filter(col("price") > 1_000_000_000_000)

high_price_url_df.show(5, truncate=False)
high_price_url_df.count()


+------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-----+-----------------+
|url                                                                                                                                                         |price    |area |price_per_m2     |
+------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-----+-----------------+
|https://nha.chotot.com/125444794.htm                                                                                                                        |4.3E7    |98.0 |438775.5         |
|https://nha.chotot.com/124779035.htm                                                                                                                        |2100000.0|45.0 |46666.667        |
|https://nha.chotot.com/125411718.h

6

In [56]:
from pyspark.sql.functions import col, count, when

# T√≠nh t·ªïng s·ªë d√≤ng trong DataFrame
total_rows = df.count()

# Ki·ªÉm tra n·∫øu DataFrame c√≥ d·ªØ li·ªáu
if total_rows == 0:
    print("DataFrame r·ªóng, kh√¥ng c√≥ d·ªØ li·ªáu ƒë·ªÉ ki·ªÉm tra.")
else:
    # L·∫•y danh s√°ch c√°c c·ªôt trong DataFrame
    columns = df.columns

    # Ki·ªÉm tra t·ª∑ l·ªá thi·∫øu trong t·ª´ng c·ªôt
    missing_data = {}

    for column in columns:
        missing_count = df.filter(col(column).isNull()).count()
        missing_percentage = (missing_count / total_rows) * 100
        missing_data[column] = {"missing_count": missing_count, "missing_percentage": missing_percentage}

    # Hi·ªÉn th·ªã k·∫øt qu·∫£
    for column, data in missing_data.items():
        print(f"C·ªôt: {column}")
        print(f"  Thi·∫øu: {data['missing_count']} gi√° tr·ªã ({data['missing_percentage']:.2f}%)")
        print("-" * 50)


C·ªôt: price
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: area
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: latitude
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: longitude
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: price_per_m2
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: bedroom
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: bathroom
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: district
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: ward
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: house_direction
  Thi·∫øu: 0 gi√° tr·ªã (0.00%)
--------------------------------------------------
C·ªôt: leg

In [48]:
from pyspark.sql.functions import col
from pyspark.sql.types import TimestampType, DateType

# Chuy·ªÉn c·ªôt d·∫°ng Date v√† Timestamp sang String
for name, dtype in df.dtypes:
    if dtype in ('timestamp', 'date'):
        df = df.withColumn(name, col(name).cast("string"))

df.printSchema()


root
 |-- price: double (nullable = true)
 |-- area: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- price_per_m2: double (nullable = true)
 |-- bedroom: double (nullable = true)
 |-- bathroom: double (nullable = true)
 |-- district: string (nullable = true)
 |-- ward: string (nullable = true)
 |-- house_direction: string (nullable = true)
 |-- legal_status: string (nullable = true)
 |-- interior: string (nullable = true)
 |-- house_type: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- data_quality_score: double (nullable = true)
 |-- id: string (nullable = true)
 |-- data_date: string (nullable = true)
 |-- bedroom_bathroom_ratio: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- area_per_room: double (nullable = true)
 |-- area_category: string (nullable = true)
 |-- price_vs_market: integer (nullable = true)
 |-- distance_to_center: double (nullable = true)
 |-- location_

In [51]:
from pyspark.sql.functions import col, count, when

# T√≠nh t·ª∑ l·ªá ph·∫ßn trƒÉm c·ªßa c√°c gi√° tr·ªã thi·∫øu cho m·ªói c·ªôt
total_rows = df.count()

# L·∫•y danh s√°ch c√°c c·ªôt trong DataFrame
columns = df.columns

# Ki·ªÉm tra t·ª∑ l·ªá thi·∫øu trong t·ª´ng c·ªôt
missing_data = {}

for column in columns:
    missing_count = df.filter(col(column).isNull()).count()
    missing_percentage = (missing_count / total_rows) * 100
    missing_data[column] = {"missing_count": missing_count, "missing_percentage": missing_percentage}

# Hi·ªÉn th·ªã k·∫øt qu·∫£
for column, data in missing_data.items():
    print(f"C·ªôt: {column}")
    print(f"  Thi·∫øu: {data['missing_count']} gi√° tr·ªã ({data['missing_percentage']:.2f}%)")
    print("-" * 50)


ZeroDivisionError: division by zero

In [41]:
from pyspark.sql.functions import col
from pyspark.sql.types import TimestampType, DateType

# Chuy·ªÉn c·ªôt d·∫°ng Date v√† Timestamp sang String
for name, dtype in df.dtypes:
    if dtype in ('timestamp', 'date'):
        df = df.withColumn(name, col(name).cast("string"))

df.printSchema()
df.count()

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- source: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- location: string (nullable = true)
 |-- data_type: string (nullable = true)
 |-- province: string (nullable = true)
 |-- district: string (nullable = true)
 |-- ward: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- price: double (nullable = true)
 |-- area: double (nullable = true)
 |-- price_per_m2: double (nullable = true)
 |-- bedroom: double (nullable = true)
 |-- bathroom: double (nullable = true)
 |-- floor_count: double (nullable = true)
 |-- house_direction: string (nullable = true)
 |-- legal_status: string (nullable = true)
 |-- interior: string (nullable = true)
 |-- house_type: string (nullable = true)
 |-- width: double (nullable = true)
 |-- length: double (nullable = true)
 |-- living_size: double (nullable = true)


15344

In [34]:
import os

output_dir = "/tmp/house_data_csv"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

csv_output_path = f"{output_dir}/house_data.csv"

# L∆∞u DataFrame d∆∞·ªõi d·∫°ng file CSV
df.toPandas().to_csv(csv_output_path, index=False)

# Ki·ªÉm tra file
print(f"File CSV ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: {csv_output_path}")
print(f"K√≠ch th∆∞·ªõc file: {os.path.getsize(csv_output_path) / (1024*1024):.2f} MB")


                                                                                

File CSV ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: /tmp/house_data_csv/house_data.csv
K√≠ch th∆∞·ªõc file: 84.88 MB


In [36]:
from IPython.display import HTML
import base64

def create_download_link(file_path, filename=None):
    if filename is None:
        filename = os.path.basename(file_path)

    with open(file_path, "rb") as f:
        data = f.read()

    b64 = base64.b64encode(data).decode()
    href = f'<a download="{filename}" href="data:text/csv;base64,{b64}" target="_blank">T·∫£i xu·ªëng {filename}</a>'
    return HTML(href)

# Hi·ªÉn th·ªã link t·∫£i xu·ªëng
create_download_link(csv_output_path, "house_data.csv")

/tmp/data ƒë√£ ƒë∆∞·ª£c x√≥a.


In [1]:
spark.stop()

NameError: name 'spark' is not defined