In [13]:
# Clean up a Spark session left over from an earlier run of this notebook.
# On a fresh kernel `spark` does not exist yet, so the call is guarded to keep
# "Restart Kernel -> Run All" from failing with a NameError in the first cell.
if "spark" in globals():
    spark.stop()

In [5]:
# Notebook phân tích & xây dựng mapping cho dữ liệu bất động sản
import os
import sys
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import (
    col, to_timestamp, current_timestamp, lit, regexp_replace, trim,
    when, upper, lower, split, element_at, round as spark_round,
    avg, count, percentile_approx, stddev, min as spark_min, max as spark_max,
    udf, length, expr
)
from pyspark.sql.types import StringType, DoubleType, BooleanType
from pyspark.sql.window import Window

# Make the project root (two directory levels above the working directory)
# importable. The membership check keeps re-runs of this cell from appending
# the same entry to sys.path over and over.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if project_root not in sys.path:
    sys.path.append(project_root)

# Create (or reuse) the Spark session for this notebook.
# Options are kept in one table so they can be read and tuned at a glance;
# they are applied to the builder in the order listed.
spark_options = {
    "spark.ui.port": "4050",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "4g",
    "spark.hadoop.fs.defaultFS": "hdfs://namenode:9000",
}

session_builder = SparkSession.builder.appName("BatDongSan Mapping Analysis")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

print("Spark session created successfully")

Spark session created successfully


In [9]:
# Feature-store snapshot to inspect. The snapshot date occurs twice in the path
# (as a YYYY/MM/DD directory and as a YYYYMMDD file-name suffix), so both parts
# are derived from one value to keep them in sync when the date changes.
snapshot_date = datetime(2025, 6, 12)
parquet_file = (
    "/data/realestate/processed/ml/feature_store/house/"
    f"{snapshot_date:%Y/%m/%d}/features_house_{snapshot_date:%Y%m%d}.parquet"
)
df = spark.read.parquet(parquet_file)

# Show the column names and types of the loaded snapshot.
df.printSchema()


root
 |-- province_id: long (nullable = true)
 |-- price: double (nullable = true)
 |-- area: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- bedroom: double (nullable = true)
 |-- bathroom: double (nullable = true)
 |-- floor_count: double (nullable = true)
 |-- house_direction_code: integer (nullable = true)
 |-- legal_status_code: integer (nullable = true)
 |-- interior_code: integer (nullable = true)
 |-- district_id: long (nullable = true)
 |-- ward_id: long (nullable = true)
 |-- id: string (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- area_per_room: double (nullable = true)
 |-- bedroom_bathroom_ratio: double (nullable = true)
 |-- population_density: double (nullable = true)



In [4]:
from pyspark.sql.functions import col
from pyspark.sql.types import TimestampType, DateType

# Convert every Date / Timestamp column to its string representation.
# The columns are collected first, then cast one by one; each cast replaces
# the column under its existing name.
temporal_types = ("timestamp", "date")
temporal_columns = [
    column_name
    for column_name, column_type in df.dtypes
    if column_type in temporal_types
]
for column_name in temporal_columns:
    df = df.withColumn(column_name, col(column_name).cast("string"))

df.printSchema()
df.count()

root
 |-- area: double (nullable = true)
 |-- bathroom: double (nullable = true)
 |-- bedroom: double (nullable = true)
 |-- crawl_timestamp: string (nullable = true)
 |-- data_type: string (nullable = true)
 |-- description: string (nullable = true)
 |-- floor_count: double (nullable = true)
 |-- house_direction: string (nullable = true)
 |-- house_type: string (nullable = true)
 |-- interior: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- legal_status: string (nullable = true)
 |-- length: double (nullable = true)
 |-- living_size: double (nullable = true)
 |-- location: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- posted_date: string (nullable = true)
 |-- price: double (nullable = true)
 |-- price_per_m2: double (nullable = true)
 |-- seller_info: string (nullable = true)
 |-- source: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- width: double (nullable = true)
 |-- processing_d

5000

In [10]:
import os

output_dir = "/tmp/house_data_csv"
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, replacing the separate exists-then-create check.
os.makedirs(output_dir, exist_ok=True)

csv_output_path = f"{output_dir}/house_data.csv"

# Save the DataFrame as a single CSV file.
# toPandas() brings every row to the driver before pandas writes the file.
df.toPandas().to_csv(csv_output_path, index=False)

# Report where the file was written and how large it is.
print(f"File CSV đã được lưu tại: {csv_output_path}")
print(f"Kích thước file: {os.path.getsize(csv_output_path) / (1024*1024):.2f} MB")


File CSV đã được lưu tại: /tmp/house_data_csv/house_data.csv
Kích thước file: 3.74 MB


In [11]:
from IPython.display import HTML
import base64

def create_download_link(file_path, filename=None):
    """Return an IPython ``HTML`` object holding a download link for a file.

    The file is read in full, base64-encoded and embedded in a ``data:`` URI
    declared as ``text/csv``, so the whole content travels inside the link.

    Args:
        file_path: Path of the file to embed.
        filename: Name offered to the browser for the download and shown in
            the link text. Defaults to the base name of ``file_path``.

    Returns:
        An ``HTML`` display object wrapping a single ``<a>`` element.

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from html import escape

    if filename is None:
        filename = os.path.basename(file_path)

    with open(file_path, "rb") as f:
        payload = f.read()

    b64 = base64.b64encode(payload).decode()
    # The name ends up inside an attribute value and inside element text, so
    # characters such as '"', '<' and '&' must be escaped to keep the markup valid.
    safe_name = escape(filename, quote=True)
    href = f'<a download="{safe_name}" href="data:text/csv;base64,{b64}" target="_blank">Tải xuống {safe_name}</a>'
    return HTML(href)

# Display the download link for the exported CSV; the download is offered under the name "train_data.csv".
create_download_link(csv_output_path, "train_data.csv")

In [8]:
# Stop the Spark session bound to `spark`.
spark.stop()