In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Boston-house-price')
    .getOrCreate()
)

24/12/13 10:17:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Read the train and test CSVs with a header row and schema inference enabled.
train, test = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in ('data/house_train.csv', 'data/house_test.csv')
)

                                                                                

In [4]:
# Print the column names and inferred types of the training DataFrame.
train.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

In [6]:
# Number of rows in the training DataFrame
train.count()

1460

In [5]:
# Number of columns in the training DataFrame
len(train.columns)

81

## 타입 변환

In [8]:
# Cast GarageArea to integer in both the train and test frames
# (GarageCars is cast the same way in the next cell).
train, test = (
    frame.withColumn('GarageArea', frame['GarageArea'].cast('integer'))
    for frame in (train, test)
)

In [9]:
# Cast GarageCars to integer in both the train and test frames.
train, test = (
    frame.withColumn('GarageCars', frame['GarageCars'].cast('integer'))
    for frame in (train, test)
)

## 결측치 확인

In [11]:
import pyspark.sql.functions as F

# One aggregate per column: turn the isNull() flag into 0/1 and sum it, which
# yields the number of nulls in that column of the training frame.
null_counts = train.select(*(
    F.sum(F.col(name).isNull().cast("int")).alias(name)
    for name in train.columns
))

null_counts.show()

24/12/13 10:25:21 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
| Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition

## 인코딩

In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Columns fed to the model: one categorical feature plus ten numeric ones.
cat_features = ['Neighborhood']
num_features = [
    "LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd",
    "1stFlrSF", "2ndFlrSF", "GrLivArea", "GarageCars", "GarageArea",
]

# Each categorical column gets a StringIndexer (-> "<name>_index") followed by
# a OneHotEncoder (-> "<name>_encoded").
# NOTE(review): handleInvalid is left at the library default on both stages;
# presumably a label unseen during fit will raise at transform time - confirm
# whether handleInvalid='keep' is wanted for scoring new data.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in cat_features
]
encoders = [
    OneHotEncoder(inputCol=f"{name}_index", outputCol=f"{name}_encoded")
    for name in cat_features
]

## feature transformation

In [17]:
# Combine the encoded categorical columns (first) and the numeric columns
# (after) into a single vector column named 'features'.
assembler_inputs = [f"{name}_encoded" for name in cat_features] + num_features
assembler = VectorAssembler(outputCol='features', inputCols=assembler_inputs)

In [18]:
# Rename the target column to 'label', the name the regressor and evaluator
# below are configured to read (labelCol='label').
train = train.withColumnRenamed('SalePrice','label')

## pipeline modeling

In [24]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

# Build the preprocessing pipeline (indexers -> encoders -> assembler) and fit
# it on the training frame.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])
pipeline_model = pipeline.fit(train)

# Preprocessed training data, now carrying the 'features' vector column.
train_transformed = pipeline_model.transform(train)

# Fit a linear regression on the assembled features against 'label'.
lr = LinearRegression(labelCol='label', featuresCol='features')
lr_model = lr.fit(train_transformed)

24/12/13 10:33:50 WARN Instrumentation: [c7b7f10a] regParam is zero, which might cause numerical instability and overfitting.


## predict

In [25]:
# Score the same frame the model was fitted on (train_transformed) and preview
# the feature vector, the actual label and the predicted value.
predictions = lr_model.transform(train_transformed)
(
    predictions
    .select('features', 'label', 'prediction')
    .show()
)

+--------------------+------+------------------+
|            features| label|        prediction|
+--------------------+------+------------------+
|(34,[1,24,25,26,2...|208500|205934.17714923155|
|(34,[22,24,25,26,...|181500|214730.34424817655|
|(34,[1,24,25,26,2...|223500|212120.63802080858|
|(34,[11,24,25,26,...|140000|202392.10550916777|
|(34,[13,24,25,26,...|250000| 318415.7893765026|
|(34,[12,24,25,26,...|143000|147140.20750589063|
|(34,[4,24,25,26,2...|307000|251496.38065852178|
|(34,[8,24,25,26,2...|200000|207228.51457976596|
|(34,[2,24,25,26,2...|129900|163142.76420624624|
|(34,[10,24,25,26,...|118000|120358.99549888307|
|(34,[7,24,25,26,2...|129500| 116349.4631787499|
|(34,[6,24,25,26,2...|345000| 339351.4464731971|
|(34,[7,24,25,26,2...|144000|113793.73722324474|
|(34,[1,24,25,26,2...|279500|226557.77262252616|
|(34,[0,24,25,26,2...|157000|143278.56057959097|
|(34,[10,24,25,26,...|132000|161438.12712088483|
|(34,[0,24,25,26,2...|149000| 155522.2872966209|
|(34,[7,24,25,26,2..

## evaluation

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that compares the 'prediction' column with the 'label' column
# using root mean squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)

# `predictions` was produced from train_transformed, the frame the model was
# fitted on, so this number is an in-sample (training) error.
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 33564.061446484004


## 결과 저장

In [28]:
# Write the id / predicted price pairs out as CSV with a header row,
# replacing any previous output at the same location.
# Fix: the target used to be 'data.output/...' (a '.' typed instead of '/'),
# which created a stray top-level directory; the model artifacts saved later in
# this notebook live under 'data/output/', so the predictions now go there too.
# Spark's CSV writer creates a directory of part files at this path rather
# than a single file.
predictions.select('id','prediction')\
        .withColumnRenamed('prediction', 'salePrice')\
        .write.csv('data/output/house_prediction.csv', header=True, mode='overwrite')

## 예측 모델의 활용

In [35]:
# Persist the fitted preprocessing pipeline and the fitted regression model,
# overwriting any earlier copy at the same path.
# Fix: the pipeline path used to read 'data/ouptut/...' (misspelled "output"),
# which put the pipeline in a different directory from the model.
model_save_path = 'data/output/boston_housing_lr_model'
pipeline_save_path = 'data/output/boston_housing_pipeline_model'
pipeline_model.write().overwrite().save(pipeline_save_path)
lr_model.write().overwrite().save(model_save_path)
print('model saved..')

model saved..


## 모델, 파이프라인 로드

In [36]:
from pyspark.ml import PipelineModel
from pyspark.ml.regression import LinearRegressionModel

# Reload both persisted artifacts from the paths they were saved to above:
# the regression model and the preprocessing pipeline that feeds it.
loaded_model = LinearRegressionModel.load(model_save_path)
loaded_pipeline = PipelineModel.load(pipeline_save_path)

## 새로운 데이터 예측

In [38]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m183.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
Downloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.0.3 pytz-2024.2 tzdata-2024.2


In [39]:
import pandas as pd

# Build a single-row sample of new data (one list element per column) and
# write it to CSV so it can be read back through spark.read.csv like the
# original train/test files.
data = {
    "Id": [1461],
    "MSSubClass": [20],
    "MSZoning": ["RH"],
    "LotFrontage": [80],
    "LotArea": [11622],
    "Street": ["Pave"],
    "Alley": [None],  # NA is represented as None
    "LotShape": ["Reg"],
    "LandContour": ["Lvl"],
    "Utilities": ["AllPub"],
    "LotConfig": ["Inside"],
    "LandSlope": ["Gtl"],
    "Neighborhood": ["NAmes"],
    "Condition1": ["Feedr"],
    "Condition2": ["Norm"],
    "BldgType": ["1Fam"],
    "HouseStyle": ["1Story"],
    "OverallQual": [5],
    "OverallCond": [6],
    "YearBuilt": [1961],
    "YearRemodAdd": [1961],
    "RoofStyle": ["Gable"],
    "RoofMatl": ["CompShg"],
    "Exterior1st": ["VinylSd"],
    "Exterior2nd": ["VinylSd"],
    "MasVnrType": [None],  # None stands for NA
    "MasVnrArea": [0],
    "ExterQual": ["TA"],
    "ExterCond": ["TA"],
    "Foundation": ["CBlock"],
    "BsmtQual": ["TA"],
    "BsmtCond": ["TA"],
    "BsmtExposure": ["No"],
    "BsmtFinType1": ["Rec"],
    "BsmtFinSF1": [468],
    "BsmtFinType2": ["LwQ"],
    "BsmtFinSF2": [144],
    "BsmtUnfSF": [270],
    "TotalBsmtSF": [882],
    "Heating": ["GasA"],
    "HeatingQC": ["TA"],
    "CentralAir": ["Y"],
    "Electrical": ["SBrkr"],
    "1stFlrSF": [896],
    "2ndFlrSF": [0],
    "LowQualFinSF": [0],
    "GrLivArea": [896],
    "BsmtFullBath": [0],
    "BsmtHalfBath": [0],
    "FullBath": [1],
    "HalfBath": [0],
    "BedroomAbvGr": [2],
    "KitchenAbvGr": [1],
    "KitchenQual": ["TA"],
    "TotRmsAbvGrd": [5],
    "Functional": ["Typ"],
    "Fireplaces": [0],
    "FireplaceQu": [None],  # NA is represented as None
    "GarageType": ["Attchd"],
    "GarageYrBlt": [1961],
    "GarageFinish": ["Unf"],
    "GarageCars": [1],
    "GarageArea": [730],
    "GarageQual": ["TA"],
    "GarageCond": ["TA"],
    "PavedDrive": ["Y"],
    "WoodDeckSF": [140],
    "OpenPorchSF": [0],
    "EnclosedPorch": [0],
    "3SsnPorch": [0],
    "ScreenPorch": [120],
    "PoolArea": [0],
    "PoolQC": [None],  # NA is represented as None
    "Fence": ["MnPrv"],
    "MiscFeature": [None],  # NA is represented as None
    "MiscVal": [0],
    "MoSold": [6],
    "YrSold": [2010],
    "SaleType": ["WD"],
    "SaleCondition":["Normal"]
}

# index=False keeps the pandas row index out of the CSV.
pd.DataFrame(data).to_csv('data/new_test_data.csv', index=False)

In [40]:
# Read the sample row back as a Spark DataFrame, with a header row and schema
# inference, the same options used for the train/test files.
new_test_data = spark.read.csv('data/new_test_data.csv', header=True, inferSchema=True)

In [41]:
# The ten numeric model inputs followed by the one categorical input.
# NOTE(review): no later cell in this notebook references this list - confirm
# whether it is still needed.
selected_features = [
    "LotArea",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "Neighborhood",
]

In [42]:
# Apply the same integer casts to the garage columns that were applied to the
# training frame before the pipeline was fitted.
new_test_data = (
    new_test_data
    .withColumn("GarageCars", new_test_data["GarageCars"].cast("integer"))
    .withColumn("GarageArea", new_test_data["GarageArea"].cast("integer"))
)

In [43]:
# Run the reloaded preprocessing pipeline on the new row, then score the
# result with the reloaded regression model.
new_pipe_data = loaded_pipeline.transform(new_test_data)
new_pred = loaded_model.transform(new_pipe_data)

In [44]:
# Display only the predicted value for the new row.
new_pred.select('prediction').show()

+------------------+
|        prediction|
+------------------+
|114113.60325331613|
+------------------+



In [45]:
# Stop the Spark session created at the top of the notebook.
spark.stop()