# 1.Verilerin Yüklenmesi


Spark oturumu (SparkSession) başlatılır ve veri seti CSV dosyasından okunur.

In [3]:
from pyspark.sql import SparkSession


# Start the Spark session for this notebook, reusing one if it already exists.
spark = (
    SparkSession.builder
    .appName("HousingPricePrediction")
    .getOrCreate()
)

# Load the housing CSV: the first row supplies the column names and the
# column types are inferred from the data.
housing_csv_path = "/content/drive/MyDrive/Colab Notebooks/spark/housing.csv"
data = spark.read.csv(housing_csv_path, header=True, inferSchema=True)



# 2.Verilerin İncelenmesi

Veri seti; sütunlardaki değerlerin ortalama, medyan, maksimum ve minimum gibi özet istatistikleri baz alınarak incelenmiştir.

In [4]:
# Preview the first 5 rows to check column names and value formats.
data.show(n=5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR BAY|
|  -122.25|   37.85|              

In [5]:
# Basic descriptive statistics (count, mean, stddev, ...) for every column.
describe_stats = data.describe()
describe_stats.show()

+-------+-------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+---------------+
|summary|          longitude|         latitude|housing_median_age|       total_rooms|    total_bedrooms|        population|       households|     median_income|median_house_value|ocean_proximity|
+-------+-------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+---------------+
|  count|              20640|            20640|             20640|             20640|             20433|             20640|            20640|             20640|             20640|          20640|
|   mean|-119.56970445736148| 35.6318614341087|28.639486434108527|2635.7630813953488| 537.8705525375618|1425.4767441860465|499.5396802325581|3.8706710029070246|206855.81690891474|           NULL|
| stddev|  2.0035317

In [7]:
# Summary statistics for every column. Per the PySpark docs, summary() with no
# arguments also reports the 25%/50%/75% percentiles on top of what describe() shows.
summary_stats = data.summary()
summary_stats.show()

+-------+-------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+---------------+
|summary|          longitude|         latitude|housing_median_age|       total_rooms|    total_bedrooms|        population|       households|     median_income|median_house_value|ocean_proximity|
+-------+-------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+---------------+
|  count|              20640|            20640|             20640|             20640|             20433|             20640|            20640|             20640|             20640|          20640|
|   mean|-119.56970445736148| 35.6318614341087|28.639486434108527|2635.7630813953488| 537.8705525375618|1425.4767441860465|499.5396802325581|3.8706710029070246|206855.81690891474|           NULL|
| stddev|  2.0035317

Sütunlardaki eşsiz değerlerin sayısı incelenmiştir.

In [10]:
# Report the number of distinct values per column (one Spark job per column).
# NOTE(review): distinct() keeps null as a value of its own, so a column with
# missing entries reports one more than its count of distinct non-null values.
for column in data.columns:
    unique_rows = data.select(column).distinct()
    print(f"{column}: {unique_rows.count()} unique values")


longitude: 844 unique values
latitude: 862 unique values
housing_median_age: 52 unique values
total_rooms: 5926 unique values
total_bedrooms: 1924 unique values
population: 3888 unique values
households: 1815 unique values
median_income: 12928 unique values
median_house_value: 3842 unique values
ocean_proximity: 5 unique values


Her sütundaki boş (null) değerlerin sayısı incelenmiştir.

In [13]:
from pyspark.sql.functions import col, count, when
# One aggregate per column: when() produces a non-null marker only for null
# cells, and count() tallies non-null markers, so each result is the number of
# missing values in that column.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in data.columns
]
data.select(null_count_exprs).show()


+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|        0|       0|                 0|          0|           207|         0|         0|            0|                 0|              0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+



# 3.Özniteliklerin Seçimi ve Verilerin Makine Öğrenmesi İçin Hazırlanması

StringIndexer kullanılarak kategorik bir değişken olan "ocean_proximity" sayısal bir indekse dönüştürülmüştür.

In [15]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Map each ocean_proximity category to a numeric index stored in a new column.
# NOTE(review): the index is an arbitrary category code, not a magnitude; feeding
# it to a linear model as a plain number imposes an ordering on the categories.
# Consider one-hot encoding it before assembling features.
indexer = StringIndexer(inputCol="ocean_proximity", outputCol="ocean_proximity_index")

# Drop any index column left by a previous run of this cell. Without this, a
# re-run fails because StringIndexer refuses to write to an output column that
# already exists. drop() ignores missing columns, so the first run is unchanged.
unindexed = data.drop("ocean_proximity_index")
data = indexer.fit(unindexed).transform(unindexed)

In [19]:
# Show the first 20 rows (the default for show()) to confirm the new
# ocean_proximity_index column.
data.show(n=20)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|ocean_proximity_index|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|                  3.0|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|                  3.0|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|                  3.0|
|  -122.25|   37.85|              52.0| 

VectorAssembler ile özellikler (bağımsız değişkenler) belirlenerek tek bir vektör sütununda birleştirilmiştir. "median_house_value" sütunu ise tahmin edilecek değişken (bağımlı değişken) olarak belirlenmiştir.

In [20]:
# Independent variables that are packed into the single "features" vector.
feature_columns = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "ocean_proximity_index",
]

# handleInvalid="skip" drops every row that has a null/NaN in any input column.
# In this dataset that removes the rows with a missing total_bedrooms value
# (207 according to the null count above), so final_data is smaller than data.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
    handleInvalid="skip",
)

# Drop a stale "features" column first so the cell can be re-run: VectorAssembler
# raises if its output column already exists. drop() ignores missing columns,
# so the first run behaves exactly as before.
data = assembler.transform(data.drop("features"))

# Keep only what the model needs: the feature vector and the label to predict.
final_data = data.select("features", "median_house_value")
final_data.show()

+--------------------+------------------+
|            features|median_house_value|
+--------------------+------------------+
|[-122.23,37.88,41...|          452600.0|
|[-122.22,37.86,21...|          358500.0|
|[-122.24,37.85,52...|          352100.0|
|[-122.25,37.85,52...|          341300.0|
|[-122.25,37.85,52...|          342200.0|
|[-122.25,37.85,52...|          269700.0|
|[-122.25,37.84,52...|          299200.0|
|[-122.25,37.84,52...|          241400.0|
|[-122.26,37.84,42...|          226700.0|
|[-122.25,37.84,52...|          261100.0|
|[-122.26,37.85,52...|          281500.0|
|[-122.26,37.85,52...|          241800.0|
|[-122.26,37.85,52...|          213500.0|
|[-122.26,37.84,52...|          191300.0|
|[-122.26,37.85,52...|          159200.0|
|[-122.26,37.85,50...|          140000.0|
|[-122.27,37.85,52...|          152500.0|
|[-122.27,37.85,52...|          155500.0|
|[-122.26,37.84,50...|          158700.0|
|[-122.27,37.84,52...|          162900.0|
+--------------------+------------

# 4.PySpark ile Makine Öğrenmesi Modelinin Oluşturulması

final_data, eğitim ve test verisi olarak ikiye bölünür. Doğrusal Regresyon modeli tanımlanır ve fit fonksiyonu ile model eğitilir.

In [21]:
from pyspark.ml.regression import LinearRegression

# Fixed seed so the 80/20 split, and therefore the reported RMSE / R2, is the
# same on every Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Linear regression of median_house_value on the assembled feature vector,
# fitted on the training portion only.
lr = LinearRegression(featuresCol="features", labelCol="median_house_value")
lr_model = lr.fit(train_data)


Model, evaluate fonksiyonu ile test verisi üzerinde değerlendirilir. Kök ortalama kare hatası (RMSE) ve R² değerleri hesaplanır.

In [22]:
# Score the fitted model on the held-out test split and report its error metrics.
test_results = lr_model.evaluate(test_data)
rmse = test_results.rootMeanSquaredError
r2 = test_results.r2
print("Root Mean Squared Error (RMSE):", rmse)
print("R2:", r2)


Root Mean Squared Error (RMSE): 71602.65298060827
R2: 0.6239338349430412
