In [36]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col, min
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType

In [2]:
# Sesi Spark
spark = SparkSession.builder.appName('First Session').getOrCreate()

In [3]:
# Baca dataset tanpa header
df = spark.read.option("header", "false").csv("Angkatan kerja.csv")
# Ambil header
header = df.take(1)[0]

# Filter baris dengan menggunakan header sebagai referensi
df = df.filter(df['_c0'] != header['_c0'])

# Tampilkan hasil
df.show()

+---+---------------+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|_c0|            _c1|  _c2|  _c3|  _c4|  _c5|  _c6|  _c7|  _c8|  _c9| _c10|
+---+---------------+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|  1|           ACEH|80.03|80.79|81.47| 79.4|81.06|48.24|45.68|48.94|48.36|
|  2|       Simeulue|79.73| 81.2|83.29|82.29|84.15|48.24|42.86|56.79|59.46|
|  3|   Aceh Singkil|80.54|80.63|82.36|82.46|83.76|45.24|37.25| 41.2|42.85|
|  4|   Aceh Selatan|83.95|82.19|79.75|77.52|84.44|48.05|37.11|44.13|45.14|
|  5|  Aceh Tenggara| 82.5|79.48|   81| 78.7|82.07|63.43|55.69|61.98|60.83|
|  6|     Aceh Timur|   83|82.69|83.39|79.35| 81.8|48.72|36.35|40.78|39.86|
|  7|    Aceh Tengah|82.18|86.65|89.15|85.77| 87.2|62.34|57.94| 70.5|66.55|
|  8|     Aceh Barat|74.14| 80.2|80.02|78.64|   80|33.67|41.08|38.13|40.85|
|  9|     Aceh Besar|75.64|74.71|77.46|75.74|81.44|40.43|36.64|41.63|46.83|
| 10|          Pidie|75.68|78.45|78.98|76.03|79.31|49.07|49.07|53.36|46.08|
| 11|       

In [4]:
data = spark.read.csv('Angkatan kerja.csv', header=True, inferSchema=True)

In [5]:
data.printSchema()

root
 |-- Provinsi/Kabupaten/Kota: integer (nullable = true)
 |-- Tingkat Partisipasi Angkatan Kerja Menurut Jenis Kelamin: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: double (nullable = true)



In [6]:
string_indexer = StringIndexer(inputCol='_c7', outputCol='indexed_c7')
data_indexed = string_indexer.fit(data).transform(data)

In [7]:
selected_columns = ['_c3', '_c4', '_c5', '_c6', 'indexed_c7', '_c8', '_c9', '_c10',]


In [8]:
assembler = VectorAssembler(inputCols=selected_columns, outputCol='features', handleInvalid='skip')
data_assembled = assembler.transform(data_indexed)

In [9]:
kmeans = KMeans().setK(3).setSeed(1)  # Misalnya, dengan 3 kluster
model = kmeans.fit(data_assembled)

In [10]:
predictions = model.transform(data_assembled)

In [11]:
# Misalnya, jika ingin mengetahui kluster dengan nilai tertinggi dari kolom tertentu
max_value_cluster = predictions.groupBy('prediction').max('_c3')  
max_value_cluster.show()

+----------+--------+
|prediction|max(_c3)|
+----------+--------+
|         1|  2019.0|
|         2|   91.56|
|         0|    97.0|
+----------+--------+



In [12]:
# Misalnya, jika ingin mengetahui kluster dengan nilai tertinggi dari kolom tertentu
max_value_cluster = predictions.groupBy('prediction').max('_c4')  
max_value_cluster.show()

+----------+--------+
|prediction|max(_c4)|
+----------+--------+
|         1|  2020.0|
|         2|   90.06|
|         0|    96.2|
+----------+--------+



In [13]:
# Misalnya, jika ingin mengetahui kluster dengan nilai tertinggi dari kolom tertentu
max_value_cluster = predictions.groupBy('prediction').max('_c5')  
max_value_cluster.show()

+----------+--------+
|prediction|max(_c5)|
+----------+--------+
|         1|  2021.0|
|         2|   90.84|
|         0|   97.05|
+----------+--------+



In [14]:
# Misalnya, jika ingin mengetahui kluster dengan nilai tertinggi dari kolom tertentu
max_value_cluster = predictions.groupBy('prediction').max('_c6')  
max_value_cluster.show()

+----------+--------+
|prediction|max(_c6)|
+----------+--------+
|         1|  2022.0|
|         2|   90.07|
|         0|   98.55|
+----------+--------+



In [15]:
# Misalnya, jika ingin mengetahui kluster dengan nilai tertinggi dari kolom tertentu
max_value_cluster = predictions.groupBy('prediction').max('indexed_c7')  
max_value_cluster.show()

+----------+---------------+
|prediction|max(indexed_c7)|
+----------+---------------+
|         1|           38.0|
|         2|          241.0|
|         0|          509.0|
+----------+---------------+



In [16]:
# Misalnya, jika ingin mengetahui kluster dengan nilai tertinggi dari kolom tertentu
max_value_cluster = predictions.groupBy('prediction').max('_c8')  
max_value_cluster.show()

+----------+--------+
|prediction|max(_c8)|
+----------+--------+
|         1|  2019.0|
|         2|   62.39|
|         0|   97.04|
+----------+--------+



In [17]:
# Misalnya, jika ingin mengetahui kluster dengan nilai tertinggi dari kolom tertentu
max_value_cluster = predictions.groupBy('prediction').max('_c9')  
max_value_cluster.show()

+----------+--------+
|prediction|max(_c9)|
+----------+--------+
|         1|  2020.0|
|         2|    72.3|
|         0|   97.24|
+----------+--------+



In [18]:
# Misalnya, jika ingin mengetahui kluster dengan nilai tertinggi dari kolom tertentu
max_value_cluster = predictions.groupBy('prediction').max('_c10')  
max_value_cluster.show()

+----------+---------+
|prediction|max(_c10)|
+----------+---------+
|         1|   2021.0|
|         2|    70.45|
|         0|     99.0|
+----------+---------+



In [19]:
min_value_cluster = predictions.groupBy('prediction').min('_c3')
min_value_cluster.show()


+----------+--------+
|prediction|min(_c3)|
+----------+--------+
|         1|  2019.0|
|         2|   71.17|
|         0|   72.93|
+----------+--------+



In [20]:
min_value_cluster = predictions.groupBy('prediction').min('_c4')
min_value_cluster.show()

+----------+--------+
|prediction|min(_c4)|
+----------+--------+
|         1|  2020.0|
|         2|   67.31|
|         0|   29.01|
+----------+--------+



In [21]:
min_value_cluster = predictions.groupBy('prediction').min('_c5')
min_value_cluster.show()

+----------+--------+
|prediction|min(_c5)|
+----------+--------+
|         1|  2021.0|
|         2|   71.68|
|         0|    73.0|
+----------+--------+



In [22]:
min_value_cluster = predictions.groupBy('prediction').min('_c6')
min_value_cluster.show()

+----------+--------+
|prediction|min(_c6)|
+----------+--------+
|         1|  2022.0|
|         2|   69.37|
|         0|   73.93|
+----------+--------+



In [23]:
min_value_cluster = predictions.groupBy('prediction').min('indexed_c7')
min_value_cluster.show()

+----------+---------------+
|prediction|min(indexed_c7)|
+----------+---------------+
|         1|           38.0|
|         2|            0.0|
|         0|          242.0|
+----------+---------------+



In [24]:
min_value_cluster = predictions.groupBy('prediction').min('_c8')
min_value_cluster.show()

+----------+--------+
|prediction|min(_c8)|
+----------+--------+
|         1|  2019.0|
|         2|   30.28|
|         0|   42.44|
+----------+--------+



In [25]:
min_value_cluster = predictions.groupBy('prediction').min('_c9')
min_value_cluster.show()

+----------+--------+
|prediction|min(_c9)|
+----------+--------+
|         1|  2020.0|
|         2|   31.36|
|         0|   45.03|
+----------+--------+



In [26]:
min_value_cluster = predictions.groupBy('prediction').min('_c10')
min_value_cluster.show()

+----------+---------+
|prediction|min(_c10)|
+----------+---------+
|         1|   2021.0|
|         2|    30.71|
|         0|    42.12|
+----------+---------+



In [46]:
mean_value_cluster = predictions.groupBy('prediction').min('_c3')
mean_value_cluster.show()

+----------+--------+
|prediction|min(_c3)|
+----------+--------+
|         1|  2019.0|
|         2|   71.17|
|         0|   72.93|
+----------+--------+



In [47]:
mean_value_cluster = predictions.groupBy('prediction').min('_c4')
mean_value_cluster.show()

+----------+--------+
|prediction|min(_c4)|
+----------+--------+
|         1|  2020.0|
|         2|   67.31|
|         0|   29.01|
+----------+--------+



In [48]:
mean_value_cluster = predictions.groupBy('prediction').min('_c5')
mean_value_cluster.show()

+----------+--------+
|prediction|min(_c5)|
+----------+--------+
|         1|  2021.0|
|         2|   71.68|
|         0|    73.0|
+----------+--------+



In [49]:
mean_value_cluster = predictions.groupBy('prediction').min('_c6')
mean_value_cluster.show()

+----------+--------+
|prediction|min(_c6)|
+----------+--------+
|         1|  2022.0|
|         2|   69.37|
|         0|   73.93|
+----------+--------+



In [51]:
mean_value_cluster = predictions.groupBy('prediction').min('indexed_c7')
mean_value_cluster.show()

+----------+---------------+
|prediction|min(indexed_c7)|
+----------+---------------+
|         1|           38.0|
|         2|            0.0|
|         0|          242.0|
+----------+---------------+



In [52]:
mean_value_cluster = predictions.groupBy('prediction').min('_c8')
mean_value_cluster.show()

+----------+--------+
|prediction|min(_c8)|
+----------+--------+
|         1|  2019.0|
|         2|   30.28|
|         0|   42.44|
+----------+--------+



In [53]:
mean_value_cluster = predictions.groupBy('prediction').min('_c9')
mean_value_cluster.show()

+----------+--------+
|prediction|min(_c9)|
+----------+--------+
|         1|  2020.0|
|         2|   31.36|
|         0|   45.03|
+----------+--------+



In [54]:
mean_value_cluster = predictions.groupBy('prediction').min('_c10')
mean_value_cluster.show()

+----------+---------+
|prediction|min(_c10)|
+----------+---------+
|         1|   2021.0|
|         2|    30.71|
|         0|    42.12|
+----------+---------+

