<a href="https://colab.research.google.com/github/faizdifak/Big-Data/blob/main/Praktik_data_processing_dengan_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Task 1: Build a simple DataFrame and run basic operations on it.
from pyspark.sql import SparkSession

# getOrCreate() hands back the active session when there is one, otherwise
# it starts a new session registered under the application name 'Profesi'.
spark = (
    SparkSession.builder
    .appName('Profesi')
    .getOrCreate()
)

# Sample records, one tuple per employee: (name, profession, salary).
data = [
    ('Ridho', 'ASN', 1000),
    ('Rama', 'Karyawan', 2000),
    ('Ratu', 'Dokter', 5100),
    ('Sintha', 'Guru', 4000),
    ('Zeni', 'Sales', 3000),
]
# Column labels, in the same order as the tuple fields above.
columns = ['EmployeeName', 'Profesi', 'Salary']

# Only the column names are supplied here; no explicit types are declared.
df = spark.createDataFrame(data, schema=columns)
df.show()

+------------+--------+------+
|EmployeeName| Profesi|Salary|
+------------+--------+------+
|       Ridho|     ASN|  1000|
|        Rama|Karyawan|  2000|
|        Ratu|  Dokter|  5100|
|      Sintha|    Guru|  4000|
|        Zeni|   Sales|  3000|
+------------+--------+------+



In [None]:
# Task 2: Use the select, filter and groupBy operations.
# DataFrame transformations, each followed by show() to print the result.

# Projection: keep only the employee name and salary columns.
names_and_salaries = df.select('EmployeeName', 'Salary')
names_and_salaries.show()

# Row filter: salary strictly greater than 3000, so a salary of exactly
# 3000 is not included.
above_3000 = df.filter(df['Salary'] > 3000)
above_3000.show()

# Aggregation: average salary for each distinct value of 'Profesi'.
avg_salary_by_profession = df.groupBy('Profesi').avg('Salary')
avg_salary_by_profession.show()

+------------+------+
|EmployeeName|Salary|
+------------+------+
|       Ridho|  1000|
|        Rama|  2000|
|        Ratu|  5100|
|      Sintha|  4000|
|        Zeni|  3000|
+------------+------+

+------------+-------+------+
|EmployeeName|Profesi|Salary|
+------------+-------+------+
|        Ratu| Dokter|  5100|
|      Sintha|   Guru|  4000|
+------------+-------+------+

+--------+-----------+
| Profesi|avg(Salary)|
+--------+-----------+
|     ASN|     1000.0|
|Karyawan|     2000.0|
|   Sales|     3000.0|
|    Guru|     4000.0|
|  Dokter|     5100.0|
+--------+-----------+



In [None]:
# Task 3: Explore how to work with complex data types.
# NOTE(review): despite the task title, this cell only derives scalar numeric
# columns from 'Salary'; it does not touch array/struct/map columns.

# The bonus is 10% of the salary.
bonus_rate = 0.1
df_with_bonus = df.withColumn('SalaryBonus', df['Salary'] * bonus_rate)

# Total compensation is the salary plus the bonus computed above. The result
# is only printed; df_with_bonus itself keeps just the extra 'SalaryBonus'.
total_compensation = df_with_bonus['Salary'] + df_with_bonus['SalaryBonus']
df_with_bonus.withColumn('TotalCompensation', total_compensation).show()

+------------+--------+------+-----------+-----------------+
|EmployeeName| Profesi|Salary|SalaryBonus|TotalCompensation|
+------------+--------+------+-----------+-----------------+
|       Ridho|     ASN|  1000|      100.0|           1100.0|
|        Rama|Karyawan|  2000|      200.0|           2200.0|
|        Ratu|  Dokter|  5100|      510.0|           5610.0|
|      Sintha|    Guru|  4000|      400.0|           4400.0|
|        Zeni|   Sales|  3000|      300.0|           3300.0|
+------------+--------+------+-----------+-----------------+



In [None]:
# Task 4: Implement a window function to compute running totals or rankings.
# This cell computes a ranking with a window function.
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Rows are grouped per profession and ordered by salary inside each group.
# NOTE(review): every profession occurs exactly once in the sample data, so
# each partition holds a single row and every rank comes out as 1. To get a
# meaningful ranking, add more rows per profession or rank over the whole
# dataset instead.
windowSpec = (
    Window
    .partitionBy('Profesi')
    .orderBy('Salary')
)

# rank() evaluated over the window gives each row its position within its
# own profession group.
rank_within_profession = F.rank().over(windowSpec)
df.withColumn('Rank', rank_within_profession).show()

+------------+--------+------+----+
|EmployeeName| Profesi|Salary|Rank|
+------------+--------+------+----+
|       Ridho|     ASN|  1000|   1|
|        Ratu|  Dokter|  5100|   1|
|      Sintha|    Guru|  4000|   1|
|        Rama|Karyawan|  2000|   1|
|        Zeni|   Sales|  3000|   1|
+------------+--------+------+----+

