### Instalando bibliotecas necessárias e criando o contexto Spark

In [2]:
import findspark

findspark.init()

In [99]:
from pyspark import SparkContext, SparkConf
import wget
from pyspark.sql import SparkSession
import os
import pyspark.sql.functions as f
import pyspark.sql.types as t

In [8]:

# Creating a SparkSession  

# Definir o caminho do Spark corretamente
os.environ['SPARK_HOME'] = '/home/daiane/spark-3.5.1-bin-hadoop3/'

# Definir o caminho do Java corretamente
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.17.0-openjdk-amd64/'

# Iniciar uma sessão Spark
spark = SparkSession.builder \
    .appName("Exemplo Spark") \
    .getOrCreate()

# Testar a sessão Spark
spark


24/04/25 11:55:17 WARN Utils: Your hostname, victor-Lenovo-ideapad-330-15IKB resolves to a loopback address: 127.0.1.1; using 192.168.1.74 instead (on interface wlp2s0)
24/04/25 11:55:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/25 11:55:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Download do arquivo CSV

In [10]:
wget.download("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/data/employees.csv")

100% [............................................................] 1321 / 1321

'employees.csv'

#### Task 1: Gerar um DataFrame e esquema do Spark a partir dos dados em formato CSV

In [107]:
schema = StructType([
    StructField("emp_id", t.IntegerType(), True),
    StructField("emp_name", t.StringType(), True),
    StructField("emp_salary", t.IntegerType(), True),
    StructField("emp_age", t.IntegerType(), True),
    StructField("emp_department", t.StringType(), True),
])
employees_df = spark.read.csv("employees.csv", schema=schema, header=False)
employees_df = employees_df.rdd.zipWithIndex().filter(lambda x: x[1] > 0).keys().toDF(schema)

In [56]:
employees_df.printSchema()
employees_df.show()

root
 |-- emp_id: integer (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- emp_salary: integer (nullable = true)
 |-- emp_age: integer (nullable = true)
 |-- emp_department: string (nullable = true)

+------+---------+----------+-------+--------------+
|emp_id| emp_name|emp_salary|emp_age|emp_department|
+------+---------+----------+-------+--------------+
|   198|   Donald|      2600|     29|            IT|
|   199|  Douglas|      2600|     34|         Sales|
|   200| Jennifer|      4400|     36|     Marketing|
|   201|  Michael|     13000|     32|            IT|
|   202|      Pat|      6000|     39|            HR|
|   203|    Susan|      6500|     36|     Marketing|
|   204|  Hermann|     10000|     29|       Finance|
|   205|  Shelley|     12008|     33|       Finance|
|   206|  William|      8300|     37|            IT|
|   100|   Steven|     24000|     39|            IT|
|   101|    Neena|     17000|     27|         Sales|
|   102|      Lex|     17000|     37|     Ma

### Task 2: Criar visualização temporária

In [59]:
employees_df.createOrReplaceTempView("employees")

In [60]:
spark.sql("SELECT * FROM employees").show()
spark.sql("select count(*) as total_rwos from employees").show()

+------+---------+----------+-------+--------------+
|emp_id| emp_name|emp_salary|emp_age|emp_department|
+------+---------+----------+-------+--------------+
|   198|   Donald|      2600|     29|            IT|
|   199|  Douglas|      2600|     34|         Sales|
|   200| Jennifer|      4400|     36|     Marketing|
|   201|  Michael|     13000|     32|            IT|
|   202|      Pat|      6000|     39|            HR|
|   203|    Susan|      6500|     36|     Marketing|
|   204|  Hermann|     10000|     29|       Finance|
|   205|  Shelley|     12008|     33|       Finance|
|   206|  William|      8300|     37|            IT|
|   100|   Steven|     24000|     39|            IT|
|   101|    Neena|     17000|     27|         Sales|
|   102|      Lex|     17000|     37|     Marketing|
|   103|Alexander|      9000|     39|     Marketing|
|   104|    Bruce|      6000|     38|            IT|
|   105|    David|      4800|     39|            IT|
|   106|    Valli|      4800|     38|         

### Task 3: Executar consulta SQL para buscar os registros da visualização 'employees' onde a idade dos funcionários seja superior a 30

In [61]:
spark.sql("select * from employees where emp_age > 30").show()  
spark.sql("select count(*) from employees where emp_age > 30").show()

+------+-----------+----------+-------+--------------+
|emp_id|   emp_name|emp_salary|emp_age|emp_department|
+------+-----------+----------+-------+--------------+
|   199|    Douglas|      2600|     34|         Sales|
|   200|   Jennifer|      4400|     36|     Marketing|
|   201|    Michael|     13000|     32|            IT|
|   202|        Pat|      6000|     39|            HR|
|   203|      Susan|      6500|     36|     Marketing|
|   205|    Shelley|     12008|     33|       Finance|
|   206|    William|      8300|     37|            IT|
|   100|     Steven|     24000|     39|            IT|
|   102|        Lex|     17000|     37|     Marketing|
|   103|  Alexander|      9000|     39|     Marketing|
|   104|      Bruce|      6000|     38|            IT|
|   105|      David|      4800|     39|            IT|
|   106|      Valli|      4800|     38|         Sales|
|   107|      Diana|      4200|     35|         Sales|
|   109|     Daniel|      9000|     35|            HR|
|   110|  

### Task 4: Exibir salário médio dos funcionários agrupados por departamento

In [68]:
spark.sql("select emp_department, round(avg(emp_salary), 2) as avg_salary from employees group by emp_department order by avg_salary desc").show()

+--------------+----------+
|emp_department|avg_salary|
+--------------+----------+
|            IT|    7400.0|
|     Marketing|   6633.33|
|            HR|    5837.5|
|       Finance|    5730.8|
|         Sales|   5492.92|
+--------------+----------+



### Task 5: Filtrar departamento de TI

In [69]:
spark.sql("select * from employees where emp_department = 'IT'").show()

+------+--------+----------+-------+--------------+
|emp_id|emp_name|emp_salary|emp_age|emp_department|
+------+--------+----------+-------+--------------+
|   198|  Donald|      2600|     29|            IT|
|   201| Michael|     13000|     32|            IT|
|   206| William|      8300|     37|            IT|
|   100|  Steven|     24000|     39|            IT|
|   104|   Bruce|      6000|     38|            IT|
|   105|   David|      4800|     39|            IT|
|   111|  Ismael|      7700|     32|            IT|
|   129|   Laura|      3300|     38|            IT|
|   132|      TJ|      2100|     34|            IT|
|   136|   Hazel|      2200|     29|            IT|
+------+--------+----------+-------+--------------+



### Task 6: adicionar nova coluna com salário + bônus 10%

In [101]:
employees_df = employees_df.withColumn("SalaryAfterBonus", f.round(employees_df["emp_salary"] * 1.10, 2))

### Task 7: Retornar maior salário para cada idade

In [102]:
employees_df.groupBy('emp_age').agg(f.max('emp_salary').alias('max_salary')).orderBy('emp_age').show()

+-------+----------+
|emp_age|max_salary|
+-------+----------+
|     26|      3600|
|     27|     17000|
|     28|     12008|
|     29|     10000|
|     30|      8000|
|     31|      8200|
|     32|     13000|
|     33|     12008|
|     34|      7800|
|     35|      9000|
|     36|      7900|
|     37|     17000|
|     38|      6000|
|     39|     24000|
+-------+----------+



### Task 8

In [79]:
joined_df = employees_df.join(employees_df, employees_df["emp_id"] == employees_df["emp_id"], how="inner")
joined_df.show()

24/04/25 17:11:16 WARN Column: Constructing trivially true equals predicate, 'emp_id#487 = emp_id#487'. Perhaps you need to use aliases.


+------+-----------+----------+-------+--------------+------------------+------+-----------+----------+-------+--------------+------------------+
|emp_id|   emp_name|emp_salary|emp_age|emp_department|  SalaryAfterBonus|emp_id|   emp_name|emp_salary|emp_age|emp_department|  SalaryAfterBonus|
+------+-----------+----------+-------+--------------+------------------+------+-----------+----------+-------+--------------+------------------+
|   137|     Renske|      3600|     26|     Marketing|3960.0000000000005|   137|     Renske|      3600|     26|     Marketing|3960.0000000000005|
|   133|      Jason|      3300|     38|         Sales|3630.0000000000005|   133|      Jason|      3300|     38|         Sales|3630.0000000000005|
|   108|      Nancy|     12008|     28|         Sales|13208.800000000001|   108|      Nancy|     12008|     28|         Sales|13208.800000000001|
|   101|      Neena|     17000|     27|         Sales|           18700.0|   101|      Neena|     17000|     27|         Sale

### Task 9: Calcular idade média dos funcionários

In [103]:
employees_df.agg(f.avg('emp_age').alias('avg_age')).show()

+-------+
|avg_age|
+-------+
|  33.56|
+-------+



### Task 10: Salário total por departamento

In [104]:
employees_df.groupBy('emp_department').agg(f.sum('emp_salary').alias('total_amount')).show()

+--------------+------------+
|emp_department|total_amount|
+--------------+------------+
|         Sales|       71408|
|            HR|       46700|
|       Finance|       57308|
|     Marketing|       59700|
|            IT|       74000|
+--------------+------------+



### Task 11: Ordenar o DataFrame por idade em ordem crescente e por salário em ordem decrescente.

In [105]:
employees_sorted_df = employees_df.orderBy(f.asc('emp_age'), f.desc('emp_salary'))

### Task 12: Calcular quantidade de funcionários por departamento

In [106]:
employees_df.groupBy('emp_department').agg(f.count('emp_id').alias('emp_per_department')).show()

+--------------+------------------+
|emp_department|emp_per_department|
+--------------+------------------+
|         Sales|                13|
|            HR|                 8|
|       Finance|                10|
|     Marketing|                 9|
|            IT|                10|
+--------------+------------------+



### Task 13: Exibir apenas funcionários que tem a letra "o" no nome

In [93]:
employees_df.filter(employees_df.emp_name.contains('o')).show()

+------+-----------+----------+-------+--------------+----------------+
|emp_id|   emp_name|emp_salary|emp_age|emp_department|SalaryAfterBonus|
+------+-----------+----------+-------+--------------+----------------+
|   198|     Donald|      2600|     29|            IT|          2860.0|
|   199|    Douglas|      2600|     34|         Sales|          2860.0|
|   110|       John|      8200|     31|     Marketing|          9020.0|
|   112|Jose Manuel|      7800|     34|            HR|          8580.0|
|   130|      Mozhe|      2800|     28|     Marketing|          3080.0|
|   133|      Jason|      3300|     38|         Sales|          3630.0|
|   139|       John|      2700|     36|         Sales|          2970.0|
|   140|     Joshua|      2500|     29|       Finance|          2750.0|
+------+-----------+----------+-------+--------------+----------------+

