### Importando as bibliotecas

In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, when

### Configurações do banco de dados

In [None]:
# MySQL connection settings, read from environment variables.
# os.getenv returns None for any variable that is not set.
mysql_host = os.getenv('MYSQL_HOST')
# MYSQL_PORT is optional: int(None) raises TypeError when the variable is
# missing, so an unset or empty value falls back to MySQL's standard port.
mysql_port = int(os.getenv('MYSQL_PORT') or 3306)
mysql_user = os.getenv('MYSQL_USER')
mysql_password = os.getenv('MYSQL_PASSWORD')
mysql_db = os.getenv('MYSQL_DB')
mysql_table = os.getenv('MYSQL_TABLE')


### Reading the dataset

In [None]:

# Load the anaemia CSV from DBFS, treating the first row of the file as the header.
csv_path = "dbfs:/FileStore/shared_uploads/estevanlima16@gmail.com/anaemia_prediction.csv"
df = spark.read.option("header", "true").csv(csv_path)



In [None]:
# Preview the DataFrame to confirm the CSV was read correctly
# (20 rows with long values truncated, i.e. the defaults of show()).
df.show(n=20, truncate=True)

+------+---+----------+------------+-----------+----+-------+
|Number|Sex|%Red Pixel|%Green pixel|%Blue pixel|  Hb|Anaemic|
+------+---+----------+------------+-----------+----+-------+
|     1|  M|   43.2555|     30.8421|    25.9025| 6.3|    Yes|
|     2|  F|   45.6033|       28.19|    26.2067|13.5|     No|
|     3| F |   45.0107|     28.9677|    26.0215|11.7|     No|
|     4|  F|   44.5398|     28.9899|    26.4703|13.5|     No|
|     5| M |    43.287|     30.6972|    26.0158|12.4|     No|
|     6|  M|   45.0994|     27.9645|    26.9361|16.2|     No|
|     7|  F|   43.1457|     30.1628|    26.6915| 8.6|    Yes|
|     8| F |   43.6103|     29.1099|    27.2798|10.3|     No|
|     9|  F|   45.0423|      29.166|    25.7918|  13|     No|
|    10|  F|   46.5143|     27.4282|    26.0575| 9.7|    Yes|
|    11|  F|   45.3506|     29.1248|    25.5246|12.6|     No|
|    12|  F|   44.4062|     28.9298|     26.664|15.4|     No|
|    13|  F|   44.9642|     30.5279|    24.5079| 4.8|    Yes|
|    14|

### Preparing data

In [None]:
# Normalise the categorical text columns before comparing them. The raw CSV
# contains padded values such as "M " and "F ", which an exact match against
# "M" misses, so padded male rows were being coded as sex_id 2.
sex_clean = trim(col("Sex"))
anaemic_clean = trim(col("Anaemic"))

# Build the fact table in a single projection: derive the coded columns and
# rename the rest. The reader cell sets no schema and no inferSchema option,
# so every CSV column arrives as a string; the measures are cast to numeric
# types here so the fact table does not store numbers as text.
# NOTE(review): any Sex value other than "M" (including null) is coded as 2,
# and any Anaemic value other than "Yes" is coded as 0 - confirm that this is
# the intended handling for missing or unexpected values.
fact_df = df.select(
    col("Number").cast("int").alias("case_id"),
    when(sex_clean == "M", 1).otherwise(2).alias("sex_id"),
    col("Hb").cast("double").alias("hemoglobin_level"),
    col("%Red Pixel").cast("double").alias("red_pixel_percentage"),
    col("%Green pixel").cast("double").alias("green_pixel_percentage"),
    col("%Blue pixel").cast("double").alias("blue_pixel_percentage"),
    when(anaemic_clean == "Yes", 1).otherwise(0).alias("anaemic_status"),
)

# Preview the result to check the derived columns
fact_df.show()

+-------+------+----------------+--------------------+----------------------+---------------------+--------------+
|case_id|sex_id|hemoglobin_level|red_pixel_percentage|green_pixel_percentage|blue_pixel_percentage|anaemic_status|
+-------+------+----------------+--------------------+----------------------+---------------------+--------------+
|      1|     1|             6.3|             43.2555|               30.8421|              25.9025|             1|
|      2|     2|            13.5|             45.6033|                 28.19|              26.2067|             0|
|      3|     2|            11.7|             45.0107|               28.9677|              26.0215|             0|
|      4|     2|            13.5|             44.5398|               28.9899|              26.4703|             0|
|      5|     2|            12.4|              43.287|               30.6972|              26.0158|             0|
|      6|     1|            16.2|             45.0994|               27.9645|   

### Loading data into fact table named 'fact_anaemia'

In [None]:
# Write the fact DataFrame to the MySQL table "fact_anaemia".
# The port read from MYSQL_PORT is passed explicitly; previously it was loaded
# but never handed to the writer, so a non-default port was silently ignored.
# NOTE(review): no save mode is set, so Spark's default applies (the write
# errors if the target table already exists). Confirm whether "append" is
# wanted for repeated loads.
(
    fact_df.write
    .format("mysql")
    .option("host", mysql_host)
    .option("port", mysql_port)
    .option("database", mysql_db)
    .option("dbtable", "fact_anaemia")
    .option("user", mysql_user)
    .option("password", mysql_password)
    .save()
)