In [None]:
pip install delta-spark==2.4.0

In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import *

In [2]:
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPratico/bronze/MentalHealth"

from pyspark.sql.types import TimestampType
#define the schema for the dataframe
customSchema = StructType([
    StructField("TimeStamp", TimestampType(), True),        
    StructField("Age", IntegerType(), True),
    StructField("Gender", StringType(), True),
    StructField("Country", StringType(), True),
    StructField("state", StringType(), True),
    StructField("self_employed", StringType(), True),
    StructField("family_history", StringType(), True),
    StructField("treatment", StringType(), True),
    StructField("work_interfere", StringType(), True),
    StructField("no_employees", StringType(), True),
    StructField("remote_worker", StringType(), True),
    StructField("tech_company", StringType(), True),
    StructField("benefits",StringType(), True),
    StructField("care_options", StringType(), True),
    StructField("Wellness_program", StringType(), True),
    StructField("Seek_help", StringType(), True),
    StructField("anonymity", StringType(), True),
    StructField("leave", StringType(), True),
    StructField("mental_health_consequence",StringType(), True),
    StructField("phys_health_consequence", StringType(), True),
    StructField("coworkers", StringType(), True),
    StructField("supervisor", StringType(), True),
    StructField("mental_health_interview",StringType(), True),
    StructField("phys_health_interview", StringType(), True),
    StructField("mental_vs_physical", StringType(), True),
    StructField("observed_consequence", StringType(), True),
    StructField("comments", StringType(), True),
])

mentalHealth_df = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .option("dateFormat", "dd/MM/yyyy HH:mm")\
            .schema(customSchema) \
            .csv(hdfs_path)
mentalHealth_df.show()
mentalHealth_df.printSchema()

In [None]:
#Torna a coluna timestamp que tem incluem data e hora, só para que inclua apenas a data
mentalHealth_df = mentalHealth_df.withColumn("TimeStamp", to_date("TimeStamp"))
mentalHealth_df.show()

In [5]:
#Faz drop de colunas que achamos desnecessários
mentalHealth_df = mentalHealth_df.drop("state", "no_employees", "mental_vs_physical", "observed_consequence", "self_employed", "comments")

In [6]:
#Coloca só as idades abaixo de 72 e acima de 16, acima desse valor ou abaixo é um erro 

idade_filtrada=mentalHealth_df.filter((mentalHealth_df.Age<72) & (mentalHealth_df.Age>16))

idade_filtrada.toPandas()

Unnamed: 0,TimeStamp,Age,Gender,Country,family_history,treatment,work_interfere,remote_worker,tech_company,benefits,...,Wellness_program,Seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview
0,2014-08-27,37,Female,United States,No,Yes,Often,No,Yes,Yes,...,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe
1,2014-08-27,44,M,United States,No,No,Rarely,No,No,Don't know,...,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No
2,2014-08-27,32,Male,Canada,No,No,Rarely,No,Yes,No,...,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes
3,2014-08-27,31,Male,United Kingdom,Yes,Yes,Often,No,Yes,No,...,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe
4,2014-08-27,31,Male,United States,No,No,Never,Yes,Yes,Yes,...,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245,2015-09-12,26,male,United Kingdom,No,Yes,,No,Yes,No,...,No,No,Don't know,Somewhat easy,No,No,Some of them,Some of them,No,No
1246,2015-09-26,32,Male,United States,Yes,Yes,Often,Yes,Yes,Yes,...,No,No,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No
1247,2015-11-07,34,male,United States,Yes,Yes,Sometimes,No,Yes,Yes,...,No,No,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No
1248,2015-11-30,46,f,United States,No,No,,Yes,Yes,No,...,No,No,Don't know,Don't know,Yes,No,No,No,No,No


In [7]:
#Generos não podem ser diferentes de male ou female, não vamos utilizar géneros indefinidos

genero_filtrado = idade_filtrada.filter(mentalHealth_df["Gender"].isin('female', 'f', 'male', 'm', 'woman', 'man', 'Male', 'Female'))

genero_filtrado.toPandas()


Unnamed: 0,TimeStamp,Age,Gender,Country,family_history,treatment,work_interfere,remote_worker,tech_company,benefits,...,Wellness_program,Seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview
0,2014-08-27,37,Female,United States,No,Yes,Often,No,Yes,Yes,...,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe
1,2014-08-27,32,Male,Canada,No,No,Rarely,No,Yes,No,...,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes
2,2014-08-27,31,Male,United Kingdom,Yes,Yes,Often,No,Yes,No,...,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe
3,2014-08-27,31,Male,United States,No,No,Never,Yes,Yes,Yes,...,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes
4,2014-08-27,33,Male,United States,Yes,No,Sometimes,No,Yes,Yes,...,No,Don't know,Don't know,Don't know,No,No,Yes,Yes,No,Maybe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043,2015-09-12,26,male,United Kingdom,No,Yes,,No,Yes,No,...,No,No,Don't know,Somewhat easy,No,No,Some of them,Some of them,No,No
1044,2015-09-26,32,Male,United States,Yes,Yes,Often,Yes,Yes,Yes,...,No,No,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No
1045,2015-11-07,34,male,United States,Yes,Yes,Sometimes,No,Yes,Yes,...,No,No,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No
1046,2015-11-30,46,f,United States,No,No,,Yes,Yes,No,...,No,No,Don't know,Don't know,Yes,No,No,No,No,No


In [None]:
#Tornar todos os genero que sejam ou "Male", "m", "man" para apenas "male"
normalizar_data_male = (
    genero_filtrado
    .withColumn(
        "Gender",
        when((genero_filtrado["Gender"].isin("male", "m", "man", "Male")), "Male")
        .otherwise(genero_filtrado["Gender"])
    )
)
normalizar_data_male.show()

In [9]:
# Tornar todos os genero que sejam ou "female", "f", "woman" para apenas "female"
normalizar_data_female = (
    normalizar_data_male
    .withColumn(
        "Gender",
        when((normalizar_data_male["Gender"].isin("female", "f", "woman", "Female")), "Female")
        .otherwise(normalizar_data_male["Gender"])
    )
)
normalizar_data_female.toPandas()

Unnamed: 0,TimeStamp,Age,Gender,Country,family_history,treatment,work_interfere,remote_worker,tech_company,benefits,...,Wellness_program,Seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview
0,2014-08-27,37,Female,United States,No,Yes,Often,No,Yes,Yes,...,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe
1,2014-08-27,32,Male,Canada,No,No,Rarely,No,Yes,No,...,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes
2,2014-08-27,31,Male,United Kingdom,Yes,Yes,Often,No,Yes,No,...,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe
3,2014-08-27,31,Male,United States,No,No,Never,Yes,Yes,Yes,...,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes
4,2014-08-27,33,Male,United States,Yes,No,Sometimes,No,Yes,Yes,...,No,Don't know,Don't know,Don't know,No,No,Yes,Yes,No,Maybe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043,2015-09-12,26,Male,United Kingdom,No,Yes,,No,Yes,No,...,No,No,Don't know,Somewhat easy,No,No,Some of them,Some of them,No,No
1044,2015-09-26,32,Male,United States,Yes,Yes,Often,Yes,Yes,Yes,...,No,No,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No
1045,2015-11-07,34,Male,United States,Yes,Yes,Sometimes,No,Yes,Yes,...,No,No,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No
1046,2015-11-30,46,Female,United States,No,No,,Yes,Yes,No,...,No,No,Don't know,Don't know,Yes,No,No,No,No,No


In [11]:
#Faço select so com as colunas que eu tenho, as que eu removi nao vou utilizarn
#Guarda o dataframe num parquet no deltaTable
normalizar_data_female \
    .select("TimeStamp","Age", "Gender" ,"Country", "family_history", "treatment", "work_interfere" , "remote_worker", "tech_company", "benefits", "care_options", "Wellness_program", "Seek_help",
           "anonymity", "leave", "mental_health_consequence", "phys_health_consequence", "coworkers", "supervisor", "mental_health_interview", "phys_health_interview") \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/MentalHealth_DeltaTable/")

In [None]:
deltaTable = DeltaTable.forPath(spark, "hdfs://hdfs-nn:9000/TrabalhoPratico/silver/MentalHealth_DeltaTable")

deltaTable.history().show()

In [12]:
normalizar_data_female.toPandas()

Unnamed: 0,TimeStamp,Age,Gender,Country,family_history,treatment,work_interfere,remote_worker,tech_company,benefits,...,Wellness_program,Seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview
0,2014-08-27,37,Female,United States,No,Yes,Often,No,Yes,Yes,...,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe
1,2014-08-27,32,Male,Canada,No,No,Rarely,No,Yes,No,...,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes
2,2014-08-27,31,Male,United Kingdom,Yes,Yes,Often,No,Yes,No,...,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe
3,2014-08-27,31,Male,United States,No,No,Never,Yes,Yes,Yes,...,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes
4,2014-08-27,33,Male,United States,Yes,No,Sometimes,No,Yes,Yes,...,No,Don't know,Don't know,Don't know,No,No,Yes,Yes,No,Maybe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043,2015-09-12,26,Male,United Kingdom,No,Yes,,No,Yes,No,...,No,No,Don't know,Somewhat easy,No,No,Some of them,Some of them,No,No
1044,2015-09-26,32,Male,United States,Yes,Yes,Often,Yes,Yes,Yes,...,No,No,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No
1045,2015-11-07,34,Male,United States,Yes,Yes,Sometimes,No,Yes,Yes,...,No,No,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No
1046,2015-11-30,46,Female,United States,No,No,,Yes,Yes,No,...,No,No,Don't know,Don't know,Yes,No,No,No,No,No


In [13]:
#Mostra a delta table criada
spark.sql(
    """
    Select *
    from silver.MentalHealth_DeltaTable
    """
).toPandas()


Unnamed: 0,TimeStamp,Age,Gender,Country,family_history,treatment,work_interfere,remote_worker,tech_company,benefits,...,Wellness_program,Seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview
0,2014-08-27,37,Female,United States,No,Yes,Often,No,Yes,Yes,...,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe
1,2014-08-27,32,Male,Canada,No,No,Rarely,No,Yes,No,...,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes
2,2014-08-27,31,Male,United Kingdom,Yes,Yes,Often,No,Yes,No,...,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe
3,2014-08-27,31,Male,United States,No,No,Never,Yes,Yes,Yes,...,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes
4,2014-08-27,33,Male,United States,Yes,No,Sometimes,No,Yes,Yes,...,No,Don't know,Don't know,Don't know,No,No,Yes,Yes,No,Maybe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043,2015-09-12,26,Male,United Kingdom,No,Yes,,No,Yes,No,...,No,No,Don't know,Somewhat easy,No,No,Some of them,Some of them,No,No
1044,2015-09-26,32,Male,United States,Yes,Yes,Often,Yes,Yes,Yes,...,No,No,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No
1045,2015-11-07,34,Male,United States,Yes,Yes,Sometimes,No,Yes,Yes,...,No,No,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No
1046,2015-11-30,46,Female,United States,No,No,,Yes,Yes,No,...,No,No,Don't know,Don't know,Yes,No,No,No,No,No


In [14]:
spark.stop()