In [9]:
pip install delta-spark==2.4.0

Note: you may need to restart the kernel to use updated packages.


In [10]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import *

In [11]:
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [12]:
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPratico/bronze/StudentMentalHealth"

from pyspark.sql.types import TimestampType
#define the schema for the dataframe
customSchema = StructType([
     StructField("TimeStamp", StringType(), True),     
    StructField("Choose_your_gender", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("What_is_your_course", StringType(), True),
    StructField("Your_current_year_of_study", StringType(), True),
    StructField("What_is_your_GPA", StringType(), True),
    StructField("Marital_status", StringType(), True),
    StructField("Do_you_have_depression", StringType(), True),
    StructField("Do_you_have_anxiety", StringType(), True),
    StructField("Do_you_have_panic_attack", StringType(), True),
    StructField("Did_you_seek_any_specialist_for_treatment", StringType(), True)
])
StudentmentalHealth_df = spark \
            .read\
            .option("delimiter",";")\
            .option("header","true")\
            .option("dateFormat", "dd/MM/yyyy HH:mm")\
            .schema(customSchema) \
            .csv(hdfs_path)
StudentmentalHealth_df.toPandas()
StudentmentalHealth_df.printSchema()

root
 |-- TimeStamp: string (nullable = true)
 |-- Choose_your_gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- What_is_your_course: string (nullable = true)
 |-- Your_current_year_of_study: string (nullable = true)
 |-- What_is_your_GPA: string (nullable = true)
 |-- Marital_status: string (nullable = true)
 |-- Do_you_have_depression: string (nullable = true)
 |-- Do_you_have_anxiety: string (nullable = true)
 |-- Do_you_have_panic_attack: string (nullable = true)
 |-- Did_you_seek_any_specialist_for_treatment: string (nullable = true)



In [13]:
StudentmentalHealth_df.toPandas()

Unnamed: 0,TimeStamp,Choose_your_gender,Age,What_is_your_course,Your_current_year_of_study,What_is_your_GPA,Marital_status,Do_you_have_depression,Do_you_have_anxiety,Do_you_have_panic_attack,Did_you_seek_any_specialist_for_treatment
0,08/07/2020 12:02,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,08/07/2020 12:04,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,08/07/2020 12:05,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,08/07/2020 12:06,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,08/07/2020 12:13,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...
96,13/07/2020 19:56,Female,21.0,BCS,year 1,3.50 - 4.00,No,No,Yes,No,No
97,13/07/2020 21:21,Male,18.0,Engineering,Year 2,3.00 - 3.49,No,Yes,Yes,No,No
98,13/07/2020 21:22,Female,19.0,Nursing,Year 3,3.50 - 4.00,Yes,Yes,No,Yes,No
99,13/07/2020 21:23,Female,23.0,Pendidikan Islam,year 4,3.50 - 4.00,No,No,No,No,No


In [14]:
#Em Python
# Para por valores vazios em null na coluna "Age"
StudentmentalHealth_df = StudentmentalHealth_df.withColumn("Age", when(col("Age") == '', None).otherwise(col("Age")))

StudentmentalHealth_df.show()

+----------------+------------------+---+-------------------+--------------------------+----------------+--------------+----------------------+-------------------+------------------------+-----------------------------------------+
|       TimeStamp|Choose_your_gender|Age|What_is_your_course|Your_current_year_of_study|What_is_your_GPA|Marital_status|Do_you_have_depression|Do_you_have_anxiety|Do_you_have_panic_attack|Did_you_seek_any_specialist_for_treatment|
+----------------+------------------+---+-------------------+--------------------------+----------------+--------------+----------------------+-------------------+------------------------+-----------------------------------------+
|08/07/2020 12:02|            Female| 18|        Engineering|                    year 1|     3.00 - 3.49|            No|                   Yes|                 No|                     Yes|                                       No|
|08/07/2020 12:04|              Male| 21|  Islamic education|               

In [None]:
StudentmentalHealth_df\
    .select("TimeStamp", "Choose_your_gender","Age","What_is_your_course", "Your_current_year_of_study", "What_is_your_GPA", "Marital_status" , "Do_you_have_depression", "Do_you_have_anxiety", "Do_you_have_panic_attack", "Did_you_seek_any_specialist_for_treatment")\
    .write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/StudentMentalHealth_DeltaTable/")

In [None]:
spark.stop()