In [None]:
pip install delta-spark==2.4.0

In [1]:

from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, DoubleType

In [2]:
# HDFS directory used as the Spark SQL warehouse root.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

# Session builder with Hive metastore support plus the Delta Lake SQL extension and catalog.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    # Fix: the warehouse location must be set through "spark.sql.warehouse.dir".
    # The previous key "spark.sql.TrabalhoPratico.dir" is not a Spark setting, so
    # warehouse_location was silently ignored.
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Let the delta-spark helper finish configuring the builder, then create
# (or reuse) the session used by every following cell.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
# Read the bronze-layer Gender Inequality Index CSV from HDFS into a Spark DataFrame.
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPratico/bronze/GenderInequalityIndex"

# Explicit schema for the DataFrame: six descriptive string columns, two integer
# rank columns, then one nullable double column per year, Index_1990 .. Index_2021.
customSchema = StructType(
    [
        StructField("ISO3", StringType(), True),
        StructField("Pais", StringType(), True),
        StructField("Continente", StringType(), True),
        StructField("Hemisferio", StringType(), True),
        StructField("Desenvolvimento_Humano", StringType(), True),
        StructField("UNDP_Regions", StringType(), True),
        StructField("UNDP_rank", IntegerType(), True),
        StructField("GII_rank", IntegerType(), True),
    ]
    # The yearly columns are generated instead of being listed 32 times;
    # range(1990, 2022) yields the same names, in the same order, as before.
    + [StructField(f"Index_{year}", DoubleType(), True) for year in range(1990, 2022)]
)

# Despite its name this is a plain DataFrame read from CSV; the name is kept
# because later cells refer to it.
GenderInequalityIndex_DeltaTable = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .schema(customSchema)
    .csv(hdfs_path)
)

# The former toPandas() call here was removed: its result was discarded (it was
# not the last expression of the cell) yet it still collected the whole dataset
# to the driver.
GenderInequalityIndex_DeltaTable.printSchema()

root
 |-- ISO3: string (nullable = true)
 |-- Pais: string (nullable = true)
 |-- Continente: string (nullable = true)
 |-- Hemisferio: string (nullable = true)
 |-- Desenvolvimento_Humano: string (nullable = true)
 |-- UNDP_Regions: string (nullable = true)
 |-- UNDP_rank: integer (nullable = true)
 |-- GII_rank: integer (nullable = true)
 |-- Index_1990: double (nullable = true)
 |-- Index_1991: double (nullable = true)
 |-- Index_1992: double (nullable = true)
 |-- Index_1993: double (nullable = true)
 |-- Index_1994: double (nullable = true)
 |-- Index_1995: double (nullable = true)
 |-- Index_1996: double (nullable = true)
 |-- Index_1997: double (nullable = true)
 |-- Index_1998: double (nullable = true)
 |-- Index_1999: double (nullable = true)
 |-- Index_2000: double (nullable = true)
 |-- Index_2001: double (nullable = true)
 |-- Index_2002: double (nullable = true)
 |-- Index_2003: double (nullable = true)
 |-- Index_2004: double (nullable = true)
 |-- Index_2005: double (nul

In [6]:
# Display the raw data: toPandas() collects the full Spark DataFrame to the driver
# and, as the last expression of the cell, the resulting pandas frame is rendered.
GenderInequalityIndex_DeltaTable.toPandas()

Unnamed: 0,ISO3,Pais,Continente,Hemisferio,Desenvolvimento_Humano,UNDP_Regions,UNDP_rank,GII_rank,Index_1990,Index_1991,...,Index_2012,Index_2013,Index_2014,Index_2015,Index_2016,Index_2017,Index_2018,Index_2019,Index_2020,Index_2021
0,AFG,Afghanistan,Asia,Northern Hemisphere,Low,SA,180.0,167.0,,,...,0.738,0.728,0.718,0.706,0.692,0.678,0.671,0.665,0.674,0.678
1,AGO,Angola,Africa,Southern Hemisphere,Medium,SSA,148.0,136.0,0.725,0.723,...,0.545,0.540,0.531,0.530,0.529,0.538,0.537,0.537,0.537,0.537
2,ALB,Albania,Europe,Northern Hemisphere,High,ECA,67.0,39.0,,,...,0.235,0.225,0.219,0.204,0.191,0.170,0.164,0.156,0.156,0.144
3,AND,Andorra,Europe,Northern Hemisphere,Very High,,40.0,,,,...,,,,,,,,,,
4,ARE,United Arab Emirates,Asia,Northern Hemisphere,Very High,AS,26.0,11.0,0.659,0.647,...,0.171,0.161,0.151,0.126,0.118,0.112,0.103,0.056,0.050,0.049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,WSM,Samoa,Oceania,Southern Hemisphere,High,EAP,111.0,99.0,,,...,0.482,0.475,0.447,0.443,0.409,0.407,0.407,0.405,0.404,0.418
191,YEM,Yemen,Asia,Northern Hemisphere,Low,AS,183.0,170.0,,,...,0.800,0.802,0.806,0.815,0.810,0.808,0.806,0.785,0.784,0.820
192,ZAF,South Africa,Africa,Southern Hemisphere,High,SSA,109.0,97.0,0.511,0.502,...,0.432,0.431,0.433,0.427,0.418,0.407,0.405,0.410,0.408,0.405
193,ZMB,Zambia,Africa,Southern Hemisphere,Medium,SSA,154.0,138.0,0.666,0.660,...,0.585,0.581,0.579,0.567,0.545,0.540,0.537,0.534,0.535,0.540


In [7]:
# Keep only the rows that have no null in any column (the default "any" policy).
# NOTE(review): judging by the displayed outputs this shrinks the data from
# 194 rows to 88 - confirm that discarding countries with partial history is intended.
df = GenderInequalityIndex_DeltaTable.na.drop()

In [8]:
# Display the cleaned data: collects the null-free DataFrame to the driver as a
# pandas frame, which is rendered as the cell output.
df.toPandas()

Unnamed: 0,ISO3,Pais,Continente,Hemisferio,Desenvolvimento_Humano,UNDP_Regions,UNDP_rank,GII_rank,Index_1990,Index_1991,...,Index_2012,Index_2013,Index_2014,Index_2015,Index_2016,Index_2017,Index_2018,Index_2019,Index_2020,Index_2021
0,AGO,Angola,Africa,Southern Hemisphere,Medium,SSA,148,136,0.725,0.723,...,0.545,0.540,0.531,0.530,0.529,0.538,0.537,0.537,0.537,0.537
1,ARE,United Arab Emirates,Asia,Northern Hemisphere,Very High,AS,26,11,0.659,0.647,...,0.171,0.161,0.151,0.126,0.118,0.112,0.103,0.056,0.050,0.049
2,ARG,Argentina,America,Southern Hemisphere,Very High,LAC,47,69,0.442,0.439,...,0.360,0.356,0.351,0.345,0.338,0.328,0.315,0.306,0.293,0.287
3,ARM,Armenia,Asia,Northern Hemisphere,High,ECA,85,53,0.470,0.468,...,0.327,0.305,0.313,0.309,0.306,0.261,0.260,0.239,0.239,0.216
4,AZE,Azerbaijan,Asia,Northern Hemisphere,High,ECA,91,70,0.334,0.343,...,0.340,0.342,0.345,0.335,0.328,0.311,0.307,0.314,0.301,0.294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,VEN,Venezuela,America,Northern Hemisphere,Medium,LAC,120,123,0.563,0.560,...,0.491,0.488,0.486,0.487,0.499,0.481,0.483,0.492,0.492,0.492
85,VNM,Viet Nam,Asia,Northern Hemisphere,High,EAP,115,71,0.410,0.409,...,0.319,0.320,0.317,0.316,0.306,0.303,0.304,0.305,0.305,0.296
86,ZAF,South Africa,Africa,Southern Hemisphere,High,SSA,109,97,0.511,0.502,...,0.432,0.431,0.433,0.427,0.418,0.407,0.405,0.410,0.408,0.405
87,ZMB,Zambia,Africa,Southern Hemisphere,Medium,SSA,154,138,0.666,0.660,...,0.585,0.581,0.579,0.567,0.545,0.540,0.537,0.534,0.535,0.540


In [9]:
# Persist the cleaned DataFrame to the silver layer in Delta format.
# "overwrite" replaces any data already at the target path, and
# overwriteSchema=true allows the stored schema to be replaced as well.
(
    df.write
    .format("delta")
    .option("overwriteSchema", "true")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/GenderInequalityIndex_DeltaTable")
)

In [10]:
# Shut down the Spark session now that the silver table has been written.
spark.stop()