In [8]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import WindowSpec
from pyspark.sql.window import Window

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.TrabalhoPratico.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
spark.sql(
     """
     Show tables from silver
     """
).show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|   silver|genderstatsdata_d...|      false|
+---------+--------------------+-----------+



In [4]:
#read hdfs file to dataframe
#
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPratico/bronze/genderStatsData"
#define the schema for the dataframe
customSchema = StructType([
    StructField("countryName", StringType(), False),        
    StructField("countryCode", StringType(), False),
    StructField("indicatorName", StringType(), False),
    StructField("indicatorCode", StringType(), False),
    StructField("y1960", StringType(), True),
    StructField("y1961", StringType(), True),
    StructField("y1962", StringType(), True),
    StructField("y1963", StringType(), True),
    StructField("y1964", StringType(), True),
    StructField("y1965", StringType(), True),
    StructField("y1966", StringType(), True),
    StructField("y1967", StringType(), True),
    StructField("y1968", StringType(), True),
    StructField("y1969", StringType(), True),
    StructField("y1970", StringType(), True),
    StructField("y1971", StringType(), True),
    StructField("y1972", StringType(), True),
    StructField("y1973", StringType(), True),
    StructField("y1974", StringType(), True),
    StructField("y1975", StringType(), True),
    StructField("y1976", StringType(), True),
    StructField("y1977", StringType(), True),
    StructField("y1978", StringType(), True),
    StructField("y1979", StringType(), True),
    StructField("y1980", StringType(), True),
    StructField("y1981", StringType(), True),
    StructField("y1982", StringType(), True),
    StructField("y1983", StringType(), True),
    StructField("y1984", StringType(), True),
    StructField("y1985", StringType(), True),
    StructField("y1986", StringType(), True),
    StructField("y1987", StringType(), True),
    StructField("y1988", StringType(), True),
    StructField("y1989", StringType(), True),
    StructField("y1990", StringType(), True),
    StructField("y1991", StringType(), True),
    StructField("y1992", StringType(), True),
    StructField("y1993", StringType(), True),
    StructField("y1994", StringType(), True),
    StructField("y1995", StringType(), True),
    StructField("y1996", StringType(), True),
    StructField("y1997", StringType(), True),
    StructField("y1998", StringType(), True),
    StructField("y1999", StringType(), True),
    StructField("y2000", StringType(), True),
    StructField("y2001", StringType(), True),
    StructField("y2002", StringType(), True),
    StructField("y2003", StringType(), True),
    StructField("y2004", StringType(), True),
    StructField("y2005", StringType(), True),
    StructField("y2006", StringType(), True),
    StructField("y2007", StringType(), True),
    StructField("y2008", StringType(), True),
    StructField("y2009", StringType(), True),
    StructField("y2010", StringType(), True),
    StructField("y2011", StringType(), True),
    StructField("y2012", StringType(), True),
    StructField("y2013", StringType(), True),
    StructField("y2014", StringType(), True),
    StructField("y2015", StringType(), True),
    StructField("y2016", StringType(), True),
    StructField("y2017", StringType(), True),
    StructField("y2018", StringType(), True),
    StructField("y2019", StringType(), True),
    StructField("y2020", StringType(), True),
    StructField("y2021", StringType(), True),
    StructField("y2022", StringType(), True)
])

genderStatsData_dt = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)

genderStatsData_dt.show()

+--------------------+-----------+--------------------+-----------------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------+----------------+----------------+----------------+----------------+--------------+----------------+----------------+-----+
|         countryName|countryCode|       indicatorName|    indicatorCode|y1960|y1961|y1962|y1963|y1964|y1965|y1966|y1967|y1968|y1969|y1970|y1971|y1972|y1973|y1974|y1975|y1976|y1977|y1978|y1979|y1980|y1981|y1982|y1983|y1984|y1985|y1986|y1987|y1988|y1989|y1990|y1991|y1992|y1993|y1994|y1995|y1996|y1997|y1998|y1999|y20

In [9]:
import pandas as pd

#apagar colunas do Data Frame
genderStatsData_dt.drop('y1960', 'y1961', 'y1962', 'y1963', 'y1964', 'y1965', 'y1966', 'y1967', 'y1968', 'y1969','y1970', 'y1971', 'y1972', 'y1973', 'y1974', 'y1975', 'y1976', 'y1977', 'y1978', 'y1979','y1980', 'y1981', 'y1982', 'y1983', 'y1984', 'y1985', 'y1986', 'y1987', 'y1988', 'y1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999')

#Drop linhas a null
#genderStatsData_fn = genderStatsData_dt.dropna()

#genderStatsData_fn.toPandas()
genderStatsData_dt.toPandas()

Unnamed: 0,countryName,countryCode,indicatorName,indicatorCode,y1960,y1961,y1962,y1963,y1964,y1965,...,y2013,y2014,y2015,y2016,y2017,y2018,y2019,y2020,y2021,y2022
0,Africa Eastern and Southern,AFE,A woman can apply for a passport in the same w...,SG.APL.PSPT.EQ,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,A woman can be head of household in the same w...,SG.HLD.HEAD.EQ,,,,,,,...,,,,,,,,,,
2,Africa Eastern and Southern,AFE,A woman can choose where to live in the same w...,SG.LOC.LIVE.EQ,,,,,,,...,,,,,,,,,,
3,Africa Eastern and Southern,AFE,A woman can get a job in the same way as a man...,SG.GET.JOBS.EQ,,,,,,,...,,,,,,,,,,
4,Africa Eastern and Southern,AFE,A woman can obtain a judgment of divorce in th...,SG.OBT.DVRC.EQ,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305540,Zimbabwe,ZWE,Worried about not having enough money for old ...,fin44a1.d.2,,,,,,,...,,,,,,,,,68.02,
305541,Zimbabwe,ZWE,"Youth illiterate population, 15-24 years, % fe...",UIS.LPP.AG15T24,,,,,,,...,,37.14571,,,,,,,,
305542,Zimbabwe,ZWE,"Youth illiterate population, 15-24 years, both...",UIS.LP.AG15T24,,,,,,,...,,267220,,,,,,,,
305543,Zimbabwe,ZWE,"Youth illiterate population, 15-24 years, fema...",UIS.LP.AG15T24.F,,,,,,,...,,99261,,,,,,,,


In [15]:
genderStatsData_final \
    .select("countryName","countryCode", "indicatorName", "indicatorCode", "y2000", "y2001", "y2002", "y2003", "y2004", "y2005", "y2006", "y2007", "y2008", "y2009", "y2010", "y2011", "y2012", "y2013", "y2014", "y2015", "y2016", "y2017", "y2018", "y2019", "y2020", "y2021", "y2022" ) \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/genderStatsData_DeltaTable") 