In [23]:
pip install delta-spark==2.4.0

Note: you may need to restart the kernel to use updated packages.


In [24]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, DoubleType

In [25]:
# HDFS directory handed to Spark as its SQL warehouse location.
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

# Session builder: Hive metastore connection plus the Delta Lake SQL extension and catalog.
# The chain is wrapped in parentheses instead of backslash continuations so that a
# stray trailing `\` cannot silently swallow the following line.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    # `spark.sql.warehouse.dir` is the key Spark reads for the warehouse location.
    # The previous key, `spark.sql.TrabalhoPratico.dir`, is not a Spark setting, so
    # `warehouse_location` was never actually applied.
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip presumably sets spark.jars.packages
    # itself for the installed delta-spark version; kept here to preserve the original
    # configuration - confirm before removing.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Create the session (or reuse one that already exists) with the Delta artifacts attached.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Location of the bronze-layer CSV file that this cell loads.
hdfs_path = "hdfs://hdfs-nn:9000/TrabalhoPratico/bronze/Gender_StatsCountry.csv"

# Column names in file order. Every column is read as a nullable string.
# NOTE(review): "2AplhaCode", "GovernmentAccountingConcenpt" and
# "VitalResgistrationComplete" look misspelled, but they are kept verbatim because
# they define the column names of the table written later in the notebook.
column_names = [
    "CountryCode",
    "ShortName",
    "TableName",
    "LongName",
    "2AplhaCode",
    "CurrencyUnit",
    "SpecialNotes",
    "Region",
    "IncomeGroup",
    "WB2Code",
    "NationalAccountsBaseYear",
    "NationalAccountsReferenceYear",
    "SNAPriceValuation",
    "LendingCategory",
    "OtherGroups",
    "SystemOfNationalAccounts",
    "AlternativeConversionFactor",
    "PPPSurveyYear",
    "BalanceOfPaymentsManualInUse",
    "ExternalDebtReportingStatus",
    "SystemOfTrade",
    "GovernmentAccountingConcenpt",
    "IMFDataDisseminationStandard",
    "LatestPopulationCensus",
    "LatestHouseholdSurvey",
    "SourceOfMostIncome",
    "VitalResgistrationComplete",
    "LatestAgriculturalCensus",
    "LatestIndustrialData",
    "LatestTradeData",
]

# Define the schema for the DataFrame: one nullable StringType field per column.
customSchema = StructType(
    [StructField(name, StringType(), True) for name in column_names]
)

# Read the CSV with the explicit schema. The first line is treated as a header and
# quoted values are allowed to span several lines.
# NOTE(review): "escapeQuotes" is documented as a CSV *write* option and presumably
# has no effect on read - confirm before removing.
genderStatsCountry_DeltaTable = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .option("escapeQuotes", "false")
    .option("multiline", "true")
    .schema(customSchema)
    .csv(hdfs_path)
)

# The former mid-cell `toPandas()` call was removed: its result was discarded (it was
# not the last expression of the cell) yet it still collected the data to the driver.
genderStatsCountry_DeltaTable.printSchema()
    

In [27]:
# Convert the Spark DataFrame to a pandas DataFrame (this collects all rows to the
# driver) so that the notebook renders it as a table.
genderStatsCountry_DeltaTable.toPandas()

Unnamed: 0,CountryCode,ShortName,TableName,LongName,2AplhaCode,CurrencyUnit,SpecialNotes,Region,IncomeGroup,WB2Code,...,SystemOfTrade,GovernmentAccountingConcenpt,IMFDataDisseminationStandard,LatestPopulationCensus,LatestHouseholdSurvey,SourceOfMostIncome,VitalResgistrationComplete,LatestAgriculturalCensus,LatestIndustrialData,LatestTradeData
0,ABW,Aruba,Aruba,Aruba,AW,Aruban florin,,Latin America & Caribbean,High income,AW,...,General trade system,,Enhanced General Data Dissemination System (e-...,2020 (expected),,,Yes,,,2018
1,AFE,Africa Eastern and Southern,Africa Eastern and Southern,Africa Eastern and Southern,ZH,,"26 countries, stretching from the Red Sea in t...",,,ZH,...,,,,,,,,,,
2,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,The reporting period for national accounts dat...,South Asia,Low income,AF,...,General trade system,Consolidated central government,Enhanced General Data Dissemination System (e-...,1979,"Demographic and Health Survey, 2015","Integrated household survey (IHS), 2016/17",,,,2018
3,AFW,Africa Western and Central,Africa Western and Central,Africa Western and Central,ZI,,"22 countries, stretching from the westernmost ...",,,ZI,...,,,,,,,,,,
4,AGO,Angola,Angola,People's Republic of Angola,AO,Angolan kwanza,The World Bank systematically assesses the app...,Sub-Saharan Africa,Lower middle income,AO,...,General trade system,Budgetary central government,Enhanced General Data Dissemination System (e-...,2014,"Demographic and Health Survey, 2015/16","Integrated household survey (IHS), 2008/09",,,,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,XKX,Kosovo,Kosovo,Republic of Kosovo,XK,Euro,,Europe & Central Asia,Upper middle income,XK,...,,,Enhanced General Data Dissemination System (e-...,2011,"Multiple Indicator Cluster Survey, 2019/20","Expenditure survey/budget survey (ES/BS), 2015",,2014,,
261,YEM,Yemen,"Yemen, Rep.",Republic of Yemen,YE,Yemeni rial,The World Bank systematically assesses the app...,Middle East & North Africa,Low income,RY,...,Special trade system,,Enhanced General Data Dissemination System (e-...,2004,"Demographic and Health Survey, 2013","Expenditure survey/budget survey (ES/BS), 2014",,,2012,2015
262,ZAF,South Africa,South Africa,Republic of South Africa,ZA,South African rand,Fiscal year end: March 31; reporting period fo...,Sub-Saharan Africa,Upper middle income,ZA,...,General trade system,Consolidated central government,Special Data Dissemination Standard (SDDS),2011,"Demographic and Health Survey, 2016","Expenditure survey/budget survey (ES/BS), 2014/15",,2007,2010,2018
263,ZMB,Zambia,Zambia,Republic of Zambia,ZM,New Zambian kwacha,National accounts data were rebased to reflect...,Sub-Saharan Africa,Lower middle income,ZM,...,General trade system,Budgetary central government,Enhanced General Data Dissemination System (e-...,2020 (expected),"Demographic and Health Survey, 2018","Integrated household survey (IHS), 2015",,,1994,2018


In [28]:
# Remove the two columns that are not kept, rebinding the same name to the result.
genderStatsCountry_DeltaTable = genderStatsCountry_DeltaTable.drop(
    "AlternativeConversionFactor",
    "PPPSurveyYear",
)

In [29]:
# Render the current DataFrame as a pandas table (collects all rows to the driver)
# to inspect it after the preceding column drop.
genderStatsCountry_DeltaTable.toPandas()

Unnamed: 0,CountryCode,ShortName,TableName,LongName,2AplhaCode,CurrencyUnit,SpecialNotes,Region,IncomeGroup,WB2Code,...,SystemOfTrade,GovernmentAccountingConcenpt,IMFDataDisseminationStandard,LatestPopulationCensus,LatestHouseholdSurvey,SourceOfMostIncome,VitalResgistrationComplete,LatestAgriculturalCensus,LatestIndustrialData,LatestTradeData
0,ABW,Aruba,Aruba,Aruba,AW,Aruban florin,,Latin America & Caribbean,High income,AW,...,General trade system,,Enhanced General Data Dissemination System (e-...,2020 (expected),,,Yes,,,2018
1,AFE,Africa Eastern and Southern,Africa Eastern and Southern,Africa Eastern and Southern,ZH,,"26 countries, stretching from the Red Sea in t...",,,ZH,...,,,,,,,,,,
2,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,The reporting period for national accounts dat...,South Asia,Low income,AF,...,General trade system,Consolidated central government,Enhanced General Data Dissemination System (e-...,1979,"Demographic and Health Survey, 2015","Integrated household survey (IHS), 2016/17",,,,2018
3,AFW,Africa Western and Central,Africa Western and Central,Africa Western and Central,ZI,,"22 countries, stretching from the westernmost ...",,,ZI,...,,,,,,,,,,
4,AGO,Angola,Angola,People's Republic of Angola,AO,Angolan kwanza,The World Bank systematically assesses the app...,Sub-Saharan Africa,Lower middle income,AO,...,General trade system,Budgetary central government,Enhanced General Data Dissemination System (e-...,2014,"Demographic and Health Survey, 2015/16","Integrated household survey (IHS), 2008/09",,,,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,XKX,Kosovo,Kosovo,Republic of Kosovo,XK,Euro,,Europe & Central Asia,Upper middle income,XK,...,,,Enhanced General Data Dissemination System (e-...,2011,"Multiple Indicator Cluster Survey, 2019/20","Expenditure survey/budget survey (ES/BS), 2015",,2014,,
261,YEM,Yemen,"Yemen, Rep.",Republic of Yemen,YE,Yemeni rial,The World Bank systematically assesses the app...,Middle East & North Africa,Low income,RY,...,Special trade system,,Enhanced General Data Dissemination System (e-...,2004,"Demographic and Health Survey, 2013","Expenditure survey/budget survey (ES/BS), 2014",,,2012,2015
262,ZAF,South Africa,South Africa,Republic of South Africa,ZA,South African rand,Fiscal year end: March 31; reporting period fo...,Sub-Saharan Africa,Upper middle income,ZA,...,General trade system,Consolidated central government,Special Data Dissemination Standard (SDDS),2011,"Demographic and Health Survey, 2016","Expenditure survey/budget survey (ES/BS), 2014/15",,2007,2010,2018
263,ZMB,Zambia,Zambia,Republic of Zambia,ZM,New Zambian kwacha,National accounts data were rebased to reflect...,Sub-Saharan Africa,Lower middle income,ZM,...,General trade system,Budgetary central government,Enhanced General Data Dissemination System (e-...,2020 (expected),"Demographic and Health Survey, 2018","Integrated household survey (IHS), 2015",,,1994,2018


In [30]:
# Write the listed columns to the silver-layer path in Delta format, replacing any
# existing data and allowing the stored schema to be overwritten.
(
    genderStatsCountry_DeltaTable
    .select(
        "CountryCode",
        "ShortName",
        "TableName",
        "LongName",
        "2AplhaCode",
        "CurrencyUnit",
        "SpecialNotes",
        "Region",
        "IncomeGroup",
        "WB2Code",
        "NationalAccountsBaseYear",
        "NationalAccountsReferenceYear",
        "SNAPriceValuation",
        "LendingCategory",
        "OtherGroups",
        "SystemOfNationalAccounts",
        "BalanceOfPaymentsManualInUse",
        "ExternalDebtReportingStatus",
        "SystemOfTrade",
        "GovernmentAccountingConcenpt",
        "IMFDataDisseminationStandard",
        "LatestPopulationCensus",
        "LatestHouseholdSurvey",
        "SourceOfMostIncome",
        "VitalResgistrationComplete",
        "LatestAgriculturalCensus",
        "LatestIndustrialData",
        "LatestTradeData",
    )
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/genderStatsCountry_DeltaTable")
)