In [14]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import *

In [15]:
warehouse_location = 'hdfs://hdfs-nn:9000/TrabalhoPratico'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [16]:
Gender_StatsData = spark.read.format("delta").load("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/genderStatsCountry_DeltaTable")

In [17]:
Gender_StatsCountry = spark.read.format("delta").load("hdfs://hdfs-nn:9000/TrabalhoPratico/silver/genderStatsData_DeltaTable")

In [18]:
PaisEAnoData = Gender_StatsData.select("CountryCode","ShortName")

PaisEAnoCountry = Gender_StatsCountry.select("countryName","countryCode")

PaisEAnoData.show()

PaisEAnoCountry.show()

+-----------+--------------------+
|CountryCode|           ShortName|
+-----------+--------------------+
|        ABW|               Aruba|
|        AFE|Africa Eastern an...|
|        AFG|         Afghanistan|
|        AFW|Africa Western an...|
|        AGO|              Angola|
|        ALB|             Albania|
|        AND|             Andorra|
|        ARB|          Arab World|
|        ARE|United Arab Emirates|
|        ARG|           Argentina|
|        ARM|             Armenia|
|        ASM|      American Samoa|
|        ATG| Antigua and Barbuda|
|        AUS|           Australia|
|        AUT|             Austria|
|        AZE|          Azerbaijan|
|        BDI|             Burundi|
|        BEL|             Belgium|
|        BEN|               Benin|
|        BFA|        Burkina Faso|
+-----------+--------------------+
only showing top 20 rows

+--------------------+-----------+
|         countryName|countryCode|
+--------------------+-----------+
|Africa Eastern an...|       

In [19]:
joined_df = PaisEAnoData.join(
    PaisEAnoCountry,
    (PaisEAnoData["CountryCode"] == PaisEAnoCountry["countryCode"]) &
    (PaisEAnoData["ShortName"] == PaisEAnoCountry["countryName"]),
    "inner"
)

# Select the desired columns after the join
result_df = joined_df.select(
    PaisEAnoData["CountryCode"].alias("Code"),
    PaisEAnoData["ShortName"].alias("Name")
)

# Show the resulting DataFrame
result_df.show(100)

+----+--------------------+
|Code|                Name|
+----+--------------------+
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern an...|
| AFE|Africa Eastern

In [20]:
filtered_df = result_df.filter(result_df["Name"].like("%Europe%"))

# Show the countries with "Europe" in their name
filtered_df.show()

+----+--------------------+
|Code|                Name|
+----+--------------------+
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
| CEB|Central Europe an...|
+----+--------------------+
only showing top 20 rows



In [13]:
spark.stop()