In [1]:
import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("delta").master("spark://spark-master:7077") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.hadoop.fs.s3a.access.key","datalake") \
    .config("spark.hadoop.fs.s3a.secret.key","datalake") \
    .config("spark.hadoop.fs.s3a.endpoint","http://minio:9000") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

In [2]:
spark

In [3]:
df = spark.read.csv("s3a://delta-lake/results.csv",header=True)

In [4]:
df = df.withColumnRenamed('date','dt')

In [5]:
df.show()

+----------+----------------+-------------+--------------+--------------+--------------+---------+-------------------+-----------+
|        dt|   home_teamName|away_teamName|home_scoreHome|away_scoreAway|tournamentName| cityCity|     countryCountry|neutralTRUE|
+----------+----------------+-------------+--------------+--------------+--------------+---------+-------------------+-----------+
|1872-11-30|        Scotland|      England|             0|             0|      Friendly|  Glasgow|           Scotland|      FALSE|
|1873-03-08|         England|     Scotland|             4|             2|      Friendly|   London|            England|      FALSE|
|1874-03-07|        Scotland|      England|             2|             1|      Friendly|  Glasgow|           Scotland|      FALSE|
|1875-03-06|         England|     Scotland|             2|             2|      Friendly|   London|            England|      FALSE|
|1876-03-04|        Scotland|      England|             3|             0|      Frie

In [6]:
import uuid
myuuid = uuid.uuid4()
path = ("s3a://delta-lake/demo/{}").format(myuuid)
print(path)

s3a://delta-lake/demo/4b59799f-d8e8-486d-b201-4cc6a4ae0666


In [7]:
df.write.format("delta").mode('overwrite').save(path)

In [8]:
df.printSchema()

root
 |-- dt: string (nullable = true)
 |-- home_teamName: string (nullable = true)
 |-- away_teamName: string (nullable = true)
 |-- home_scoreHome: string (nullable = true)
 |-- away_scoreAway: string (nullable = true)
 |-- tournamentName: string (nullable = true)
 |-- cityCity: string (nullable = true)
 |-- countryCountry: string (nullable = true)
 |-- neutralTRUE: string (nullable = true)



In [10]:
deltaTable = DeltaTable.forPath(spark, path)

In [18]:
type(deltaTable)

delta.tables.DeltaTable

In [15]:
d = spark.read.format("delta").load(path)

In [16]:
d.show()

+----------+----------------+-------------+--------------+--------------+--------------+---------+-------------------+-----------+
|        dt|   home_teamName|away_teamName|home_scoreHome|away_scoreAway|tournamentName| cityCity|     countryCountry|neutralTRUE|
+----------+----------------+-------------+--------------+--------------+--------------+---------+-------------------+-----------+
|1872-11-30|        Scotland|      England|             0|             0|      Friendly|  Glasgow|           Scotland|      FALSE|
|1873-03-08|         England|     Scotland|             4|             2|      Friendly|   London|            England|      FALSE|
|1874-03-07|        Scotland|      England|             2|             1|      Friendly|  Glasgow|           Scotland|      FALSE|
|1875-03-06|         England|     Scotland|             2|             2|      Friendly|   London|            England|      FALSE|
|1876-03-04|        Scotland|      England|             3|             0|      Frie

In [17]:
type(d)

pyspark.sql.dataframe.DataFrame

In [11]:
deltaTable.toDF().printSchema()

root
 |-- dt: string (nullable = true)
 |-- home_teamName: string (nullable = true)
 |-- away_teamName: string (nullable = true)
 |-- home_scoreHome: string (nullable = true)
 |-- away_scoreAway: string (nullable = true)
 |-- tournamentName: string (nullable = true)
 |-- cityCity: string (nullable = true)
 |-- countryCountry: string (nullable = true)
 |-- neutralTRUE: string (nullable = true)



In [20]:
d.filter(d.countryCountry == 'Brazil').show()

+----------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+-----------+
|        dt|home_teamName|away_teamName|home_scoreHome|away_scoreAway|tournamentName|      cityCity|countryCountry|neutralTRUE|
+----------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+-----------+
|1919-05-11|       Brazil|        Chile|             6|             0|  Copa América|Rio de Janeiro|        Brazil|      FALSE|
|1919-05-13|    Argentina|      Uruguay|             2|             3|  Copa América|Rio de Janeiro|        Brazil|       TRUE|
|1919-05-17|        Chile|      Uruguay|             0|             2|  Copa América|Rio de Janeiro|        Brazil|       TRUE|
|1919-05-18|       Brazil|    Argentina|             3|             1|  Copa América|Rio de Janeiro|        Brazil|      FALSE|
|1919-05-22|    Argentina|        Chile|             4|             1|  Copa América|Rio de Janeiro|    

In [23]:
code = spark.read.csv('s3a://camada-bronze/country_code.csv',header=True)

In [24]:
code.show()

+-------------------+-------+-------+------------+-------------+--------+--------------------+-------------------+-----------+---------------+------------------------+
|               name|alpha-2|alpha-3|country-code|   iso_3166-2|  region|          sub-region|intermediate-region|region-code|sub-region-code|intermediate-region-code|
+-------------------+-------+-------+------------+-------------+--------+--------------------+-------------------+-----------+---------------+------------------------+
|        Afghanistan|     AF|    AFG|         004|ISO 3166-2:AF|    Asia|       Southern Asia|               null|        142|            034|                    null|
|      Åland Islands|     AX|    ALA|         248|ISO 3166-2:AX|  Europe|     Northern Europe|               null|        150|            154|                    null|
|            Albania|     AL|    ALB|         008|ISO 3166-2:AL|  Europe|     Southern Europe|               null|        150|            039|                  

In [26]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable.alias("delta") \
  .merge(
    code.alias("code"),
    "delta.countryCountry = code.name") \
  .whenNotMatchedInsert(values = { "code": col("code.alpha-3") }) \
  .execute()

AnalysisException: cannot resolve code in INSERT clause given columns delta.dt, delta.home_teamName, delta.away_teamName, delta.home_scoreHome, delta.away_scoreAway, delta.tournamentName, delta.cityCity, delta.countryCountry, delta.neutralTRUE

In [9]:
from delta.tables import *
from pyspark.sql.functions import *

# Update every even value by adding 100 to it
deltaTable.update(
  condition = expr("id % 2 == 0"),
  set = { "id": expr("id + 100") })

# Delete every even value
deltaTable.delete(condition = expr("id % 2 == 0"))

# Upsert (merge) new data
newData = spark.range(0, 20)

deltaTable.alias("delta") \
  .merge(
    code.alias("code"),
    "delta.countryCountry = code.name") \
  .whenMatchedUpdate(set = { "id": col("newData.id") }) \
  .whenNotMatchedInsert(values = { "id": col("newData.id") }) \
  .execute()

deltaTable.toDF().show()

AnalysisException: Column 'BR' does not exist. Did you mean one of the following? [date, cityCity, neutralTRUE, away_teamName, home_teamName, away_scoreAway, countryCountry, home_scoreHome, tournamentName]; line 1 pos 0;
'DeltaUpdateTable [countryCountry#1550], ['BR], (countryCountry#1550 = Brazil)
+- Relation [date#1543,home_teamName#1544,away_teamName#1545,home_scoreHome#1546,away_scoreAway#1547,tournamentName#1548,cityCity#1549,countryCountry#1550,neutralTRUE#1551] parquet


In [6]:
data = spark.range(0, 5)
data.write.format("delta").save("s3a://tmp/delta-table")

ConnectionRefusedError: [Errno 111] Connection refused

In [3]:
spark.sql('show databases').show()


+---------+
|namespace|
+---------+
|  default|
+---------+

