In [1]:
import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("delta").master("spark://spark-master:7077") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.hadoop.fs.s3a.access.key","datalake") \
    .config("spark.hadoop.fs.s3a.secret.key","datalake") \
    .config("spark.hadoop.fs.s3a.endpoint","http://minio:9000") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")


spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

In [2]:
spark

In [3]:
df = spark.read.csv("s3a://delta-lake/results.csv",header=True)

In [4]:
df = df.withColumnRenamed('date','dt')

In [5]:
df.show()

+----------+----------------+-------------+--------------+--------------+--------------+---------+-------------------+-----------+
|        dt|   home_teamName|away_teamName|home_scoreHome|away_scoreAway|tournamentName| cityCity|     countryCountry|neutralTRUE|
+----------+----------------+-------------+--------------+--------------+--------------+---------+-------------------+-----------+
|1872-11-30|        Scotland|      England|             0|             0|      Friendly|  Glasgow|           Scotland|      FALSE|
|1873-03-08|         England|     Scotland|             4|             2|      Friendly|   London|            England|      FALSE|
|1874-03-07|        Scotland|      England|             2|             1|      Friendly|  Glasgow|           Scotland|      FALSE|
|1875-03-06|         England|     Scotland|             2|             2|      Friendly|   London|            England|      FALSE|
|1876-03-04|        Scotland|      England|             3|             0|      Frie

In [8]:
import uuid

myuuid = uuid.uuid4()

path = ("s3a://delta-lake/demo/{}").format(myuuid)
print(path)

s3a://delta-lake/demo/bf33ca50-c826-4234-a055-a185c76acfab


In [9]:
df.write.format("delta").mode('overwrite').save(path)

In [7]:
df.printSchema()

root
 |-- dt: string (nullable = true)
 |-- home_teamName: string (nullable = true)
 |-- away_teamName: string (nullable = true)
 |-- home_scoreHome: string (nullable = true)
 |-- away_scoreAway: string (nullable = true)
 |-- tournamentName: string (nullable = true)
 |-- cityCity: string (nullable = true)
 |-- countryCountry: string (nullable = true)
 |-- neutralTRUE: string (nullable = true)



In [8]:
deltaTable = DeltaTable.forPath(spark, "s3a://delta-lake/demo4")

In [9]:
deltaTable.toDF().printSchema()

root
 |-- dt: string (nullable = true)
 |-- home_teamName: string (nullable = true)
 |-- away_teamName: string (nullable = true)
 |-- home_scoreHome: string (nullable = true)
 |-- away_scoreAway: string (nullable = true)
 |-- tournamentName: string (nullable = true)
 |-- cityCity: string (nullable = true)
 |-- countryCountry: string (nullable = true)
 |-- neutralTRUE: string (nullable = true)



In [9]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")

# Update every even value by adding 100 to it
deltaTable.update(
  condition = "countryCountry == 'Brazil'",
  set = { "countryCountry": expr("BR") })

# Delete every even value
deltaTable.delete(condition = expr("date == '1872-11-30'"))

# Upsert (merge) new data
newData = spark.range(0, 20)

deltaTable.alias("oldData") \
  .merge(
    newData.alias("newData"),
    "oldData.countryCountry = newData.countryCountry") \
  .whenMatchedUpdate(set = { "id": col("newData.countryCountry") }) \
  .whenNotMatchedInsert(values = { "countryCountry": col("newData.countryCountry") }) \
  .execute()

deltaTable.toDF().show()

AnalysisException: Column 'BR' does not exist. Did you mean one of the following? [date, cityCity, neutralTRUE, away_teamName, home_teamName, away_scoreAway, countryCountry, home_scoreHome, tournamentName]; line 1 pos 0;
'DeltaUpdateTable [countryCountry#1550], ['BR], (countryCountry#1550 = Brazil)
+- Relation [date#1543,home_teamName#1544,away_teamName#1545,home_scoreHome#1546,away_scoreAway#1547,tournamentName#1548,cityCity#1549,countryCountry#1550,neutralTRUE#1551] parquet


In [6]:
data = spark.range(0, 5)
data.write.format("delta").save("s3a://tmp/delta-table")

ConnectionRefusedError: [Errno 111] Connection refused

In [3]:
spark.sql('show databases').show()


+---------+
|namespace|
+---------+
|  default|
+---------+

