# Convert from CSV to Delta Lake

In [1]:
from delta.tables import *
from pyspark.sql.functions import *
import pyspark
from delta import *

# Read a data file (e.g. CSV) with PySpark
def Read_csv_Data(file_path, datatype):
    """Load a data file into a Spark DataFrame using a Delta-enabled session.

    The path handed to Spark is ``file_path + datatype`` (plain string
    concatenation; no separator is inserted), and ``datatype`` is also used
    as the Spark reader format.  For example
    ``Read_csv_Data("data/people.", "csv")`` loads ``data/people.csv`` with
    the ``csv`` reader.

    Args:
        file_path: Path prefix of the file; ``datatype`` is appended to it
            verbatim to form the final path.
        datatype: Spark data source format name (e.g. ``"csv"``); it doubles
            as the suffix appended to ``file_path``.

    Returns:
        The DataFrame returned by ``spark.read...load()``, read with the
        ``header`` and ``inferSchema`` options both set to ``"true"``.
    """
    # Imported explicitly so the function does not depend on a wildcard
    # import elsewhere in the notebook happening to expose these names.
    from delta import configure_spark_with_delta_pip
    from pyspark.sql import SparkSession

    # Enable the Delta Lake SQL extension and catalog so that DataFrames
    # produced by this session can be written with format("delta").
    builder = (
        SparkSession.builder
        .appName("SparkSQLExampleApp")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog",
                "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    )
    spark = configure_spark_with_delta_pip(builder).getOrCreate()

    data_file = file_path + datatype

    return (
        spark.read.format(datatype)
        .option("inferSchema", "true")
        .option("header", "true")
        .load(data_file)
    )

def convert_csv_delta(df, save_delta_name):
    """Write ``df`` in Delta format to ``save_delta_name``.

    Args:
        df: Object exposing a ``write`` attribute with ``format``/``save``
            methods (a Spark DataFrame in this notebook).
        save_delta_name: Destination passed unchanged to ``save``.

    Returns:
        None.
    """
    delta_writer = df.write.format("delta")
    delta_writer.save(save_delta_name)

### s3fs를 이용한 S3에서 DeltaTable 읽기

In [3]:
import s3fs
from deltalake import DeltaTable

# S3 filesystem handle; it is created with no explicit arguments, so
# credentials/region presumably come from the environment — TODO confirm.
fs = s3fs.S3FileSystem()

# Location of the Delta table in S3.
delta_table_path = "s3a://donghee-deltalake-test/delta-table-quickstart"

# Open the table through the s3fs filesystem created above.
delta_table = DeltaTable(delta_table_path, file_system=fs)
delta_table

<deltalake.deltatable.DeltaTable at 0x7f35f80ae550>

In [4]:
# Show the class of the object returned by DeltaTable(...).
type(delta_table)

deltalake.deltatable.DeltaTable

### S3 - Time Travel

In [14]:
# Time travel on the S3 table: request versions 1 and 3.
# NOTE(review): as_version presumably returns the table pinned to the given
# version — confirm against the installed deltalake API.
delta_table_version_1 = delta_table.as_version(1)
delta_table_version_3 = delta_table.as_version(3)

# Convert each result to a pandas DataFrame.
# NOTE(review): s3_df_2 holds version 3 (not version 2) — consider renaming.
s3_df_1 = delta_table_version_1.to_pandas()
s3_df_2 = delta_table_version_3.to_pandas()

In [17]:
# Display the DataFrame built from delta_table_version_1.
s3_df_1

Unnamed: 0,id
0,39
1,33
2,35
3,37
4,31


In [18]:
# Display the DataFrame built from delta_table_version_3.
s3_df_2

Unnamed: 0,id
0,39
1,33
2,35
3,37
4,31


### local에서 DeltaTable 읽기

In [7]:
from delta.tables import *
from pyspark.sql import SparkSession

# Session builder with the Delta Lake SQL extension and catalog enabled.
builder = (
    SparkSession.builder
    .appName("MyApp2")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta helper finish configuring the builder, then start
# (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Accepted levels: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
spark.sparkContext.setLogLevel("WARN")

your 131072x1 screen size is bogus. expect trouble


24/02/15 02:35:07 WARN Utils: Your hostname, DESKTOP-JJQA3IT resolves to a loopback address: 127.0.1.1; using 172.25.190.30 instead (on interface eth0)
24/02/15 02:35:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/donghee/work/deltalake1/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/donghee/.ivy2/cache
The jars for the packages stored in: /home/donghee/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a2375b77-8fba-4bd1-b872-d9f5718de0f1;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.2.0 in central
	found io.delta#delta-storage;2.2.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 167ms :: artifacts dl 8ms
	:: modules in use:
	io.delta#delta-core_2.12;2.2.0 from central in [default]
	io.delta#delta-storage;2.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |  

24/02/15 02:35:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
# Location of the Delta table on the local filesystem.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
local_table_path = "/home/donghee/work/deltalakeproject/data/delta-table-quickstart"

In [10]:
# Load the Delta table at local_table_path (no version option is set here)
# and show the type of the resulting object.
df = spark.read.format("delta").load(local_table_path)
type(df)

pyspark.sql.dataframe.DataFrame

### Local - Time Travel

In [15]:
# Time travel on the local table via the "versionAsOf" read option.
# NOTE(review): df1 holds version 2 (not version 1) — consider renaming.
df0 = spark.read.format("delta").option("versionAsOf", 0).load(local_table_path)
df1 = spark.read.format("delta").option("versionAsOf", 2).load(local_table_path)

In [16]:
# Print both snapshots (versionAsOf 0, then versionAsOf 2) for comparison.
df0.show()
df1.show()

+---+
| id|
+---+
|  2|
|  4|
|  0|
|  3|
|  1|
+---+

+---+
| id|
+---+
| 37|
| 35|
| 39|
| 33|
|138|
|136|
|130|
|132|
| 31|
|134|
+---+

