# Imports

In [1]:
import pyspark.sql.functions as psf
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.types import StringType, StructType

# Session

In [2]:

class SESSION_TYPES:
    """Namespace of string identifiers for the kinds of Spark session available."""

    # Key under which the `build_local_session` factory is registered.
    LOCAL_SESSION = "local"

def build_local_session():
    """Create (or reuse) the Spark session used by this notebook.

    The driver host is pinned to ``localhost`` and event logging is switched on,
    with both the event-log directory and the history log directory set to ``/tmp``.

    Returns:
        The ``SparkSession`` handed back by ``getOrCreate()`` for the
        application name ``Test-multimno-JL``.
    """
    # Applied in this exact order, one ``config`` call per entry.
    settings = [
        ("spark.driver.host", "localhost"),
        ("spark.eventLog.enabled", "true"),
        ("spark.eventLog.dir", "/tmp"),
        ("spark.history.fs.logDirectory", "/tmp"),
    ]

    builder = SparkSession.builder
    for key, value in settings:
        builder = builder.config(key, value)

    return builder.appName("Test-multimno-JL").getOrCreate()

# Dispatch table: session-type identifier -> zero-argument factory building that session.
build_session = {
    SESSION_TYPES.LOCAL_SESSION: build_local_session,
}

In [3]:
# Pick the session type, then build the Spark session through the dispatch table.
SESSION = SESSION_TYPES.LOCAL_SESSION
spark = build_session[SESSION]()

# The session already exposes its SparkContext as an attribute. Calling the
# `getOrCreate()` classmethod on that instance merely returned the same active
# context again, so the attribute is used directly.
sc = spark.sparkContext
sc

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/08 13:39:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Variables

In [4]:
# Text file holding the origin/destination pairs that the Extract step reads.
input_path = "/opt/data/input/od_pairs.txt"
# Parquet location the Load step writes to (and later reads back from).
output_path = "/opt/data/output/od_pairs.parquet"

# Extract

In [5]:
# Explicit schema: both columns are read as nullable strings.
schema = (
    StructType()
    .add("origin", StringType(), True)
    .add("destination", StringType(), True)
)

# The input is pipe-delimited and its first line is a header row.
reader = spark.read.options(delimiter='|', header=True).schema(schema)
df = reader.csv(input_path)

df.show()

+---------+-----------+
|   origin|destination|
+---------+-----------+
|1:10.2:21|  1:10.2:23|
|1:10.2:21|  1:10.2:24|
|1:10.2:21|  1:10.2:25|
|1:10.2:21|  1:10.2:26|
+---------+-----------+



                                                                                

# Transform

In [6]:
# Add a constant-valued column named 'test' to every row.
test_column = psf.lit('patata')
df = df.withColumn('test', test_column)

df.show()

+---------+-----------+------+
|   origin|destination|  test|
+---------+-----------+------+
|1:10.2:21|  1:10.2:23|patata|
|1:10.2:21|  1:10.2:24|patata|
|1:10.2:21|  1:10.2:25|patata|
|1:10.2:21|  1:10.2:26|patata|
+---------+-----------+------+



# Load

In [7]:
# Persist the frame as Parquet, replacing whatever already exists at the target path.
df.write.parquet(output_path, mode='overwrite')

In [8]:
# Read the Parquet output back in and display it to check what was written.
parquet_reader = spark.read
df = parquet_reader.parquet(output_path)

df.show()

+---------+-----------+------+
|   origin|destination|  test|
+---------+-----------+------+
|1:10.2:21|  1:10.2:23|patata|
|1:10.2:21|  1:10.2:24|patata|
|1:10.2:21|  1:10.2:25|patata|
|1:10.2:21|  1:10.2:26|patata|
+---------+-----------+------+

