### O.S. process

In [1]:
import warnings
warnings.filterwarnings('ignore')

from os import environ

!host=$(hostname) 
!ip=$(ifconfig | grep 'inet ' | grep -v '127.0.0.1' | cut -c 7-17)
!echo "hostname: $(hostname)"

# create path if not exists
#!mkdir -p ~/notebooks/data/

# download if not exists
#!wget -nc https://files.grouplens.org/datasets/movielens/ml-25m.zip -P ~/notebooks/data/

# unzip if not exists
#!unzip -n ~/notebooks/data/ml-25m.zip -d ~/notebooks/data/

#!ls -las /home/admin/notebooks/data/ml-25m

# check environment variables: JAVA_HOME
!export JAVA_HOME=/opt/jdk
environ["JAVA_HOME"] = "/opt/jdk"
!echo "- JAVA_HOME:$JAVA_HOME"

# check environment variables: PYSPARK_SUBMIT_ARGS
!export PYSPARK_SUBMIT_ARGS='--packages io.delta:delta-core_2.12:2.1.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" pyspark-shell'
environ["PYSPARK_SUBMIT_ARGS"]='--packages io.delta:delta-core_2.12:2.1.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" pyspark-shell'
!echo "- PYSPARK_SUBMIT_ARGS:$PYSPARK_SUBMIT_ARGS"

# check environment variables: PATH
!export PATH=$PATH:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/jdk:/opt/jdk/bin
environ["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/jdk:/opt/jdk/bin"
!echo "- PATH:$PATH"

!pip freeze

# check java version
!java -version

hostname: jupyter-hub
- JAVA_HOME:/opt/jdk
- PYSPARK_SUBMIT_ARGS:--packages io.delta:delta-core_2.12:2.1.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" pyspark-shell
- PATH:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/jdk:/opt/jdk/bin
dbus-python==1.2.18
delta-spark==2.1.0
gyp==0.1
importlib-metadata==5.0.0
numpy==1.23.4
py4j==0.10.9.5
pyarrow==9.0.0
PyGObject==3.42.1
pyspark==3.3.0
zipp==3.9.0
java version "1.8.0_341"
Java(TM) SE Runtime Environment (build 1.8.0_341-b10)
Java HotSpot(TM) 64-Bit Server VM (build 25.341-b10, mixed mode)


### Function to reduce memory usage in Pandas DataFrame

In [4]:
import numpy as np

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%\n'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

### Connect from Apache Spark Cluster - without Delta Lake

In [5]:
#from os import environ
#environ["SPARK_HOME"] = '/opt/apache-spark'
#environ["PATH"] = '$PATH:/opt/jdk:/opt/jdk/bin:/opt/apache-spark:/opt/apache-spark/bin:/opt/apache-spark/sbin'

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, DataFrameReader

#If you need to stop SparkContext (sc) or SparkSession
if 's_session' in locals():
    s_session.stop()
if 's_context' in locals():
    s_context.stop()

conf = SparkConf()

conf.setAppName("app_data_lake") \
.setMaster("spark://spark-master:7077")
#.setSparkHome("/opt/apache-spark")

s_context = SparkContext(conf=conf).getOrCreate()
s_session = SparkSession(sparkContext=s_context)
#builder = s_session.builder

:: loading settings :: url = jar:file:/srv/jupyterhub/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/admin/.ivy2/cache
The jars for the packages stored in: /home/admin/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-90bea128-ec4f-4635-bbc3-dedb762cec9c;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 134ms :: artifacts dl 7ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evict

22/10/25 13:36:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Read .CSV from SFTP and load into a Pandas DataFrame

In [6]:
import pysftp
from pandas import read_csv as pandas_read_csv
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# SFTP config connection
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
environ["FTP_HOST"] = 'sftp-01' # sftp-01 = 172.19.0.15
environ["FTP_PORT"] = '2222'
environ["FTP_USER"] = 'admin'
environ["FTP_PASS"] = 'admin'

# CSV schema
schema_doc = {
                "tags.csv": StructType([StructField("userId", IntegerType(), True),
                                     StructField("movieId", IntegerType(), True),
                                     StructField("tag", StringType(), True),
                                     StructField("timestamp", IntegerType(), True)]),
                "ratings.csv": StructType([StructField("userId", IntegerType(), True),
                                     StructField("movieId", IntegerType(), True),
                                     StructField("rating", FloatType(), True),
                                     StructField("timestamp", IntegerType(), True)])
                }

chunksize=500000
sftp_file="tags.csv" # 1.093.000 lines
#sftp_file="ratings.csv" # 25.000.000 lines

# open SFTP connection
with pysftp.Connection(environ["FTP_HOST"], port = int(environ["FTP_PORT"]), username = environ["FTP_USER"], password = environ["FTP_PASS"], cnopts=cnopts) as connection:
    print("Connection succesfully established…\n")
    # open the file
    with connection.open(remote_file = f"/data/{sftp_file}", mode='r') as file:

        i = 1
        for reader in pandas_read_csv(file, sep=',', chunksize=chunksize):
            chnk = "0 until " + str(chunksize) if i==1 else str(((chunksize * i) - chunksize) + 1) + " until " + str(chunksize * i)
            print(f"Chunksize block = line {chnk}")
            reader=reduce_mem_usage(df=reader)
            if i == 1:
                data = s_session.createDataFrame(data=reader, schema=schema_doc.get(sftp_file, sftp_file.split('.')[0]))
            else:
                new_data = s_session.createDataFrame(data=reader, schema=schema_doc.get(sftp_file, sftp_file.split('.')[0]))
                data = data.union(new_data)
                del new_data
            i = i + 1

connection.close()

Connection succesfully established…

Chunksize block = line 0 until 500000
Memory usage of dataframe is 15.26 MB
Memory usage after optimization is: 8.99 MB
Decreased by 41.1%

Chunksize block = line 500001 until 1000000
Memory usage of dataframe is 15.26 MB
Memory usage after optimization is: 8.97 MB
Decreased by 41.2%

Chunksize block = line 1000001 until 1500000
Memory usage of dataframe is 2.85 MB
Memory usage after optimization is: 1.88 MB
Decreased by 33.9%



### Resume of data

In [7]:
print( "- sparkSession: ", data.sparkSession, '\n' )
print("- Object: ", type(data), "\n")
print( "- schema: ", data.schema, '\n' )
print( "- printSchema: ", data.printSchema(), '\n' )
print( "- isStreaming: ", data.isStreaming, '\n' )
print( "- columns: ", data.columns, '\n' )
print( "- dtypes: ", data.dtypes, '\n' )
print( "- head: ", data.head(10), '\n' )
print( "- show: ", data.show(10), '\n' )
print( "- isEmpty: ", data.isEmpty(), '\n' )
print("- cache", data.cache(), '\n' ) # Persists the DataFrame with the default storage level (MEMORY_AND_DISK)
print( "- persist: ", data.persist(), '\n' ) # Sets the storage level to persist the contents of the DataFrame across operations after the first time it is computed.
print( "- storageLevel: ", data.storageLevel, '\n' )
print( "- count: ", data.count(), '\n' )
if sftp_file=="ratings.csv":
    print( "- correlation between rating and timestamp: ", data.corr("rating", "timestamp"), '\n' )
    print( "- covariance between rating and timestamp: ", data.cov("rating", "timestamp"), '\n' ) # Calculate the sample covariance for the given columns, specified by their names, as a double value
    print( "- descriptive statistics: ", data.describe(["userId", "movieId", "rating", "timestamp"]).show(), '\n' )
print( "- summary: ", data.summary().show(), '\n' ) # Computes specified statistics for numeric and string columns

- sparkSession:  <pyspark.sql.session.SparkSession object at 0x7f593c54beb0> 

- Object:  <class 'pyspark.sql.dataframe.DataFrame'> 

- schema:  StructType([StructField('userId', IntegerType(), True), StructField('movieId', IntegerType(), True), StructField('tag', StringType(), True), StructField('timestamp', IntegerType(), True)]) 

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: integer (nullable = true)

- printSchema:  None 

- isStreaming:  False 

- columns:  ['userId', 'movieId', 'tag', 'timestamp'] 

- dtypes:  [('userId', 'int'), ('movieId', 'int'), ('tag', 'string'), ('timestamp', 'int')] 



                                                                                

- head:  [Row(userId=3, movieId=260, tag='classic', timestamp=1439472355), Row(userId=3, movieId=260, tag='sci-fi', timestamp=1439472256), Row(userId=4, movieId=1732, tag='dark comedy', timestamp=1573943598), Row(userId=4, movieId=1732, tag='great dialogue', timestamp=1573943604), Row(userId=4, movieId=7569, tag="so bad it's good", timestamp=1573943455), Row(userId=4, movieId=44665, tag='unreliable narrators', timestamp=1573943619), Row(userId=4, movieId=115569, tag='tense', timestamp=1573943077), Row(userId=4, movieId=115713, tag='artificial intelligence', timestamp=1573942979), Row(userId=4, movieId=115713, tag='philosophical', timestamp=1573943033), Row(userId=4, movieId=115713, tag='tense', timestamp=1573943042)] 



                                                                                

+------+-------+--------------------+----------+
|userId|movieId|                 tag| timestamp|
+------+-------+--------------------+----------+
|     3|    260|             classic|1439472355|
|     3|    260|              sci-fi|1439472256|
|     4|   1732|         dark comedy|1573943598|
|     4|   1732|      great dialogue|1573943604|
|     4|   7569|    so bad it's good|1573943455|
|     4|  44665|unreliable narrators|1573943619|
|     4| 115569|               tense|1573943077|
|     4| 115713|artificial intell...|1573942979|
|     4| 115713|       philosophical|1573943033|
|     4| 115713|               tense|1573943042|
+------+-------+--------------------+----------+
only showing top 10 rows

- show:  None 

- isEmpty:  False 

- cache DataFrame[userId: int, movieId: int, tag: string, timestamp: int] 

22/10/25 13:37:33 WARN CacheManager: Asked to cache already cached data.
- persist:  DataFrame[userId: int, movieId: int, tag: string, timestamp: int] 

- storageLevel:  Disk M

                                                                                

- count:  1093360 



[Stage 8:>                                                          (0 + 1) / 1]

+-------+-----------------+------------------+--------------------+--------------------+
|summary|           userId|           movieId|                 tag|           timestamp|
+-------+-----------------+------------------+--------------------+--------------------+
|  count|          1093360|           1093360|             1093360|             1093360|
|   mean|67590.22463324065|  58492.7644389771|                 NaN|  1.43011549764337E9|
| stddev|51521.13756056978|59687.312817478196|                 NaN|1.1773844833352971E8|
|    min|                3|                 1| Alexander Skarsgård|          1135429210|
|    25%|            15204|              3504|                 3.0|          1339252662|
|    50%|            62199|             45928|                 3.5|          1468921818|
|    75%|           113652|            102903|              1929.0|          1527402191|
|    max|           162534|            209063|          카운트다운|          1574316696|
+-------+-----------------

                                                                                

### Write DataFrame in HDFS

In [8]:
data.write.csv("hdfs://hdpmaster:9000/users/hduser/teste1.csv", header=True, mode="ignore")
data.write.parquet("hdfs://hdpmaster:9000/users/hduser/teste1.parquet", mode="ignore")

### Read data from HDFS

In [9]:
df_load_csv = s_session.read.csv("hdfs://hdpmaster:9000/users/hduser/teste1.csv", header='true', inferSchema='true')
df_load_parquet = s_session.read.parquet("hdfs://hdpmaster:9000/users/hduser/teste1.parquet")

                                                                                

In [10]:
print("CSV FILE:", "\n")
print( "- sparkSession: ", df_load_csv.sparkSession, '\n' )
print("- Object: ", type(df_load_csv), "\n")
print( "- schema: ", df_load_csv.schema, '\n' )
print( "- printSchema: ", df_load_csv.printSchema(), '\n' )
print( "- isStreaming: ", df_load_csv.isStreaming, '\n' )
print( "- columns: ", df_load_csv.columns, '\n' )
print( "- dtypes: ", df_load_csv.dtypes, '\n' )
print( "- head: ", df_load_csv.head(10), '\n' )
print( "- show: ", df_load_csv.show(10), '\n' )
print("##########################################################")
print("PARQUET FILE:", "\n")
print( "- sparkSession: ", df_load_parquet.sparkSession, '\n' )
print("- Object: ", type(df_load_parquet), "\n")
print( "- schema: ", df_load_parquet.schema, '\n' )
print( "- printSchema: ", df_load_parquet.printSchema(), '\n' )
print( "- isStreaming: ", df_load_parquet.isStreaming, '\n' )
print( "- columns: ", df_load_parquet.columns, '\n' )
print( "- dtypes: ", df_load_parquet.dtypes, '\n' )
print( "- head: ", df_load_parquet.head(10), '\n' )
print( "- show: ", df_load_parquet.show(10), '\n' )

CSV FILE: 

- sparkSession:  <pyspark.sql.session.SparkSession object at 0x7f593c54beb0> 

- Object:  <class 'pyspark.sql.dataframe.DataFrame'> 

- schema:  StructType([StructField('userId', IntegerType(), True), StructField('movieId', IntegerType(), True), StructField('tag', StringType(), True), StructField('timestamp', IntegerType(), True)]) 

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: integer (nullable = true)

- printSchema:  None 

- isStreaming:  False 

- columns:  ['userId', 'movieId', 'tag', 'timestamp'] 

- dtypes:  [('userId', 'int'), ('movieId', 'int'), ('tag', 'string'), ('timestamp', 'int')] 

- head:  [Row(userId=61624, movieId=193477, tag='production design', timestamp=1541821771), Row(userId=61624, movieId=193477, tag='scotland', timestamp=1541821809), Row(userId=61624, movieId=193477, tag='sword and sandals', timestamp=1541821819), Row(userId=61624, movieId=193477, tag='tone', 

                                                                                

- head:  [Row(userId=61624, movieId=193477, tag='production design', timestamp=1541821771), Row(userId=61624, movieId=193477, tag='scotland', timestamp=1541821809), Row(userId=61624, movieId=193477, tag='sword and sandals', timestamp=1541821819), Row(userId=61624, movieId=193477, tag='tone', timestamp=1541872454), Row(userId=61624, movieId=200814, tag='Kiernan Shipka', timestamp=1554957611), Row(userId=61624, movieId=200814, tag='script', timestamp=1554957625), Row(userId=61624, movieId=200814, tag='Stanley Tucci', timestamp=1554957622), Row(userId=61626, movieId=724, tag='magic', timestamp=1375489450), Row(userId=61626, movieId=2810, tag='Satoshi Kon', timestamp=1375246095), Row(userId=61626, movieId=4370, tag='dystopia', timestamp=1375246046)] 

+------+-------+-----------------+----------+
|userId|movieId|              tag| timestamp|
+------+-------+-----------------+----------+
| 61624| 193477|production design|1541821771|
| 61624| 193477|         scotland|1541821809|
| 61624| 193

### Create a temporary table

In [11]:
s_session.sql("CREATE TEMPORARY VIEW teste USING parquet OPTIONS (path \"hdfs://hdpmaster:9000/users/hduser/teste1.parquet\")")

DataFrame[]

In [12]:
s_session.sql("select * from teste limit 10").show(truncate=False)



+------+-------+-----------------+----------+
|userId|movieId|tag              |timestamp |
+------+-------+-----------------+----------+
|61624 |193477 |production design|1541821771|
|61624 |193477 |scotland         |1541821809|
|61624 |193477 |sword and sandals|1541821819|
|61624 |193477 |tone             |1541872454|
|61624 |200814 |Kiernan Shipka   |1554957611|
|61624 |200814 |script           |1554957625|
|61624 |200814 |Stanley Tucci    |1554957622|
|61626 |724    |magic            |1375489450|
|61626 |2810   |Satoshi Kon      |1375246095|
|61626 |4370   |dystopia         |1375246046|
+------+-------+-----------------+----------+



                                                                                