In [None]:
from os.path import abspath

from pyspark.sql import SparkSession
from  pyspark.sql.catalog import Catalog
from pyspark.sql.functions import *
from pyspark.sql.types import LongType, StringType, DateType, IntegerType, FloatType, StructField, StructType
from pyspark.sql.functions import udf
import pandas as pd



# --- Paths and connection settings ------------------------------------------
# Hive warehouse directory, resolved against the current working directory.
warehousePath = abspath('spark_database')
# Local dataset roots (presumably checkouts of the two Spark books' sample data).
sparkv2path = '/home/doug/ProjetosEstudo/LearningSparkV2/databricks-datasets/learning-spark-v2/'
sparkdefguidepath = '/home/doug/ProjetosEstudo/Spark-The-Definitive-Guide/data/'
master = "local[*]"
worker = "172.28.170.236:37969"  # not passed to the session builder below

# --- Spark session -----------------------------------------------------------
# Hive-backed catalog with the warehouse stored under `warehousePath`.
spark = (
    SparkSession.builder
    .appName('SparkSQLII')
    .master(master)
    .config("spark.sql.warehouse.dir", warehousePath)
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.legacy.createHiveTableByDefault", "false")
    .enableHiveSupport()
    .getOrCreate()
)

spark
# Folder holding the flight CSV files read in a later cell.
sampledata = abspath('/mnt/d/linux/datasource/data/flight-data/csv')

In [None]:
# Smoke test: run a trivial query to confirm the session can execute SQL.
df = spark.sql(sqlQuery='select 1')
df.show()

In [None]:
# Header of the source CSV, for reference:
# InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
sourceschema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ('InvoiceNo', IntegerType()),
        ('StockCode', StringType()),
        ('Description', StringType()),
        ('Quantity', IntegerType()),
        ('InvoiceDate', StringType()),
        ('UnitPrice', FloatType()),
        ('CustomerID', IntegerType()),
        ('Country', StringType()),
    )
])

# Read the CSV against the explicit schema, then give every column its
# lowercase working name.
retailframe = (
    spark.read
    .csv(
        '/mnt/d/linux/datasource/data/retail-data/all/online-retail-dataset.csv',
        schema=sourceschema,
        header=True,
        sep=',',
        enforceSchema=False,
    )
    .withColumnRenamed('InvoiceNo', 'invoiceid')
    .withColumnRenamed('StockCode', 'stockcode')
    .withColumnRenamed('Description', 'desc')
    .withColumnRenamed('Quantity', 'quantity')
    .withColumnRenamed('InvoiceDate', 'orderdate')
    .withColumnRenamed('UnitPrice', 'unitprice')
    .withColumnRenamed('CustomerID', 'customerid')
    .withColumnRenamed('Country', 'country')
)

# Make the frame queryable from SQL.
retailframe.createOrReplaceTempView('vwretailframe')

In [None]:
# All three flight columns are declared nullable.
sampleschema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('DEST_COUNTRY_NAME', StringType()),
        ('ORIGIN_COUNTRY_NAME', StringType()),
        ('count', IntegerType()),
    )
])

# Load every CSV under `sampledata` and shorten the two country column names.
sampleflight = (
    spark.read
    .csv(f'{sampledata}/*.csv', header=True, schema=sampleschema)
    .withColumnRenamed('DEST_COUNTRY_NAME', 'destine')
    .withColumnRenamed('ORIGIN_COUNTRY_NAME', 'origin')
)

# Make the frame queryable from SQL.
sampleflight.createOrReplaceTempView("vflight")

In [None]:
def flightcount(frame:str, columns:str, quantity:str, country: str):
    """Sum a quantity column over the rows of a table/view that match one country.

    Builds and runs a Spark SQL query that keeps the rows of ``frame`` where
    ``columns`` equals ``country``, groups by ``columns`` and sums ``quantity``.

    Args:
        frame: Name of the table or temp view to query. Interpolated into the
            SQL text as an identifier, so it must be a trusted name.
        columns: Name of the column holding the country; used for the filter,
            the grouping and the first output column. Interpolated as-is.
        quantity: Name of the column to sum. Interpolated as-is.
        country: Value to match. It is placed inside a single-quoted SQL
            literal, so backslashes and single quotes are escaped first.

    Returns:
        The result of ``spark.sql`` for the generated query (the module-level
        ``spark`` session is used).
    """
    # Escape backslashes before quotes so the escapes themselves are not doubled.
    # Without this, a value such as "Cote d'Ivoire" ends the literal early and
    # the statement fails to parse.
    literal = country.replace("\\", "\\\\").replace("'", "\\'")

    return spark.sql( 
              f""" 
              
              select
              
                {columns}, 
                sum({quantity})
                
                from {frame}
                where {columns} = '{literal}'
                group by {columns}
                
              """
            )
    
# Same aggregation for another origin, kept for reference:
# flightcount('vflight', 'origin', 'count', 'India').show()
unitedflight = flightcount(
    frame='vflight',
    columns='origin',
    quantity='count',
    country='United States',
)

In [None]:
# flightcount() runs a whole query through spark.sql(), so it cannot be wrapped
# with udf() and applied row by row: the session is not usable inside a UDF, and
# the previous call also passed 'frametable', which is not a column of the
# "vflight" view, and the misspelled country 'inidia'. Call the function directly.
frametable = spark.table('vflight')

indiacount = flightcount('vflight', 'origin', 'count', 'India')
indiacount.show()

In [None]:
# Project the retail view down to the columns used in the following cells
# (the description column is left out).
shretailframe = spark.sql(
    sqlQuery=""" 
                            select
                                invoiceid,
                                orderdate,
                                stockcode,
                                quantity,
                                unitprice,
                                customerid,
                                country
                            from vwretailframe """
)

shretailframe.show(n=10)

In [None]:
def squared(s):
    """Return ``s`` multiplied by itself, passing ``None`` through.

    Spark hands SQL NULL to a Python UDF as ``None``; multiplying it would raise
    ``TypeError`` and abort the query, so ``None`` in gives ``None`` out.

    Args:
        s: A number, or ``None``.

    Returns:
        ``s * s``, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        return None
    return s * s

# Expose squared() to SQL under the same name, with results declared as FLOAT.
spark.udf.register(name='squared', f=squared, returnType=FloatType())

# Squared unit price, cast to a two-decimal numeric for display.
powerprice = spark.sql(
    sqlQuery="""
                       select 
                            cast(squared(unitprice) as numeric(12, 2)) as pwr_price
                        from vwretailframe """
)

powerprice.show(n=5)

In [23]:
def doublecol (col1, col2):
    """Multiply two column values, passing ``None`` through.

    Spark hands SQL NULL to a Python UDF as ``None``; multiplying it would raise
    ``TypeError`` and abort the query, so a ``None`` operand gives ``None``.

    Args:
        col1: First factor, or ``None``.
        col2: Second factor, or ``None``.

    Returns:
        ``col1 * col2``, or ``None`` when either argument is ``None``.
    """
    if col1 is None or col2 is None:
        return None

    return col1 * col2

# Expose doublecol() to SQL under the same name, with results declared as FLOAT.
spark.udf.register(name='doublecol', f=doublecol, returnType=FloatType())

# Line total (quantity times unit price), cast to a two-decimal numeric.
totalsell = spark.sql(
    sqlQuery="""
                        select 
                            cast(doublecol(quantity, unitprice) as numeric(12,2)) as total
                        from vwretailframe """
)
totalsell.show(n=5)

23/09/24 14:28:13 WARN SimpleFunctionRegistry: The function doublecol replaced a previously registered function.


+-----+
|total|
+-----+
|15.30|
|20.34|
|22.00|
|20.34|
|20.34|
+-----+
only showing top 5 rows



In [None]:
# Publish the ids 1 through 29 as the temp view "powerview", then preview it.
spark.range(start=1, end=30).createOrReplaceTempView(name="powerview")

spark.sql(sqlQuery=""" select * from powerview """).show(n=5)


In [None]:
@udf("long") # type: ignore
def squared_fun(s):
    """Raise ``s`` to the second power; the decorator declares the result as ``long``."""
    result = s ** 2
    return result

# Apply the decorated UDF to the id column of the "powerview" temp view.
df = spark.table(tableName="powerview")
df.select("id", squared_fun("id")).show()

In [None]:
# udf() from pyspark.sql.functions builds a UDF straight from a Python function,
# with no need to register it by name.

squaredfun = udf(squared, LongType())

# createOrReplaceTempView() returns None, so its result is not assigned: binding
# it to `df` would replace the DataFrame held by `df` with None.
spark.range(0,20, 2).createOrReplaceTempView("rangeview")

nt = spark.table("rangeview")
nt.select("id", squaredfun("id")).show()

In [24]:
# spark.sql(""" select * from vwretailframe """).show()

def customerclass(colqnt, colprice):
    """Label a sale from its total value (quantity times price).

    Spark hands SQL NULL to a Python UDF as ``None``; multiplying it would raise
    ``TypeError`` and abort the query, so a ``None`` input gives ``None``.

    Args:
        colqnt: Quantity, or ``None``.
        colprice: Unit price, or ``None``.

    Returns:
        ``"Rating A"`` when ``colqnt * colprice`` is greater than 10.00,
        ``"No"`` otherwise, or ``None`` when either argument is ``None``.
    """
    if colqnt is None or colprice is None:
        return None

    rating = colqnt * colprice

    if rating > 10.00:
        return "Rating A"
    else:
        return "No"

# Expose customerclass() to SQL as `custclass`, with results declared as STRING.
spark.udf.register(name='custclass', f=customerclass, returnType=StringType())

# Column-expression alternative, kept for reference:
# custclass = udf(customerclass, StringType())

# Rate every retail row from its quantity and unit price.
createclass = spark.sql(
    sqlQuery=""" select 
                            quantity, 
                            
                            unitprice,
                            custclass(quantity, unitprice) as rating
                        from vwretailframe """
)

createclass.show()

23/09/24 14:28:39 WARN SimpleFunctionRegistry: The function custclass replaced a previously registered function.


+--------+---------+--------+
|quantity|unitprice|  rating|
+--------+---------+--------+
|       6|     2.55|Rating A|
|       6|     3.39|Rating A|
|       8|     2.75|Rating A|
|       6|     3.39|Rating A|
|       6|     3.39|Rating A|
|       2|     7.65|Rating A|
|       6|     4.25|Rating A|
|       6|     1.85|Rating A|
|       6|     1.85|Rating A|
|      32|     1.69|Rating A|
|       6|      2.1|Rating A|
|       6|      2.1|Rating A|
|       8|     3.75|Rating A|
|       6|     1.65|      No|
|       6|     4.25|Rating A|
|       3|     4.95|Rating A|
|       2|     9.95|Rating A|
|       3|     5.95|Rating A|
|       3|     5.95|Rating A|
|       4|     7.95|Rating A|
+--------+---------+--------+
only showing top 20 rows



In [None]:
# Sample data for making the null check explicit in the function:
# the two 'prod3' rows carry no quantity and no price.
datasmp =[
        ('prod1', 10, 3.99),
        ('prod1', 20, 3.99),
        ('prod4', 5, 6.00),
        ('prod4', 4, 6.00),
        ('prod4', 5, 6.00),
        ('prod1', 4, 3.99),
        ('prod2', 5, 5.99),
        ('prod2', 5, 5.99),
        ('prod3', None, None),
        ('prod3', None, None),
        ('prod2', 5, 5.99)]

schematype = StructType([
                        StructField('produto', StringType(), True),
                        StructField('quantidade', IntegerType(), True),
                        StructField('preco', FloatType(), True)])

# Build the frame once; fillna(0) replaces the missing numeric values with 0.
frameproduto = spark.createDataFrame(datasmp, schematype).fillna(0)

# createOrReplaceTempView() returns None, so its result is not bound to a name;
# the existing frame is registered instead of building an identical second one.
frameproduto.createOrReplaceTempView("sampleprod")

spark.sql(""" select * from sampleprod """).show()


In [None]:
# Null-aware UDF.

def classprod(col_um, col_dois): 
    """Rate a product line from its total (first value times second value).

    The thresholds are compared numerically. Comparing against the strings
    '50.00' / '40.00' raises ``TypeError`` for numeric totals and only "worked"
    when a string was passed in, where it gave a meaningless lexical ordering.

    Args:
        col_um: First factor (quantity), or ``None``.
        col_dois: Second factor (price), or ``None``.

    Returns:
        ``1.00`` when the total is greater than 50.00, ``2.00`` when it is
        greater than 40.00 and at most 50.00, ``3.00`` otherwise, or ``None``
        when either argument is ``None`` (Spark passes SQL NULL as ``None``).
    """
    if col_um is None or col_dois is None:
        return None

    total = col_um * col_dois

    if total > 50.00:
        return 1.00

    elif total > 40.00:
        return 2.00

    else:
        return 3.00

spark.udf.register('classprod', classprod, FloatType())

# The rating is quantity times price, so the numeric `preco` column is passed;
# `produto` is the product name and cannot be multiplied into a total.
rt_prod = spark.sql(""" 
            
                select 
                    produto,
                    quantidade,
                    preco,
                    classprod(quantidade, preco) as rating
                    
                from sampleprod """)
rt_prod.show()

In [None]:
# Column-expression wrapper around classprod(); its result is declared as a string here.
newclass = udf(f=classprod, returnType=StringType())

# Usage example, kept for reference:
# frameproduto.select('quantidade', 'preco', newclass('quantidade', 'preco')).show()

# Line total computed by a SQL expression, without a Python UDF.
frameproduto.selectExpr('quantidade * preco').show()

frameproduto.fillna(value=0).show()