In [1]:
import string
import random
import pandas as pd
from typing import List,Iterator,Tuple
from pyspark.sql.pandas.functions import pandas_udf
from pyspark.sql.functions import struct, col

from pyspark.sql.types import StringType,DoubleType
from sparkstudy.deploy.demo_sessions import DemoSQLSessionFactory
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Column names passed to createDataFrame for the randomly generated (name, age, salary) rows.
COLUMNS = ["name","age","salary"]

比较一下开启与不开启 Arrow 的区别

测试下来，感觉性能提升有点奇怪。有时候会快，有时候会慢。

In [2]:
def create_random_data(row_num:int, seed:"int | None"=None)->List[tuple]:
    """Generate ``row_num`` random ``(letter, integer, float)`` rows.

    Each row holds, in order:
      * one random uppercase ASCII letter,
      * a random integer in ``[1, row_num]`` (inclusive),
      * a random float in ``[0.0, 1.0)``.

    Args:
        row_num: Number of rows to generate; also the upper bound of the
            integer field. Values <= 0 yield an empty list.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same seed always produces the same rows and the
            global ``random`` state is left untouched. When ``None``
            (default), values are drawn from the global ``random`` module
            exactly as before.

    Returns:
        A list of ``row_num`` tuples.
    """
    # Fall back to the module-level generator so callers that rely on
    # ``random.seed(...)`` keep getting the same stream as before.
    rng = random if seed is None else random.Random(seed)
    letters = string.ascii_uppercase
    # Draw order per row (choice, randint, random) matches the original loop.
    return [
        (rng.choice(letters), rng.randint(1, row_num), rng.random())
        for _ in range(row_num)
    ]

In [3]:
def test_performance(session_factory:DemoSQLSessionFactory, n:int = 100000):
    """Build a Spark DataFrame of ``n`` random rows and convert it to pandas.

    Intended to be timed from the outside (e.g. with ``%time``); the function
    itself returns nothing.

    Args:
        session_factory: Factory whose ``build_session()`` supplies the Spark
            session used for the run.
        n: Number of random rows to generate.
    """
    rows = create_random_data(n)
    session = session_factory.build_session()
    cached_df = session.createDataFrame(rows, COLUMNS).cache()
    # The Spark -> pandas conversion is the step being measured.
    pandas_df = cached_df.toPandas()
    pandas_df.head(5)

In [4]:
# Baseline run: a factory with no extra Spark configuration added in this notebook.
session_factory_normal = DemoSQLSessionFactory(name="normal")
%time test_performance(session_factory_normal)

CPU times: user 3.05 s, sys: 51.7 ms, total: 3.1 s
Wall time: 10.8 s


In [5]:
# Same benchmark, this time with the Arrow setting for PySpark turned on.
# NOTE(review): "arraw" in the session name looks like a typo for "arrow";
# left unchanged because it is a runtime string.
session_factory_arrow = DemoSQLSessionFactory(name="with arraw")
session_factory_arrow.add_config("spark.sql.execution.arrow.pyspark.enabled","true")
%time test_performance(session_factory_arrow)

CPU times: user 2.84 s, sys: 22.3 ms, total: 2.86 s
Wall time: 3.41 s


常规的HelloWorld的example。
页面上面的第一个例子。本质就是生成一个新的dataframe
1. 在annotation上面列出的是新的dataframe的col和类型
2. 它会自动地把 pandas 的类型转换成 Spark 的类型
3. 函数应该是分批在各个 node 上执行，然后再汇总，因为我看到 hello world 的函数被执行了好几次

In [6]:
# Cap Arrow batches at 10 records for the UDF examples that follow.
# NOTE(review): this factory already built a session in the timing cell above;
# whether a setting added afterwards reaches the session returned here depends
# on DemoSQLSessionFactory.build_session — confirm.
session_factory_arrow.add_config('spark.sql.execution.arrow.maxRecordsPerBatch',10)
spark = session_factory_arrow.build_session()
# 1000 random rows, reused by the pandas UDF cells below via `basic_df`.
test_data = create_random_data(row_num=1000)
basic_df = spark.createDataFrame(test_data,COLUMNS)
basic_df.show()

+----+---+-------------------+
|name|age|             salary|
+----+---+-------------------+
|   I|625| 0.7905260172728258|
|   L|121|0.15568573810904118|
|   C|507|  0.773368400830384|
|   I|741| 0.5315484410622431|
|   P|967| 0.7963605997989015|
|   E| 97| 0.8525953416181977|
|   C|447|0.24632034189710417|
|   J|132| 0.7617200252444477|
|   E|276|0.35385200538445893|
|   E|112| 0.6746114657290763|
|   J|466| 0.8844738355236132|
|   S|644|0.07209591236191981|
|   C|662|   0.92176358697027|
|   L|612|0.23139110708692412|
|   U|638| 0.2426797460194945|
|   I| 96| 0.5869054936620904|
|   T|745|0.47041040019443814|
|   J|687| 0.3531786496215842|
|   L|155| 0.1903792605123351|
|   K|649|0.18742033516472556|
+----+---+-------------------+
only showing top 20 rows



In [7]:
@pandas_udf("total double")
def func(s1: pd.Series, s2: pd.Series) -> pd.DataFrame:
    """Add two columns element-wise and return them as a one-column frame.

    The returned frame has a single ``total`` column, matching the
    ``"total double"`` schema declared in the decorator.
    """
    # Printed on every invocation so repeated calls are visible.
    print("execute")
    return pd.DataFrame({"total": s1 + s2})
basic_df.select(func("age","salary").alias("result")).show()

+--------------------+
|              result|
+--------------------+
| [625.7905260172728]|
|[121.15568573810904]|
| [507.7733684008304]|
| [741.5315484410622]|
| [967.7963605997988]|
|  [97.8525953416182]|
| [447.2463203418971]|
|[132.76172002524444]|
|[276.35385200538445]|
|[112.67461146572907]|
|[466.88447383552364]|
| [644.0720959123619]|
| [662.9217635869703]|
| [612.2313911070869]|
| [638.2426797460195]|
| [96.58690549366209]|
| [745.4704104001944]|
| [687.3531786496216]|
|[155.19037926051234]|
| [649.1874203351647]|
+--------------------+
only showing top 20 rows



主要是想要看看 select 方法能不能接受一个 List

In [8]:
def to_str_func(s1: pd.Series) -> pd.Series:
    """Return a copy of ``s1`` with every value cast to ``str``."""
    as_text = s1.astype(str)
    return as_text
# Wrap the plain function as a pandas UDF that returns strings.
to_str = pandas_udf(to_str_func, returnType=StringType())

# One stringified column per source column, keeping the original column name.
age_c, salary_c = (to_str(name).alias(name) for name in ("age", "salary"))
selects = [age_c, salary_c]
# Pass the list itself (not unpacked) to check that select() accepts a list.
basic_df.select(selects).show()

+---+-------------------+
|age|             salary|
+---+-------------------+
|625| 0.7905260172728258|
|121|0.15568573810904118|
|507|  0.773368400830384|
|741| 0.5315484410622431|
|967| 0.7963605997989015|
| 97| 0.8525953416181977|
|447|0.24632034189710417|
|132| 0.7617200252444477|
|276|0.35385200538445893|
|112| 0.6746114657290763|
|466| 0.8844738355236132|
|644|0.07209591236191981|
|662|   0.92176358697027|
|612|0.23139110708692412|
|638| 0.2426797460194945|
| 96| 0.5869054936620904|
|745|0.47041040019443814|
|687| 0.3531786496215842|
|155| 0.1903792605123351|
|649|0.18742033516472556|
+---+-------------------+
only showing top 20 rows



测试一下：如果参数的个数是不定的，行不行

简单地来说，
- 确定的column个数，用Series
- 不确定用dataframe
- Iterator 类似于流式处理

In [9]:
@pandas_udf("double")
def to_sum_func(data: pd.DataFrame) -> pd.Series:
    """Multiply the ``age`` and ``salary`` columns of the incoming frame.

    The struct column passed at the call site arrives as a pandas DataFrame,
    so the columns are looked up by name.
    """
    product = data["age"] * data["salary"]
    return product
cols = [col(column_name) for column_name in ("age", "salary")]
# Bundle the columns into one struct so the UDF takes a single argument.
headers = struct(cols)
# Alternative to the decorator: wrap an undecorated function with
# pandas_udf(fn, returnType=DoubleType()).
basic_df.select(to_sum_func(headers).alias("result")).show()



+------------------+
|            result|
+------------------+
|494.07876079551613|
| 18.83797431119398|
| 392.0977792210047|
|393.87739482712215|
| 770.0807000055378|
| 82.70174813696518|
|110.10519282800556|
| 100.5470433322671|
| 97.66315348611067|
| 75.55648416165654|
|412.16480735400376|
| 46.42976756107636|
| 610.2074945743187|
|141.61135753719756|
| 154.8296779604375|
| 56.34292739156068|
| 350.4557481448564|
|242.63373229002835|
|29.508785379411943|
| 121.6357975219069|
+------------------+
only showing top 20 rows

