## Unit testing a node

Requirements:

* pytest>=7.1.3
* pytest-mock>=3.9.0
* pyspark-test>=0.2.0

In [1]:
import pytest
import pandas as pd
from pyspark_test import assert_pyspark_df_equal
from pandas.testing import assert_frame_equal

from flypipe.node import node
from flypipe.datasource.spark import Spark
from flypipe.schema.column import Column
from flypipe.schema.schema import Schema
from flypipe.schema.types import Long

# Fixtures
@pytest.fixture(scope="function")
def spark():
    """
    Provide the Spark session used by the tests.

    Before handing the session over, a one-row DataFrame with columns
    `c1`, `c2`, `c3` (values 1, 2, 3) is registered as the temporary view
    `dummy_table`, so nodes depending on `Spark("dummy_table")` have data
    to read.
    """
    # When running locally, make sure your Spark environment is set up first.
    from flypipe.tests.spark import spark as session

    # (Re)create the temporary view the nodes under test read from.
    single_row = session.createDataFrame(
        data=[(1, 2, 3)], schema=("c1", "c2", "c3")
    )
    single_row.createOrReplaceTempView("dummy_table")

    return session

### Pyspark Node

In [2]:
class TestPyspark:
    """Tests for nodes declared with `type="pyspark"`."""

    def test_(self, spark):
        """
        A pyspark node that selects `c1` from `dummy_table` returns a
        single-column Spark DataFrame holding the view's value (1).
        """

        @node(
            type="pyspark",
            dependencies=[Spark("dummy_table").select("c1")],
            output=Schema([Column("c1", Long())]),
        )
        def t1(dummy_table):
            return dummy_table

        expected = spark.createDataFrame(data=[(1,)], schema=("c1",))
        actual = t1.run(spark)

        assert_pyspark_df_equal(actual, expected)

    def test_pypsark_with_provided_inputs(self, spark):
        """
        Same node, but the `dummy_table` dependency is supplied through
        `inputs` as a pandas DataFrame; the result is expected to carry the
        provided value (10) rather than the value stored in the view (1).
        """

        @node(
            type="pyspark",
            dependencies=[Spark("dummy_table").select("c1")],
            output=Schema([Column("c1", Long())]),
        )
        def t1(dummy_table):
            return dummy_table

        # Stand-in data for the `dummy_table` datasource.
        provided = pd.DataFrame({"c1": [10], "c2": [20], "c3": [30]})
        expected = spark.createDataFrame(data=[(10,)], schema=("c1",))

        provided_inputs = {Spark("dummy_table"): provided}
        actual = t1.run(spark, inputs=provided_inputs)

        assert_pyspark_df_equal(actual, expected)

### Pandas on Spark Node

In [3]:
class TestPandasOnSpark:
    """Tests for nodes declared with `type="pandas_on_spark"`."""

    def test_pandas_on_spark_node(self, spark):
        """
        A pandas-on-Spark node that selects `c1` from `dummy_table`; both the
        result and the expectation are converted to Spark DataFrames before
        being compared.
        """

        @node(
            type="pandas_on_spark",
            dependencies=[Spark("dummy_table").select("c1")],
            output=Schema([Column("c1", Long())]),
        )
        def t1(dummy_table):
            return dummy_table

        expected = spark.createDataFrame(
            data=[(1,)], schema=("c1",)
        ).pandas_api()
        actual = t1.run(spark)

        actual_spark = actual.to_spark()
        expected_spark = expected.to_spark()
        assert_pyspark_df_equal(actual_spark, expected_spark)

    def test_pandas_on_spark_node_without_spark_context(self):
        """
        Pandas-on-Spark node executed without passing a Spark session: the
        dependency is provided as a pandas DataFrame and the run is requested
        with `pandas_on_spark_use_pandas=True`. The result is compared with a
        plain pandas DataFrame.
        """

        @node(
            type="pandas_on_spark",
            dependencies=[Spark("dummy_table").select("c1")],
            output=Schema([Column("c1", Long())]),
        )
        def t1(dummy_table):
            return dummy_table

        # Stand-in data for the `dummy_table` datasource.
        provided = pd.DataFrame({"c1": [10], "c2": [20], "c3": [30]})
        expected = pd.DataFrame({"c1": [10]})

        provided_inputs = {Spark("dummy_table"): provided}
        actual = t1.run(
            # This flag is the point of the test: no Spark session is given.
            pandas_on_spark_use_pandas=True,
            inputs=provided_inputs,
        )

        assert_frame_equal(actual, expected)

### Pandas Node

In [4]:
class TestPandasNode:
    """Tests for nodes declared with `type="pandas"`."""

    def test_(self, spark):
        """
        A pandas node that selects `c1` from `dummy_table` returns a pandas
        DataFrame holding the view's value (1).
        """

        @node(
            type="pandas",
            dependencies=[Spark("dummy_table").select("c1")],
            output=Schema([Column("c1", Long())]),
        )
        def t1(dummy_table):
            return dummy_table

        expected = pd.DataFrame({"c1": [1]})
        actual = t1.run(spark)

        assert_frame_equal(actual, expected)

    def test_pandas_with_provided_inputs(self):
        """
        Pandas node whose `dummy_table` dependency is supplied through
        `inputs` as a pandas DataFrame.

        NOTE: no Spark session is requested or passed to `run` here.
        """

        @node(
            type="pandas",
            dependencies=[Spark("dummy_table").select("c1")],
            output=Schema([Column("c1", Long())]),
        )
        def t1(dummy_table):
            return dummy_table

        # Stand-in data for the `dummy_table` datasource.
        provided = pd.DataFrame({"c1": [10], "c2": [20], "c3": [30]})
        expected = pd.DataFrame({"c1": [10]})

        provided_inputs = {Spark("dummy_table"): provided}
        actual = t1.run(inputs=provided_inputs)

        assert_frame_equal(actual, expected)

Running tests (**Jupyter notebooks only**)

`pip install "ipytest>=0.13.0"`

In [5]:
# ipytest is only needed to execute the tests from inside a notebook.
import ipytest
# Run the tests defined in the cells above; the call is left as the cell's
# last expression so that its return value (the exit code) is displayed.
ipytest.run()

platform linux -- Python 3.9.5, pytest-7.1.3, pluggy-1.0.0
rootdir: /notebooks/docs/tutorial
plugins: mock-3.9.0, anyio-3.6.2
collected 6 items

t_4ef93f119aed40fa9f8ef138aec469ba.py 

                                                                                

......                                                 [100%]

t_4ef93f119aed40fa9f8ef138aec469ba.py::TestPyspark::test_pypsark_with_provided_inputs
    [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]

    elif tpe in (bytes, np.character, np.bytes_, np.string_):

t_4ef93f119aed40fa9f8ef138aec469ba.py::TestPandasOnSpark::test_pandas_on_spark_node



<ExitCode.OK: 0>