## Adding your own data source

In [1]:
import os
import re
import pandas as pd

from flypipe.node import node
from flypipe.node_type import NodeType


def CSV(path_csv):
    """
    Build a flypipe pandas node that loads the CSV file at ``path_csv``.

    The returned node is customised after creation:

    * its wrapped function is renamed to the file's base name with every
      non-alphanumeric character replaced by ``_``
      (``/tmp/test.csv`` -> ``test_csv``);
    * its ``key`` is the full path sanitised the same way
      (``/tmp/test.csv`` -> ``_tmp_test_csv``);
    * its ``node_type`` is set to ``NodeType.DATASOURCE``.

    Parameters
    ----------
    path_csv : str
        Path of the CSV file to load.

    Returns
    -------
    The decorated ``load_csv`` node. The file is only read when the node's
    function is executed, not when ``CSV`` is called.
    """

    @node(
        type="pandas",
        description=f"Loading CSV file at {path_csv}",
        tags=["datasource", "csv"],
    )
    def load_csv():
        # The context manager closes the file handle once pandas has finished
        # parsing, instead of leaving it open until garbage collection.
        with open(path_csv, "r") as csv_file:
            return pd.read_csv(csv_file)

    # Raw strings keep `\d` as a regex class rather than an (invalid) string
    # escape, which newer Python versions warn about.
    file_name = re.sub(r"[^\da-zA-Z]", "_", os.path.basename(path_csv))
    key = re.sub(r"[^\da-zA-Z]", "_", path_csv)

    # NOTE(review): downstream nodes appear to receive this datasource under
    # the sanitised file name - confirm against the flypipe documentation.
    load_csv.function.__name__ = file_name
    load_csv.key = key
    load_csv.node_type = NodeType.DATASOURCE
    return load_csv

#### Testing

In [2]:
import pytest
import pandas as pd
from pyspark_test import assert_pyspark_df_equal
from pandas.testing import assert_frame_equal

from flypipe.node import node
from flypipe.datasource.spark import Spark
from flypipe.schema.column import Column
from flypipe.schema.schema import Schema
from flypipe.schema.types import Long

#Fixtures
@pytest.fixture(scope="function")
def dummy_df():
    """
    Create a two-row fruit/color dataframe and persist it as a CSV file.

    The dataframe is written to ``/tmp/test.csv`` (without the index column)
    so a test can load it back through a CSV datasource and compare the
    result with the dataframe returned here.

    Returns
    -------
    pandas.DataFrame
        Columns ``color`` and ``fruit`` with the rows
        ``("yellow", "lemon")`` and ``("green", "lime")``.
    """
    # Build both rows in one go. Assigning each row to the same index label
    # would make the second row overwrite the first.
    df = pd.DataFrame(
        [
            ["yellow", "lemon"],
            ["green", "lime"],
        ],
        columns=["color", "fruit"],
    )

    # Save the dataframe where the test expects to find it.
    df.to_csv("/tmp/test.csv", index=False)

    return df

In [3]:
class TestPyspark:
    """Tests for the CSV datasource, consumed by a pandas node."""

    def test_(self, dummy_df):
        """
        Load the CSV file through a CSV datasource node and check that the
        result equals the dataframe provided by the ``dummy_df`` fixture.
        """
        # Datasource restricted to the two columns the node needs.
        csv_source = CSV("/tmp/test.csv").select(["fruit", "color"])

        @node(type="pandas", dependencies=[csv_source])
        def t1(test_csv):
            # Pass-through node. The parameter name equals the sanitised file
            # name ("test.csv" -> "test_csv") given to the datasource function.
            return test_csv

        loaded = t1.run()
        assert_frame_equal(dummy_df, loaded)

In [4]:
# Run the tests defined in the notebook cells with pytest, directly from
# this kernel; the pytest session report is shown as the cell output.
import ipytest
ipytest.run()

platform linux -- Python 3.9.5, pytest-7.1.3, pluggy-1.0.0
rootdir: /notebooks/docs/tutorial
plugins: mock-3.9.0, anyio-3.6.2
collected 1 item

t_2e522ae4850d427cb03a56c369c75e1a.py .                                                      [100%]



<ExitCode.OK: 0>