# Loading text files

`fugue` can read text files natively via `load` or by dropping into an execution engine

You might find it useful to use the execution engine directly for loading non-standard files or files that are not natively supported by `fugue`

We'll demonstrate `pandas`, `duckdb` & `dask` here

In [None]:
import os
import tempfile
import textwrap
import typing

import duckdb
from fugue import DataFrame
from fugue import ExecutionEngine
from fugue import FugueWorkflow
from fugue import NativeExecutionEngine
from fugue_dask import DaskExecutionEngine
from fugue_duckdb import DuckExecutionEngine
from fugue_sql import fsql
import pandas as pd

In [None]:
def create_temporary_file(
    _content: typing.Union[str, bytes],
    suffix: str,
    prefix: str = "fugue_example_",
) -> str:
    """Write ``_content`` to a new named temporary file and return its path.

    Args:
        _content: Data to write. ``bytes`` are written as-is; ``str`` is
            encoded as UTF-8 first because the file is opened in binary mode.
        suffix: File name suffix, e.g. ``".csv"``.
        prefix: File name prefix.

    Returns:
        The path of the created file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    data = _content.encode("utf-8") if isinstance(_content, str) else _content
    # The ``with`` block closes (and therefore flushes) the handle before the
    # path is handed back, so readers always see the complete content.
    with tempfile.NamedTemporaryFile(
        suffix=suffix, prefix=prefix, delete=False
    ) as text_file:
        text_file.write(data)
    return text_file.name

In [None]:
!rm /tmp/*.csv

zsh:1: no matches found: /tmp/*.csv


## Text files

Let's create a sample text file ...

In [None]:
# Sample CSV payload as UTF-8 bytes: a header row followed by two identical
# records, with no trailing newline.
content = "\n".join(["a,b,c", "1,2,3", "1,2,3"]).encode("utf-8")

We can read it natively

In [None]:
# Write the sample CSV to a temporary file that the workflow can load.
csv_filepath = create_temporary_file(content, suffix=".csv")

try:
    dag = FugueWorkflow()
    df = dag.load(csv_filepath, header=True)
    df.show()
    dag.run(engine="pandas")
finally:
    # Always remove the temporary file, even if the workflow fails, so that
    # no stale `fugue_example_*.csv` file is left behind.
    os.unlink(csv_filepath)

PandasDataFrame
a:str|b:str|c:str
-----+-----+-----
1    |2    |3    
1    |2    |3    
Total count: 2



We can read multiple files using a wildcard `*` 

In [None]:
csv_filepath_1 = create_temporary_file(content, suffix=".csv")
csv_filepath_2 = create_temporary_file(content, suffix=".csv")
# Build the pattern from the directory `tempfile` actually writes to; it is
# not guaranteed to be `/tmp` on every platform or TMPDIR setting.
csv_filepath_wildcard = os.path.join(
    tempfile.gettempdir(), "fugue_example_*.csv"
)

try:
    dag = FugueWorkflow()
    df = dag.load(csv_filepath_wildcard, header=True)
    df.show()
    dag.run(engine="pandas")
finally:
    # Always remove both temporary files, even if the workflow fails.
    os.unlink(csv_filepath_1)
    os.unlink(csv_filepath_2)

PandasDataFrame
a:str|b:str|c:str
-----+-----+-----
1    |2    |3    
1    |2    |3    
1    |2    |3    
1    |2    |3    
Total count: 4



Or we can use the execution engine directly if the input file is non-standard, e.g. when it has extra lines before the column names

In [None]:
# Non-standard sample file as UTF-8 bytes: a date line, a prefixed column-name
# line, then two identical records, with no trailing newline.
content = "\n".join(
    ["date: 2022-10-17", "columns: a,b,c", "1,2,3", "1,2,3"]
).encode("utf-8")

In [None]:
def read_header(filepath: str) -> typing.List[str]:
    """Return the column names stored on the second line of ``filepath``.

    The first line is skipped and the second line is parsed by
    ``pd.read_csv`` as a header with no data rows. Every ``"columns: "``
    occurrence is then removed from the first parsed name.
    """
    column_names = list(pd.read_csv(filepath, skiprows=1, nrows=0).columns)
    column_names[0] = column_names[0].replace("columns: ", "")
    return column_names

### `pandas`

In [None]:
def read_text_file(filepath: str) -> pd.DataFrame:
    """Read the non-standard text file at ``filepath`` with ``pandas``.

    The column names come from ``read_header``. The first two lines of the
    file are skipped so that only data rows are parsed; in this example those
    lines hold the date and the column names.
    """
    headers = read_header(filepath)
    # `names=` makes pandas treat every remaining line as data, so the
    # column-name line itself must be skipped too (hence 2, not 1).
    return pd.read_csv(filepath, skiprows=2, names=headers)

# Write the non-standard sample to a temporary file for the creator to read.
csv_filepath = create_temporary_file(content, suffix=".csv")

try:
    dag = FugueWorkflow()
    df = dag.create(read_text_file, params={"filepath": csv_filepath})
    df.show()
    dag.run(engine="pandas")
finally:
    # Always remove the temporary file, even if the workflow fails.
    os.unlink(csv_filepath)

PandasDataFrame
a:str                                                                                   |b:str|c:str
----------------------------------------------------------------------------------------+-----+-----
columns: a                                                                              |b    |c    
1                                                                                       |2    |3    
1                                                                                       |2    |3    
Total count: 3



We can also use the `fugue` `NativeExecutionEngine` which wraps `pandas` under the hood

In [None]:
def read_text_file(filepath: str) -> DataFrame:
    """Read the non-standard text file via ``NativeExecutionEngine``.

    The column names come from ``read_header`` and are passed to the engine's
    ``load_df`` together with ``header=True`` and ``skiprows=1``.

    The return annotation is the ``fugue`` ``DataFrame``, matching the
    ``duckdb`` and ``dask`` variants of this function, because the value
    returned is whatever ``engine.load_df`` produces.
    """
    headers = read_header(filepath)
    engine = NativeExecutionEngine()
    return engine.load_df(filepath, header=True, skiprows=1, names=headers)

# Write the non-standard sample to a temporary file for the creator to read.
csv_filepath = create_temporary_file(content, suffix=".csv")

try:
    dag = FugueWorkflow()
    df = dag.create(read_text_file, params={"filepath": csv_filepath})
    df.show()
    dag.run(engine="pandas")
finally:
    # Always remove the temporary file, even if the workflow fails.
    os.unlink(csv_filepath)

PandasDataFrame
a:str|b:str|c:str
-----+-----+-----
1    |2    |3    
1    |2    |3    
Total count: 2



### `duckdb`

> **Note:** `DuckExecutionEngine` takes `skip` & `columns` where `pandas.read_csv` takes `skiprows` & `names`, because the `duckdb` CSV reader uses different conventions  

In [None]:
def read_text_file(filepath: str) -> DataFrame:
    """Read the non-standard text file via ``DuckExecutionEngine``.

    The column names come from ``read_header`` and are passed to the engine's
    ``load_df`` as ``columns``, together with ``skip=2``.
    """
    headers = read_header(filepath)
    engine = DuckExecutionEngine()
    # Load the file named by the `filepath` argument rather than relying on
    # a global variable that happens to hold the same path.
    return engine.load_df(filepath, skip=2, columns=headers)

# Write the non-standard sample to a temporary file for the creator to read.
csv_filepath = create_temporary_file(content, suffix=".csv")

try:
    dag = FugueWorkflow()
    df = dag.create(read_text_file, params={"filepath": csv_filepath})
    df.show()
    dag.run(engine="duck")
finally:
    # Always remove the temporary file, even if the workflow fails.
    os.unlink(csv_filepath)

DuckDataFrame
a:str|b:str|c:str
-----+-----+-----
1    |2    |3    
1    |2    |3    
Total count: 2



### `dask`

In [None]:
def read_text_file(filepath: str) -> DataFrame:
    """Read the non-standard text file via ``DaskExecutionEngine``.

    The column names come from ``read_header`` and are passed to the engine's
    ``load_df`` together with ``header=True`` and ``skiprows=1``.
    """
    headers = read_header(filepath)
    engine = DaskExecutionEngine()
    # Load the file named by the `filepath` argument rather than relying on
    # a global variable that happens to hold the same path.
    return engine.load_df(filepath, header=True, skiprows=1, names=headers)

# Write the non-standard sample to a temporary file for the creator to read.
csv_filepath = create_temporary_file(content, suffix=".csv")

try:
    dag = FugueWorkflow()
    df = dag.create(read_text_file, params={"filepath": csv_filepath})
    df.show()
    dag.run(engine="dask")
finally:
    # Always remove the temporary file, even if the workflow fails.
    os.unlink(csv_filepath)

2022-10-19 20:20:17,548 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-60sqobj8', purging
2022-10-19 20:20:17,548 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-bed3uf74', purging
2022-10-19 20:20:17,548 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-4pnbyqsj', purging
2022-10-19 20:20:17,549 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-y_22uck1', purging
2022-10-19 20:20:17,549 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-be63tyrp', purging


DaskDataFrame
a:str|b:str|c:str
-----+-----+-----
1    |2    |3    
1    |2    |3    
Total count: 2

