# Pandas and Pyarrow

In [None]:
import random
import string
import numpy as np
import pandas as pd
from datetime import datetime
import pathlib
import pyarrow as pa
import pyarrow.csv 
%load_ext memory_profiler

Create a large dataset

In [None]:
%%time
def gen_random_string(length:int=32) -> str:
    """Return a random string of uppercase ASCII letters and digits.

    Characters are drawn with replacement using the module-level ``random``
    generator, so calling ``random.seed`` beforehand makes the result
    reproducible.

    Args:
        length: Number of characters in the returned string.

    Returns:
        A string of exactly ``length`` characters.
    """
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length))


def make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
    """Build a synthetic DataFrame with one row per timestamp in a date range.

    The frame has a default integer index and the following columns:

    * ``date``: timestamps from ``pd.date_range(start, end, freq)``
    * ``cat``:  random label among ``cat1`` .. ``cat5``
    * ``str1``, ``str2``: random 32-character strings
    * ``a``, ``b``: random floats from ``np.random.rand``
    * ``c``: random integers from ``np.random.randint(1, 100)``

    Args:
        start: First timestamp of the range (anything ``pd.date_range`` accepts).
        end: Last timestamp of the range; it is kept in the result when it
            falls exactly on the frequency grid.
        freq: Pandas frequency string giving the spacing between rows.
        seed: Optional seed. When given, both NumPy's global generator and
            the stdlib ``random`` generator are seeded so that every column
            is reproducible. When ``None``, the generators are left untouched.

    Returns:
        A ``pandas.DataFrame`` with ``len(pd.date_range(...))`` rows; an empty
        range yields an empty frame.
    """
    dt = pd.date_range(start=start, end=end, freq=freq, name="timestamp")
    n = len(dt)

    if seed is not None:
        # Call the seeding functions (assigning to np.random.seed would
        # overwrite the function and leave the generator unseeded).
        np.random.seed(seed)
        # The string columns are produced by the stdlib generator.
        random.seed(seed)

    columns = {
        'date': dt,
        'cat': np.random.choice(['cat1','cat2','cat3','cat4','cat5'],n),
        'str1':[gen_random_string() for _ in range(n)],
        'str2':[gen_random_string() for _ in range(n)],
        'a': np.random.rand(n),
        'b': np.random.rand(n),
        'c': np.random.randint(1,100,n),
    }

    # Dict insertion order already fixes the column order.
    return pd.DataFrame(columns)

# One row per minute from 2020-01-01 00:00 up to 2023-12-31 00:00
# (roughly 2.1 million rows); seed=10 is forwarded to make_timeseries.
df = make_timeseries(
    start=datetime(2020, 1, 1),
    end=datetime(2023, 12, 31),
    freq='1min',
    seed=10,
)

Print the first rows to see what the data looks like.

In [None]:
df.head()

Print the shape of the dataframe

In [None]:
df.shape

Print memory usage of the dataframe 

In [None]:
df.info(memory_usage="deep")

## Write and read the dataframe in CSV format 

First we will time reading and writing the dataframe in CSV format, in order to compare the performance of pyarrow.

Write the dataframe in CSV format

In [None]:
%%time
# TODO

To check the size of the file on disk, you can use the `stat()` method of `pathlib.Path`.

In [None]:
# TODO

Write dataframe in CSV format with compression

In [None]:
%%time
# TODO

Check the file size

In [None]:
# TODO

Now, you can try to measure the performance in reading the dataframe in CSV format

First, try to read the CSV file

In [None]:
%%time
# TODO

Then, try to read the compressed CSV file

In [None]:
%%time
# TODO

## Use pyarrow

We will now time reading and writing the dataframe in CSV format using pyarrow.

First, copy the dataframe.

In [None]:
df_pa = df.copy()

Convert the data to pyarrow format

In [None]:
df_pa_table = pa.Table.from_pandas(df_pa)

In [None]:
df_pa_table

Write the dataframe with the `pyarrow.csv.write_csv` method

In [None]:
%%time
# TODO

Check the file size

In [None]:
# TODO

In [None]:
%%time
# TODO

Check the file size

In [None]:
# TODO

Read the dataframe. You can use the parameter `engine="pyarrow"` in `pandas.read_csv`.

In [None]:
%%time
# TODO

## Convert to pyarrow types

It is also possible to use PyArrow types directly in pandas: PyArrow data structure integration is implemented through pandas' ExtensionArray interface.
Using PyArrow data types makes it possible to accelerate processing with PyArrow compute functions where available.

In [None]:
df_pa = df.copy()

In [None]:
df_pa.columns

Try to convert types with Pyarrow types.

In [None]:
# TODO

Check the types of the columns

In [None]:
df_pa.dtypes

In [None]:
df_pa.memory_usage(deep=True)