# Apache Arrow
Reference: PyArrow (Apache Arrow for Python) documentation — https://arrow.apache.org/docs/python/index.html

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn import datasets

In [2]:
# Fetch the wine dataset that ships with scikit-learn; the returned object
# exposes the feature matrix as `.data` and the column labels as `.feature_names`.
wine = datasets.load_wine()

# Wrap the feature matrix in a pandas DataFrame, labelling each column with
# its feature name, and preview the first rows.
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [3]:
# Convert the pandas DataFrame into an Arrow table.
table_from_df = pa.Table.from_pandas(df)
# Display the table's schema (column names and types, plus the pandas metadata).
table_from_df.schema

alcohol: double
malic_acid: double
ash: double
alcalinity_of_ash: double
magnesium: double
total_phenols: double
flavanoids: double
nonflavanoid_phenols: double
proanthocyanins: double
color_intensity: double
hue: double
od280/od315_of_diluted_wines: double
proline: double
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1887

In [4]:
# Pull a single column out of the table by name; it is displayed as a ChunkedArray.
table_from_df.column("malic_acid")

<pyarrow.lib.ChunkedArray object at 0x7fd3bdd98290>
[
  [
    1.71,
    1.78,
    2.36,
    1.95,
    2.59,
    1.76,
    1.87,
    2.15,
    1.64,
    1.35,
    ...
    2.58,
    4.6,
    3.03,
    2.39,
    2.51,
    5.65,
    3.91,
    4.28,
    2.59,
    4.1
  ]
]

In [5]:
# Dump the DataFrame into a plain dict that maps each column name to a list
# of that column's values.
data_as_dict = df.to_dict(orient="list")

# Report how many columns were exported, then preview the first ten
# "alcohol" values.
print(len(data_as_dict))
data_as_dict["alcohol"][:10]

13


[14.23, 13.2, 13.16, 14.37, 13.24, 14.2, 14.39, 14.06, 14.83, 13.86]

In [6]:
# Build an Arrow table directly from the Python dict of column lists.
table_from_dict = pa.Table.from_pydict(data_as_dict)
# Display the schema of the resulting table.
table_from_dict.schema

alcohol: double
malic_acid: double
ash: double
alcalinity_of_ash: double
magnesium: double
total_phenols: double
flavanoids: double
nonflavanoid_phenols: double
proanthocyanins: double
color_intensity: double
hue: double
od280/od315_of_diluted_wines: double
proline: double

In [7]:
# Convert every column (a Python list) of the dict into a pyarrow array,
# preserving the dict's column order. Only the values are needed here, so
# iterate over .values() instead of unpacking keys that are never used; the
# comprehension also avoids leaking loop variables into the kernel namespace.
data_as_pa_arrays = [pa.array(column) for column in data_as_dict.values()]
len(data_as_pa_arrays)

13

In [8]:
# Assemble a RecordBatch from the list of arrays, naming each column after
# the corresponding key of the source dict (iterating a dict yields its keys).
rb_from_arrays = pa.RecordBatch.from_arrays(
    data_as_pa_arrays,
    names=list(data_as_dict),
)
rb_from_arrays.schema

alcohol: double
malic_acid: double
ash: double
alcalinity_of_ash: double
magnesium: double
total_phenols: double
flavanoids: double
nonflavanoid_phenols: double
proanthocyanins: double
color_intensity: double
hue: double
od280/od315_of_diluted_wines: double
proline: double

In [9]:
# Convert the record batch back into a pandas DataFrame and display it.
rb_from_arrays.to_pandas()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [10]:
# Assemble an Arrow table from a list of record batches (a single batch here).
table_from_rb = pa.Table.from_batches([rb_from_arrays])
# Display the schema of the resulting table.
table_from_rb.schema

alcohol: double
malic_acid: double
ash: double
alcalinity_of_ash: double
magnesium: double
total_phenols: double
flavanoids: double
nonflavanoid_phenols: double
proanthocyanins: double
color_intensity: double
hue: double
od280/od315_of_diluted_wines: double
proline: double

In [11]:
# Write the Arrow table to a Parquet file at the relative path "sample.parquet".
pq.write_table(table_from_rb, "sample.parquet")

In [12]:
# Read the Parquet file "sample.parquet" back in as an Arrow table and display it.
pq.read_table("sample.parquet")

pyarrow.Table
alcohol: double
malic_acid: double
ash: double
alcalinity_of_ash: double
magnesium: double
total_phenols: double
flavanoids: double
nonflavanoid_phenols: double
proanthocyanins: double
color_intensity: double
hue: double
od280/od315_of_diluted_wines: double
proline: double

In [13]:
# Demonstrate the error pyarrow raises when a table's columns have unequal
# lengths (six values in col1 versus three in col2).
# The exception is the point of this cell, so it is caught and printed rather
# than left unhandled; this lets "Restart Kernel -> Run All" continue past it.
try:
    pa.Table.from_arrays(
        [pa.array([0, 1, 2, 3, 4, 5]), pa.array([.1, .2, .3])],
        names=["col1", "col2"],
    )
except pa.ArrowInvalid as err:
    print(f"{type(err).__name__}: {err}")

ArrowInvalid: ignored

In [14]:
# Fix the unequal-length error: both columns now hold six values, so the
# table is built successfully.
pa.Table.from_arrays(
    [
        pa.array([0, 1, 2, 3, 4, 5]),
        pa.array([0, 0.1, 0.2, 0.3, 0.4, 0.5]),
    ],
    names=["col1", "col2"],
)

pyarrow.Table
col1: int64
col2: double

In [15]:
# Create an Arrow array from a Python list of integers and display it.
pa_arr = pa.array([0, 1, 2, 3, 4, 5])
pa_arr

<pyarrow.lib.Int64Array object at 0x7fd3bdd8ede0>
[
  0,
  1,
  2,
  3,
  4,
  5
]

In [16]:
# Convert the Arrow array to its pandas representation and display the result.
pa_arr.to_pandas()

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64