In [1]:
import pandas as pd
import pyarrow as pa

In [2]:
# Start from a small frame with one integer column and one string column,
# using the default (range) index.
df = pd.DataFrame({"a": [1, 2, 3], "b": list("xyz")})
print("Original DataFrame:", df, sep="\n")

# pandas -> Arrow: each DataFrame column becomes a typed Arrow column.
table = pa.Table.from_pandas(df)
print("\nConverted to Arrow Table:", table, sep="\n")

# Arrow -> pandas: rebuild a DataFrame from the Arrow table.
df_new = table.to_pandas()
print("\nConverted back to DataFrame:", df_new, sep="\n")

Original DataFrame:
   a  b
0  1  x
1  2  y
2  3  z

Converted to Arrow Table:
pyarrow.Table
a: int64
b: string
----
a: [[1,2,3]]
b: [["x","y","z"]]

Converted back to DataFrame:
   a  b
0  1  x
1  2  y
2  3  z


In [3]:
# Build a frame whose rows are labelled with strings instead of the default
# 0..n-1 positions.
df_indexed = pd.DataFrame(
    dict(a=[10, 20, 30], b=["foo", "bar", "baz"]),
    index=["row1", "row2", "row3"],
)
print("Original DataFrame with index:", df_indexed, sep="\n")

# pandas -> Arrow. Because the index is not a plain RangeIndex, it is carried
# along as an extra column in the resulting table.
table_indexed = pa.Table.from_pandas(df_indexed)
print("\nArrow Table (with index preserved):", table_indexed, sep="\n")

# Arrow -> pandas: the stored index column is turned back into the row labels.
df_roundtrip = table_indexed.to_pandas()
print("\nRoundtrip DataFrame:", df_roundtrip, sep="\n")


Original DataFrame with index:
       a    b
row1  10  foo
row2  20  bar
row3  30  baz

Arrow Table (with index preserved):
pyarrow.Table
a: int64
b: string
__index_level_0__: string
----
a: [[10,20,30]]
b: [["foo","bar","baz"]]
__index_level_0__: [["row1","row2","row3"]]

Roundtrip DataFrame:
       a    b
row1  10  foo
row2  20  bar
row3  30  baz


In [4]:

# A single categorical column: three distinct labels, each appearing twice.
df_cat = pd.DataFrame({"cat": pd.Categorical(["a", "b", "c"] * 2)})
print("Original DataFrame with categorical column:", df_cat, sep="\n")
print("Categories:", df_cat["cat"].cat.categories)

# pandas -> Arrow: the categorical column is converted to a dictionary-typed
# Arrow column (distinct values plus integer indices into them).
table_cat = pa.Table.from_pandas(df_cat)
print("\nArrow Table with categorical column:", table_cat, sep="\n")

# Pull the column out of the table and take its first chunk so the
# dictionary values and the per-row indices can be inspected separately.
col = table_cat.column("cat")
chunk = col.chunk(0)

print("\nUnderlying dictionary in 'cat' column:")
print("Dictionary:", chunk.dictionary)
print("Indices:", chunk.indices)


Original DataFrame with categorical column:
  cat
0   a
1   b
2   c
3   a
4   b
5   c
Categories: Index(['a', 'b', 'c'], dtype='object')

Arrow Table with categorical column:
pyarrow.Table
cat: dictionary<values=string, indices=int8, ordered=0>
----
cat: [  -- dictionary:
["a","b","c"]  -- indices:
[0,1,2,0,1,2]]

Underlying dictionary in 'cat' column:
Dictionary: [
  "a",
  "b",
  "c"
]
Indices: [
  0,
  1,
  2,
  0,
  1,
  2
]


In [6]:
# Build a one-column frame backed by pandas' nullable integer dtype, so the
# missing entry is kept as <NA> and the column stays integer-typed.
df_nullable = pd.Series([1, 2, None], dtype=pd.Int64Dtype(), name="a").to_frame()
print("Original DataFrame with nullable dtype:", df_nullable, sep="\n")
print("Dtypes:", df_nullable.dtypes, sep="\n")


Original DataFrame with nullable dtype:
      a
0     1
1     2
2  <NA>
Dtypes:
a    Int64
dtype: object


In [7]:

# pandas -> Arrow for the nullable-integer frame; the missing value is
# represented as an Arrow null in the resulting column.
table_nullable = pa.Table.from_pandas(df_nullable)
print("\nArrow Table from DataFrame with nullable dtype:", table_nullable, sep="\n")



Arrow Table from DataFrame with nullable dtype:
pyarrow.Table
a: int64
----
a: [[1,2,null]]


In [8]:
# Lookup table from Arrow data types to the pandas nullable extension dtypes
# that should be used for them when converting a table back to pandas.
dtype_mapping = dict(
    [
        (pa.int64(), pd.Int64Dtype()),
        (pa.float64(), pd.Float64Dtype()),
        (pa.string(), pd.StringDtype()),
    ]
)


In [9]:

# Arrow -> pandas, passing the mapping's lookup function as `types_mapper` so
# Arrow types found in `dtype_mapping` are converted to the mapped pandas
# dtypes. `dict.get` returns None for any Arrow type that is not in the mapping.
df_nullable_roundtrip = table_nullable.to_pandas(types_mapper=dtype_mapping.get)
print("\nRoundtrip DataFrame with nullable dtypes:", df_nullable_roundtrip, sep="\n")
print("Dtypes after conversion:", df_nullable_roundtrip.dtypes, sep="\n")



Roundtrip DataFrame with nullable dtypes:
      a
0     1
1     2
2  <NA>
Dtypes after conversion:
a    Int64
dtype: object
