The pandas I/O System
=====================



In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa

## CSV - basic reading / writing



### How to do it



In [3]:
# Small demo frame (one row per band member), converted to pandas' nullable
# extension dtypes so it matches what the readers below are asked to return.
df = (
    pd.DataFrame(
        [
            ("Paul", "McCartney", 1942),
            ("John", "Lennon", 1940),
            ("Richard", "Starkey", 1940),
            ("George", "Harrison", 1943),
        ],
        columns=["first", "last", "birth"],
    )
    .convert_dtypes(dtype_backend="numpy_nullable")
)
df

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [4]:
import io
# In-memory text buffer stands in for a file on disk.
buf = io.StringIO()

# Written with default settings, so the row index is included as the first column.
df.to_csv(buf)
print(buf.getvalue())

,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943



In [4]:
# Rewind the buffer so read_csv starts from the beginning of the written text.
buf.seek(0)
# Without index_col, the index written by to_csv is read back as an ordinary column.
pd.read_csv(buf, dtype_backend="numpy_nullable")

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,McCartney,1942
1,1,John,Lennon,1940
2,2,Richard,Starkey,1940
3,3,George,Harrison,1943


In [5]:
buf.seek(0)
# index_col=0 uses the first CSV column as the row index instead of a data column.
pd.read_csv(buf, dtype_backend="numpy_nullable", index_col=0)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [6]:
# Fresh buffer; index=False leaves the row index out of the CSV text.
buf = io.StringIO()
df.to_csv(buf, index=False)
print(buf.getvalue())

first,last,birth
Paul,McCartney,1942
John,Lennon,1940
Richard,Starkey,1940
George,Harrison,1943



### There's more…



In [7]:
# The name values contain a comma, which is also the default CSV separator.
df = (
    pd.DataFrame(
        [
            ("McCartney, Paul", 1942),
            ("Lennon, John", 1940),
            ("Starkey, Richard", 1940),
            ("Harrison, George", 1943),
        ],
        columns=["name", "birth"],
    )
    .convert_dtypes(dtype_backend="numpy_nullable")
)
df

Unnamed: 0,name,birth
0,"McCartney, Paul",1942
1,"Lennon, John",1940
2,"Starkey, Richard",1940
3,"Harrison, George",1943


In [8]:
buf = io.StringIO()
# Values that contain the separator (a comma) are wrapped in quotes in the output.
df.to_csv(buf, index=False)
print(buf.getvalue())

name,birth
"McCartney, Paul",1942
"Lennon, John",1940
"Starkey, Richard",1940
"Harrison, George",1943



In [9]:
buf = io.StringIO()
# With "|" as the separator, the comma-containing names are written without quotes.
df.to_csv(buf, index=False, sep="|")
print(buf.getvalue())

name|birth
McCartney, Paul|1942
Lennon, John|1940
Starkey, Richard|1940
Harrison, George|1943



In [10]:
# Highly repetitive data (a single letter repeated down each column), used to
# compare plain and compressed CSV sizes.
n_rows = 1_000
df = pd.DataFrame(
    {
        column: [letter] * n_rows
        for column, letter in (("col1", "a"), ("col2", "b"), ("col3", "c"))
    }
).convert_dtypes(dtype_backend="numpy_nullable")
df.head()

Unnamed: 0,col1,col2,col3
0,a,b,c
1,a,b,c
2,a,b,c
3,a,b,c
4,a,b,c


In [11]:
buf = io.StringIO()
df.to_csv(buf, index=False)
# Size, in characters, of the uncompressed CSV text.
len(buf.getvalue())

6015

In [5]:
# Bytes buffer rather than StringIO: the gzip-compressed output is binary.
buf = io.BytesIO()
df.to_csv(buf, index=False, compression="gzip")
# Size, in bytes, of the compressed output, for comparison with the plain CSV.
len(buf.getvalue())

114

## CSV - strategies for reading large files



### How to do it



In [9]:
# Read only the first 1,000 rows as a sample instead of loading the whole file.
# NOTE(review): this cell reads from "../data/" while the other cells in this
# notebook read from "data/" -- confirm the intended working directory and make
# the paths agree.
df = pd.read_csv("../data/diamonds.csv", dtype_backend="numpy_nullable", nrows=1_000)

df.head()

c:\python\Pandas-Cookbook-Third-Edition\Chapter04


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [14]:
# Per-column dtypes, non-null counts, and total memory usage of the sample.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   Float64
 1   cut      1000 non-null   string 
 2   color    1000 non-null   string 
 3   clarity  1000 non-null   string 
 4   depth    1000 non-null   Float64
 5   table    1000 non-null   Float64
 6   price    1000 non-null   Int64  
 7   x        1000 non-null   Float64
 8   y        1000 non-null   Float64
 9   z        1000 non-null   Float64
dtypes: Float64(6), Int64(1), string(3)
memory usage: 85.1 KB


In [15]:
# Summary statistics (including min/max) for the price column of the sample.
df["price"].describe()

count       1000.0
mean       2476.54
std      839.57562
min          326.0
25%         2777.0
50%         2818.0
75%         2856.0
max         2898.0
Name: price, dtype: Float64

In [16]:
# Summary statistics (including min/max) for the carat column of the sample.
df["carat"].describe()

count      1000.0
mean      0.68928
std      0.195291
min           0.2
25%           0.7
50%          0.71
75%          0.79
max          1.27
Name: carat, dtype: Float64

In [17]:
# Declare narrower dtypes up front (32-bit floats, 16-bit price) so read_csv
# parses straight into them instead of the 64-bit defaults.
float32_columns = ["carat", "depth", "table", "x", "y", "z"]
string_columns = ["cut", "color", "clarity"]
df2 = pd.read_csv(
    "data/diamonds.csv",
    nrows=1_000,
    dtype={
        **{name: pd.Float32Dtype() for name in float32_columns},
        **{name: pd.StringDtype() for name in string_columns},
        "price": pd.Int16Dtype(),
    },
)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   Float32
 1   cut      1000 non-null   string 
 2   color    1000 non-null   string 
 3   clarity  1000 non-null   string 
 4   depth    1000 non-null   Float32
 5   table    1000 non-null   Float32
 6   price    1000 non-null   Int16  
 7   x        1000 non-null   Float32
 8   y        1000 non-null   Float32
 9   z        1000 non-null   Float32
dtypes: Float32(6), Int16(1), string(3)
memory usage: 55.8 KB


In [18]:
# Statistics of the numeric columns as read with the default (64-bit) dtypes.
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.68928,61.7228,57.7347,2476.54,5.60594,5.59918,3.45753
std,0.195291,1.758879,2.467946,839.57562,0.625173,0.611974,0.389819
min,0.2,53.0,52.0,326.0,3.79,3.75,2.27
25%,0.7,60.9,56.0,2777.0,5.64,5.63,3.45
50%,0.71,61.8,57.0,2818.0,5.77,5.76,3.55
75%,0.79,62.6,59.0,2856.0,5.92,5.91,3.64
max,1.27,69.5,70.0,2898.0,7.12,7.05,4.33


In [19]:
# Same statistics for the frame that was read with the narrower dtypes.
df2.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.68928,61.722801,57.734699,2476.54,5.60594,5.59918,3.45753
std,0.195291,1.758879,2.467946,839.57562,0.625173,0.611974,0.389819
min,0.2,53.0,52.0,326.0,3.79,3.75,2.27
25%,0.7,60.900002,56.0,2777.0,5.64,5.63,3.45
50%,0.71,61.799999,57.0,2818.0,5.77,5.76,3.55
75%,0.79,62.599998,59.0,2856.0,5.92,5.91,3.64
max,1.27,69.5,70.0,2898.0,7.12,7.05,4.33


In [20]:
# Distinct values present in the cut column.
df2["cut"].unique()

<StringArray>
['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Length: 5, dtype: string

In [21]:
# Distinct values present in the color column.
df2["color"].unique()

<StringArray>
['E', 'I', 'J', 'H', 'F', 'G', 'D']
Length: 7, dtype: string

In [22]:
# Distinct values present in the clarity column.
df2["clarity"].unique()

<StringArray>
['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF']
Length: 8, dtype: string

In [23]:
# Same narrow dtypes as before; the three text columns are read as strings
# first and converted to categoricals once the frame is loaded.
cat_cols = ["cut", "color", "clarity"]
compact_dtypes = dict.fromkeys(
    ["carat", "depth", "table", "x", "y", "z"], pd.Float32Dtype()
)
compact_dtypes.update(dict.fromkeys(cat_cols, pd.StringDtype()))
compact_dtypes["price"] = pd.Int16Dtype()

df3 = pd.read_csv("data/diamonds.csv", nrows=1_000, dtype=compact_dtypes)
df3[cat_cols] = df3[cat_cols].astype("category")
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   Float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   Float32 
 5   table    1000 non-null   Float32 
 6   price    1000 non-null   Int16   
 7   x        1000 non-null   Float32 
 8   y        1000 non-null   Float32 
 9   z        1000 non-null   Float32 
dtypes: Float32(6), Int16(1), category(3)
memory usage: 36.2 KB


In [24]:
# Only the columns named here are loaded; x, y and z are deliberately absent,
# so the same mapping doubles as the column selection.
dtypes = {
    "carat": pd.Float32Dtype(),
    "cut": pd.StringDtype(),
    "color": pd.StringDtype(),
    "clarity": pd.StringDtype(),
    "depth": pd.Float32Dtype(),
    "table": pd.Float32Dtype(),
    "price": pd.Int16Dtype(),
}
cat_cols = ["cut", "color", "clarity"]

df4 = pd.read_csv("data/diamonds.csv", nrows=1_000, dtype=dtypes, usecols=list(dtypes))
df4[cat_cols] = df4[cat_cols].astype("category")
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   Float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   Float32 
 5   table    1000 non-null   Float32 
 6   price    1000 non-null   Int16   
dtypes: Float32(3), Int16(1), category(3)
memory usage: 21.5 KB


In [25]:
# Columns to load and the compact dtype for each; x, y and z are left out.
dtypes = {
    "carat": pd.Float32Dtype(),
    "cut": pd.StringDtype(),
    "color": pd.StringDtype(),
    "clarity": pd.StringDtype(),
    "depth": pd.Float32Dtype(),
    "table": pd.Float32Dtype(),
    "price": pd.Int16Dtype(),
}
# Defined once: the list of categorical columns does not change between chunks.
cat_cols = ["cut", "color", "clarity"]

# With chunksize, read_csv yields DataFrames of at most 200 rows instead of
# returning one frame. Using the reader as a context manager closes the
# underlying file handle as soon as iteration finishes.
with pd.read_csv(
    "data/diamonds.csv",
    nrows=1_000,
    dtype=dtypes,
    usecols=dtypes.keys(),
    chunksize=200,
) as df_iter:
    # A dedicated loop variable keeps the chunks from overwriting `df`.
    for chunk in df_iter:
        chunk[cat_cols] = chunk[cat_cols].astype(pd.CategoricalDtype())
        print(f"processed chunk of shape {chunk.shape}")

processed chunk of shape (200, 7)
processed chunk of shape (200, 7)
processed chunk of shape (200, 7)
processed chunk of shape (200, 7)
processed chunk of shape (200, 7)


### There's more



In [26]:
def startswith_c(column_name: str) -> bool:
    """Return True when ``column_name`` begins with a lowercase "c"."""
    return column_name[:1] == "c"

# usecols is given a function here rather than a list of names; only the columns
# for which it returns True are loaded. No nrows limit, so the whole file is read.
pd.read_csv(
    "data/diamonds.csv",
    dtype_backend="numpy_nullable",
    usecols=startswith_c,
)

Unnamed: 0,carat,cut,color,clarity
0,0.23,Ideal,E,SI2
1,0.21,Premium,E,SI1
2,0.23,Good,E,VS1
3,0.29,Premium,I,VS2
4,0.31,Good,J,SI2
...,...,...,...,...
53935,0.72,Ideal,D,SI1
53936,0.72,Good,D,SI1
53937,0.7,Very Good,D,SI1
53938,0.86,Premium,H,SI2


## Microsoft Excel - basic reading / writing



### How to do it



In [27]:
# Demo frame for the Excel round trips below, using nullable extension dtypes.
member_rows = [
    ("Paul", "McCartney", 1942),
    ("John", "Lennon", 1940),
    ("Richard", "Starkey", 1940),
    ("George", "Harrison", 1943),
]
df = pd.DataFrame(member_rows, columns=["first", "last", "birth"])
df = df.convert_dtypes(dtype_backend="numpy_nullable")
df

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [28]:
import io
# Bytes buffer stands in for an .xlsx file on disk.
buf = io.BytesIO()
# Default settings: the row index is written along with the data.
df.to_excel(buf)

In [29]:
# Rewind before reading; the saved row index comes back as an unnamed data column.
buf.seek(0)
pd.read_excel(buf, dtype_backend="numpy_nullable")

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,McCartney,1942
1,1,John,Lennon,1940
2,2,Richard,Starkey,1940
3,3,George,Harrison,1943


In [30]:
buf.seek(0)
# index_col=0 restores the first sheet column as the row index.
pd.read_excel(buf, dtype_backend="numpy_nullable", index_col=0)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [31]:
buf = io.BytesIO()
# index=False keeps the row index out of the sheet, so no index_col is needed on read.
df.to_excel(buf, index=False)
buf.seek(0)
pd.read_excel(buf, dtype_backend="numpy_nullable")

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [32]:
buf.seek(0)
# Explicit per-column dtypes to apply while reading, instead of a dtype_backend.
dtypes = {
    "first": pd.StringDtype(),
    "last": pd.StringDtype(),
    "birth": pd.Int16Dtype(),
}
df = pd.read_excel(buf, dtype=dtypes)
df.dtypes

first    string[python]
last     string[python]
birth             Int16
dtype: object

## Microsoft Excel - finding tables in non-default locations



### How to do it



In [33]:
# The table is read from the "the_data" sheet, skipping the first four rows and
# taking only spreadsheet columns C through E.
pd.read_excel(
    "data/beatles.xlsx",
    dtype_backend="numpy_nullable",
    sheet_name="the_data",
    skiprows=4,
    usecols="C:E",
)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


### There's more…



In [34]:
# Same table, but the columns are selected by header label instead of by
# spreadsheet letter range.
pd.read_excel(
    "data/beatles.xlsx",
    dtype_backend="numpy_nullable",
    sheet_name="the_data",
    skiprows=4,
    usecols=["first", "last", "birth"],
)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


## Microsoft Excel - hierarchical data



### How to do it



In [35]:
# Two index columns and two header rows: both the rows and the columns of the
# resulting frame are labelled by two levels.
df = pd.read_excel(
    "data/hierarchical.xlsx",
    dtype_backend="numpy_nullable",
    index_col=[0, 1],
    header=[0, 1],
)
df

Unnamed: 0_level_0,Year,2024,2024,2025,2025
Unnamed: 0_level_1,Quarter,Q1,Q2,Q1,Q2
Region,Sub-Region,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
America,East,1,2,4,8
America,West,16,32,64,128
America,South,256,512,1024,4096
Europe,West,8192,16384,32768,65536
Europe,East,131072,262144,524288,1048576


In [36]:
# Keep the "East" sub-region rows and the "Q2" columns, whatever their first level.
idx = pd.IndexSlice
df.loc[idx[:, "East"], idx[:, "Q2"]]

Unnamed: 0_level_0,Year,2024,2025
Unnamed: 0_level_1,Quarter,Q2,Q2
Region,Sub-Region,Unnamed: 2_level_2,Unnamed: 3_level_2
America,East,2,8
Europe,East,262144,1048576


## SQL using SQLAlchemy



### How to do it



In [37]:
import sqlalchemy as sa
# SQLite database held in memory, so nothing is written to disk.
engine = sa.create_engine("sqlite:///:memory:")

In [38]:
df = pd.DataFrame([
    ["dog", 4],
    ["cat", 4],
], columns=["animal", "num_legs"])
df = df.convert_dtypes(dtype_backend="numpy_nullable")

# index=False keeps the row index out of the table; only the two data columns are written.
df.to_sql("table_name", engine, index=False)

2

In [39]:
# Passing just the table name reads the whole table.
pd.read_sql("table_name", engine, dtype_backend="numpy_nullable")

Unnamed: 0,animal,num_legs
0,dog,4
1,cat,4


In [40]:
# A SQL query can be passed in place of a table name; here the database does the summing.
pd.read_sql(
    "SELECT SUM(num_legs) AS total_legs FROM table_name",
    engine,
    dtype_backend="numpy_nullable"
)

Unnamed: 0,total_legs
0,8


In [41]:
df = pd.DataFrame([
    ["dog", 4],
    ["cat", 4],
    ["human", 2],
], columns=["animal", "num_legs"])
df = df.convert_dtypes(dtype_backend="numpy_nullable")
# The table already exists; if_exists="replace" swaps its contents for this frame.
df.to_sql("table_name", engine, index=False, if_exists="replace")

3

In [42]:
new_data = pd.DataFrame([["centipede", 100]], columns=["animal", "num_legs"])
# if_exists="append" adds the new row to the rows already in the table.
new_data.to_sql("table_name", engine, index=False, if_exists="append")
pd.read_sql("table_name", engine, dtype_backend="numpy_nullable")

Unnamed: 0,animal,num_legs
0,dog,4
1,cat,4
2,human,2
3,centipede,100


## SQL using ADBC



### How to do it



In [43]:
from adbc_driver_sqlite import dbapi
# Small sample frame to write and read back through the ADBC SQLite driver.
df = pd.DataFrame([
    ["dog", 4],
    ["cat", 4],
    ["human", 2],
], columns=["animal", "num_legs"])
df = df.convert_dtypes(dtype_backend="numpy_nullable")
df

Unnamed: 0,animal,num_legs
0,dog,4
1,cat,4
2,human,2


In [44]:
# The ADBC connection is handed to to_sql / read_sql where the SQLAlchemy
# examples passed an engine; the with block scopes the connection to this cell.
with dbapi.connect("file::memory:") as conn:
    df.to_sql("table_name", conn, index=False, if_exists="replace")
    df = pd.read_sql(
        "SELECT * FROM table_name",
        conn,
        dtype_backend="numpy_nullable",
    )
df

Unnamed: 0,animal,num_legs
0,dog,4
1,cat,4
2,human,2


In [45]:
import timeit

import sqlalchemy as sa

# Fixed seed, so every run times the same 10,000 x 10 frame of random floats.
np.random.seed(42)
column_names = list("abcdefghij")
df = pd.DataFrame(np.random.randn(10_000, 10), columns=column_names)


def func():
    """Write ``df`` to ``test_table`` over the open connection, replacing the table."""
    return df.to_sql("test_table", conn, if_exists="replace")


# Total wall-clock seconds for 100 writes through SQLAlchemy.
with sa.create_engine("sqlite:///:memory:").connect() as conn:
    print(timeit.timeit(func, number=100))

5.650135560994386


In [46]:
from adbc_driver_sqlite import dbapi


def func():
    """Write ``df`` to ``test_table`` over the open connection, replacing the table."""
    return df.to_sql("test_table", conn, if_exists="replace")


# Same 100-write timing as the SQLAlchemy cell, this time through the ADBC driver.
with dbapi.connect("file::memory:") as conn:
    print(timeit.timeit(func, number=100))

0.9668128940029419


### There's more…



## Apache Parquet



### How to do it



In [47]:
import io

# Bytes buffer that will receive the Parquet output in the cells that follow.
buf = io.BytesIO()

# Demo frame with nullable extension dtypes.
df = (
    pd.DataFrame(
        [
            ("Paul", "McCartney", 1942),
            ("John", "Lennon", 1940),
            ("Richard", "Starkey", 1940),
            ("George", "Harrison", 1943),
        ],
        columns=["first", "last", "birth"],
    )
    .convert_dtypes(dtype_backend="numpy_nullable")
)
df

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [48]:
# index=False: the row index is not stored alongside the data.
df.to_parquet(buf, index=False)

In [49]:
# Rewind, then read the Parquet bytes back into a DataFrame.
buf.seek(0)
pd.read_parquet(buf)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [50]:
# Narrow birth to a nullable unsigned 16-bit integer and confirm the new dtype.
df["birth"] = df["birth"].astype(pd.UInt16Dtype())
df.dtypes

first    string[python]
last     string[python]
birth            UInt16
dtype: object

In [51]:
buf = io.BytesIO()
df.to_parquet(buf, index=False)
buf.seek(0)
# No dtypes are specified on read; inspect what the round trip gives back.
pd.read_parquet(buf).dtypes

first    string[python]
last     string[python]
birth            UInt16
dtype: object

In [52]:
# Built without convert_dtypes, so this frame is written with pandas' default dtypes.
suboptimal_df = pd.DataFrame([
    [0, "foo"],
    [1, "bar"],
    [2, "baz"],
], columns=["int_col", "str_col"])
buf = io.BytesIO()
suboptimal_df.to_parquet(buf, index=False)
buf.seek(0)
# dtype_backend on read asks for nullable extension dtypes in the result.
pd.read_parquet(buf, dtype_backend="numpy_nullable").dtypes

int_col             Int64
str_col    string[python]
dtype: object

In [53]:
# Read one individual file from underneath the partitions directory.
pd.read_parquet(
    "data/partitions/2022/q1_sales.parquet",
)

Unnamed: 0,year,quarter,region,sales
0,2022,Q1,America,1
1,2022,Q1,Europe,2


In [54]:
# Given the directory instead of a file, the files beneath it are combined into one frame.
pd.read_parquet("data/partitions/")

Unnamed: 0,year,quarter,region,sales
0,2022,Q1,America,1
1,2022,Q1,Europe,2
2,2022,Q2,America,4
3,2022,Q2,Europe,8
4,2023,Q1,America,16
5,2023,Q1,Europe,32
6,2023,Q2,America,64
7,2023,Q2,Europe,128


In [55]:
# filters keeps only the rows whose region equals "Europe".
pd.read_parquet(
    "data/partitions/",
    filters=[("region", "==", "Europe")],
)

Unnamed: 0,year,quarter,region,sales
0,2022,Q1,Europe,2
1,2022,Q2,Europe,8
2,2023,Q1,Europe,32
3,2023,Q2,Europe,128


## JSON



In [56]:
import json

# Column-oriented data: each key maps to the list of values for that column.
beatles = dict(
    first=["Paul", "John", "Richard", "George"],
    last=["McCartney", "Lennon", "Starkey", "Harrison"],
    birth=[1942, 1940, 1940, 1943],
)

# dumps renders the Python object as a JSON string; loads parses it back.
serialized = json.dumps(beatles)
deserialized = json.loads(serialized)

print(f"serialized values are: {serialized}")
print(f"deserialized values are: {deserialized}")

serialized values are: {"first": ["Paul", "John", "Richard", "George"], "last": ["McCartney", "Lennon", "Starkey", "Harrison"], "birth": [1942, 1940, 1940, 1943]}
deserialized values are: {'first': ['Paul', 'John', 'Richard', 'George'], 'last': ['McCartney', 'Lennon', 'Starkey', 'Harrison'], 'birth': [1942, 1940, 1940, 1943]}


### How to do it



In [57]:
import io
# read_json is given a file-like object here, so the JSON string is wrapped in StringIO.
data = io.StringIO(serialized)
pd.read_json(data, dtype_backend="numpy_nullable")

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [58]:
df = pd.DataFrame(beatles)
# With default settings, to_json nests the values as column -> row label -> value.
print(df.to_json())

{"first":{"0":"Paul","1":"John","2":"Richard","3":"George"},"last":{"0":"McCartney","1":"Lennon","2":"Starkey","3":"Harrison"},"birth":{"0":1942,"1":1940,"2":1940,"3":1943}}


In [59]:
# String row labels replace the default integer index for the orient examples.
df = pd.DataFrame(beatles, index=["row 0", "row 1", "row 2", "row 3"])
df = df.convert_dtypes(dtype_backend="numpy_nullable")
df

Unnamed: 0,first,last,birth
row 0,Paul,McCartney,1942
row 1,John,Lennon,1940
row 2,Richard,Starkey,1940
row 3,George,Harrison,1943


In [60]:
# orient="columns": each column label maps to a {row label: value} object.
serialized = df.to_json(orient="columns")
print(f'Length of orient="columns": {len(serialized)}')
# Preview only the first 100 characters of the JSON text.
serialized[:100]

Length of orient="columns": 221


'{"first":{"row 0":"Paul","row 1":"John","row 2":"Richard","row 3":"George"},"last":{"row 0":"McCartn'

In [61]:
# Read back with the same orient that was used for writing.
pd.read_json(
    io.StringIO(serialized),
    orient="columns",
    dtype_backend="numpy_nullable"
)

Unnamed: 0,first,last,birth
row 0,Paul,McCartney,1942
row 1,John,Lennon,1940
row 2,Richard,Starkey,1940
row 3,George,Harrison,1943


In [62]:
# orient="records": a list holding one {column: value} object per row; the row
# labels are not part of the output.
serialized = df.to_json(orient="records")
print(f'Length of orient="records": {len(serialized)}')
serialized[:100]

Length of orient="records": 196


'[{"first":"Paul","last":"McCartney","birth":1942},{"first":"John","last":"Lennon","birth":1940},{"fi'

In [63]:
# The JSON text was written with orient="records", so it is read back with the
# same orient. Row labels are not stored in this layout, so the result has a
# default integer index.
pd.read_json(
    io.StringIO(serialized),
    orient="records",
    dtype_backend="numpy_nullable"
)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [64]:
# orient="split": column labels, row labels and values are stored under separate
# "columns", "index" and "data" keys.
serialized = df.to_json(orient="split")
print(f'Length of orient="split": {len(serialized)}')
serialized[:100]

Length of orient="split": 190


'{"columns":["first","last","birth"],"index":["row 0","row 1","row 2","row 3"],"data":[["Paul","McCar'

In [65]:
# Read back with the same orient that was used for writing.
pd.read_json(
    io.StringIO(serialized),
    orient="split",
    dtype_backend="numpy_nullable",
)

Unnamed: 0,first,last,birth
row 0,Paul,McCartney,1942
row 1,John,Lennon,1940
row 2,Richard,Starkey,1940
row 3,George,Harrison,1943


In [66]:
# orient="index": each row label maps to a {column: value} object.
serialized = df.to_json(orient="index")
print(f'Length of orient="index": {len(serialized)}')
serialized[:100]

Length of orient="index": 228


'{"row 0":{"first":"Paul","last":"McCartney","birth":1942},"row 1":{"first":"John","last":"Lennon","b'

In [67]:
# Read back with the same orient that was used for writing.
pd.read_json(
    io.StringIO(serialized),
    orient="index",
    dtype_backend="numpy_nullable",
)

Unnamed: 0,first,last,birth
row 0,Paul,McCartney,1942
row 1,John,Lennon,1940
row 2,Richard,Starkey,1940
row 3,George,Harrison,1943


In [68]:
# orient="values": only the cell values, as one list per row; neither row nor
# column labels are written.
serialized = df.to_json(orient="values")
print(f'Length of orient="values": {len(serialized)}')
serialized[:100]

Length of orient="values": 104


'[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",19'

In [69]:
# No labels were stored, so the frame comes back with integer row and column labels.
pd.read_json(
    io.StringIO(serialized),
    orient="values",
    dtype_backend="numpy_nullable",
)

Unnamed: 0,0,1,2
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [70]:
# orient="table": the output starts with a "schema" section describing each field.
serialized = df.to_json(orient="table")
print(f'Length of orient="table": {len(serialized)}')
serialized[:100]

Length of orient="table": 524


'{"schema":{"fields":[{"name":"index","type":"string"},{"name":"first","type":"any","extDtype":"strin'

In [71]:
# Use a non-default integer width, then round-trip through orient="table" without
# passing any dtype information to read_json, and inspect the dtypes that come back.
df["birth"] = df["birth"].astype(pd.UInt16Dtype())
serialized = df.to_json(orient="table")
pd.read_json(
    io.StringIO(serialized),
    orient="table",
).dtypes

first    string[python]
last     string[python]
birth            UInt16
dtype: object

### There's more



In [72]:
# Nested, API-style payload: a list of records, each with a nested
# "characteristics" object, next to top-level metadata.
animal_traits = [
    ("human", 2, 2),
    ("dog", 4, 2),
    ("horseshoe crab", 10, 10),
]
data = {
    "records": [
        {
            "name": name,
            "characteristics": {"num_leg": num_leg, "num_eyes": num_eyes},
        }
        for name, num_leg, num_eyes in animal_traits
    ],
    "type": "animal",
    "pagination": {"next": "23978sdlkusdf97234u2io", "has_more": 1},
}

In [73]:
# record_path picks the list stored under "records"; the nested "characteristics"
# objects are flattened into dotted column names.
pd.json_normalize(
    data,
    record_path="records"
).convert_dtypes(dtype_backend="numpy_nullable")

Unnamed: 0,name,characteristics.num_leg,characteristics.num_eyes
0,human,2,2
1,dog,4,2
2,horseshoe crab,10,10


In [74]:
# meta="type" carries the top-level "type" value along as a column on every row.
pd.json_normalize(
    data,
    record_path="records",
    meta="type"
).convert_dtypes(dtype_backend="numpy_nullable")

Unnamed: 0,name,characteristics.num_leg,characteristics.num_eyes,type
0,human,2,2,animal
1,dog,4,2,animal
2,horseshoe crab,10,10,animal


## HTML



### How to do it



In [75]:
url = "https://en.wikipedia.org/wiki/The_Beatles_discography"
# Fetches the page over the network; the result is a list of DataFrames, and
# len(dfs) shows how many tables were parsed.
dfs = pd.read_html(url, dtype_backend="numpy_nullable")
len(dfs)

46

In [76]:
# First table in the returned list.
dfs[0]

Unnamed: 0,The Beatles albums discography,The Beatles albums discography.1
0,"The Beatles members Ringo Starr, Paul McCartne...","The Beatles members Ringo Starr, Paul McCartne..."
1,Studio albums,"12 (UK), 17 (US)"
2,EPs,36
3,Live albums,5
4,Compilation albums,51
5,Mash-ups,2
6,Box sets,17


In [77]:
url = "https://en.wikipedia.org/wiki/The_Beatles_discography"
# match narrows the result to tables containing text that matches the pattern.
dfs = pd.read_html(
    url,
    match=r"List of studio albums",
    dtype_backend="numpy_nullable",
)
print(f"Number of tables returned was: {len(dfs)}")
dfs[0].head()

Number of tables returned was: 2


Unnamed: 0_level_0,Title,Album details[A],Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Certifications,Sales
Unnamed: 0_level_1,Title,Album details[A],UK [8][9],AUS [10],CAN [11],FRA [12],GER [13],NOR [14],US [15][16],Certifications,Sales
0,Please Please Me,Released: 22 March 1963 Label: Parlophone,1,—,—,5,5,—,155,BPI: Platinum[17] ARIA: Gold[18] MC: Gold[19] ...,
1,With the Beatles[B],Released: 22 November 1963 Label: Parlophone (...,1,—,—,5,1,—,179,BPI: Gold[17] ARIA: Gold[18] BVMI: Gold[21] MC...,
2,A Hard Day's Night,Released: 10 July 1964 Label: Parlophone,1,1,—,—,1,—,—,BPI: Platinum[17] ARIA: Gold[18],
3,Beatles for Sale,Released: 4 December 1964 Label: Parlophone,1,1,—,—,1,—,—,BPI: Gold[17] ARIA: Gold[18] MC: Gold[19] RIAA...,"UK: 750,000[22]"
4,Help!,Released: 6 August 1965 Label: Parlophone,1,1,—,5,1,—,—,BPI: Platinum[17] ARIA: Gold[18],


In [78]:
url = "https://en.wikipedia.org/wiki/The_Beatles_discography"
# header=1 takes the column labels from the second header row only, so the
# columns are single-level instead of the two-level header seen before.
dfs = pd.read_html(
    url,
    match="List of studio albums",
    header=1,
    dtype_backend="numpy_nullable",
)
dfs[0].head()

Unnamed: 0,Title,Album details[A],UK [8][9],AUS [10],CAN [11],FRA [12],GER [13],NOR [14],US [15][16],Certifications,Sales
0,Please Please Me,Released: 22 March 1963 Label: Parlophone,1,—,—,5,5,—,155,BPI: Platinum[17] ARIA: Gold[18] MC: Gold[19] ...,
1,With the Beatles[B],Released: 22 November 1963 Label: Parlophone (...,1,—,—,5,1,—,179,BPI: Gold[17] ARIA: Gold[18] BVMI: Gold[21] MC...,
2,A Hard Day's Night,Released: 10 July 1964 Label: Parlophone,1,1,—,—,1,—,—,BPI: Platinum[17] ARIA: Gold[18],
3,Beatles for Sale,Released: 4 December 1964 Label: Parlophone,1,1,—,—,1,—,—,BPI: Gold[17] ARIA: Gold[18] MC: Gold[19] RIAA...,"UK: 750,000[22]"
4,Help!,Released: 6 August 1965 Label: Parlophone,1,1,—,5,1,—,—,BPI: Platinum[17] ARIA: Gold[18],


In [79]:
url = "https://en.wikipedia.org/wiki/The_Beatles_discography"
# The page uses an em dash as a "no value" placeholder; na_values makes those
# cells read as missing data.
dfs = pd.read_html(
    url,
    match="List of studio albums",
    header=1,
    na_values=["—"],
    dtype_backend="numpy_nullable",
)
dfs[0].head()

Unnamed: 0,Title,Album details[A],UK [8][9],AUS [10],CAN [11],FRA [12],GER [13],NOR [14],US [15][16],Certifications,Sales
0,Please Please Me,Released: 22 March 1963 Label: Parlophone,1,,,5.0,5,,155.0,BPI: Platinum[17] ARIA: Gold[18] MC: Gold[19] ...,
1,With the Beatles[B],Released: 22 November 1963 Label: Parlophone (...,1,,,5.0,1,,179.0,BPI: Gold[17] ARIA: Gold[18] BVMI: Gold[21] MC...,
2,A Hard Day's Night,Released: 10 July 1964 Label: Parlophone,1,1.0,,,1,,,BPI: Platinum[17] ARIA: Gold[18],
3,Beatles for Sale,Released: 4 December 1964 Label: Parlophone,1,1.0,,,1,,,BPI: Gold[17] ARIA: Gold[18] MC: Gold[19] RIAA...,"UK: 750,000[22]"
4,Help!,Released: 6 August 1965 Label: Parlophone,1,1.0,,5.0,1,,,BPI: Platinum[17] ARIA: Gold[18],


## Pickle



### How to do it



In [80]:
from collections import namedtuple

# Lightweight record type; a Series holding these has object dtype.
Member = namedtuple("Member", "first last birth")

ser = pd.Series([
    Member(*fields)
    for fields in (
        ("Paul", "McCartney", 1942),
        ("John", "Lennon", 1940),
        ("Richard", "Starkey", 1940),
        ("George", "Harrison", 1943),
    )
])
ser

0     (Paul, McCartney, 1942)
1        (John, Lennon, 1940)
2    (Richard, Starkey, 1940)
3    (George, Harrison, 1943)
dtype: object

In [81]:
import io
# Pickle output is binary, so it is written to a bytes buffer.
buf = io.BytesIO()
ser.to_pickle(buf)

In [82]:
buf.seek(0)
# NOTE: unpickling can execute arbitrary code, so only load pickles from a
# trusted source. Here the bytes were produced by ser.to_pickle in this notebook.
ser = pd.read_pickle(buf)
ser

0     (Paul, McCartney, 1942)
1        (John, Lennon, 1940)
2    (Richard, Starkey, 1940)
3    (George, Harrison, 1943)
dtype: object

In [83]:
# The element comes back as a Member namedtuple, not as a plain tuple.
ser.iloc[0]

Member(first='Paul', last='McCartney', birth=1942)

## Third party I/O libraries

