In [1]:
# Reference: pandas user guide, "Sparse data structures" - https://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html

In [5]:
import pandas as pd
import numpy as np

In [None]:
def compress_sparse(series: pd.Series, threshold: float = 0.5) -> pd.Series:
    """Convert a mostly-NaN or mostly-zero series to a sparse dtype.

    The share of missing values is checked first: if it exceeds ``threshold``
    the series is made sparse with ``NaN`` as the fill value. Otherwise, if
    the share of zeros exceeds ``threshold``, ``0`` is used as the fill value.

    Parameters
    ----------
    series : pd.Series
        Series to inspect.
    threshold : float, default 0.5
        Fraction of entries that must be strictly exceeded by the NaN share
        (or the zero share) for the conversion to happen.

    Returns
    -------
    pd.Series
        ``series`` cast to ``pd.SparseDtype(series.dtype, fill_value)``, or
        the original object unchanged when the series is empty or neither
        share exceeds ``threshold``.
    """
    number = len(series)
    if number == 0:
        # Nothing to compress, and the ratios below would divide by zero.
        return series

    # count() ignores missing values, so the difference is the NaN count.
    if (number - series.count()) / number > threshold:
        fill_value = np.nan
    elif (series == 0).sum() / number > threshold:
        fill_value = 0
    else:
        return series

    return series.astype(pd.SparseDtype(series.dtype, fill_value))

In [75]:
# Demo frame: 10,000 x 4 standard-normal floats, seeded so re-runs reproduce.
# np.float64 replaces the `np.float` alias, which was removed in NumPy 1.24.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.standard_normal((10000, 4)), dtype=np.float64)

# Zero out the first half of the rows.
# NOTE: exactly 50% zeros does not exceed compress_sparse's default
# threshold of 0.5, because that comparison is a strict `>`.
df.iloc[:5000] = 0

df

Unnamed: 0,0,1,2,3
0,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...
9995,0.205581,-0.996742,-0.696966,0.248243
9996,1.054894,0.478195,-0.985616,-1.480539
9997,-1.231538,-0.378591,-0.491060,-0.180245
9998,-0.355647,-0.140114,-0.170676,-0.988670


In [76]:
# Compare the memory footprint of column 0 before and after compress_sparse.
series = df[0]
dense_bytes = series.memory_usage(deep=True)

new_series = compress_sparse(series)
compressed_bytes = new_series.memory_usage(deep=True)

print(dense_bytes)
print(new_series)
print(compressed_bytes)

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
9995    0.205581
9996    1.054894
9997   -1.231538
9998   -0.355647
9999    0.620300
Name: 0, Length: 10000, dtype: float64
80128
0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
9995    0.205581
9996    1.054894
9997   -1.231538
9998   -0.355647
9999    0.620300
Name: 0, Length: 10000, dtype: Sparse[float64, 0]
12128


In [63]:
# Memory usage in bytes of df's index and of each column.
df.memory_usage(deep=True)

Index      128
0        80000
1        80000
2        80000
3        80000
dtype: int64

In [64]:
# Number of non-missing entries in each column.
df.notna().sum()

0    9000
1    9000
2    9000
3    9000
dtype: int64

In [65]:
# Total number of rows in df.
df.shape[0]

10000

In [68]:
# Number of entries equal to zero in each column.
df.eq(0).sum()

0    1000
1    1000
2    1000
3    1000
dtype: int64

In [47]:
# Cast every column of df to a sparse integer dtype whose fill value is 0.
# NOTE(review): df is built from floats, so the "int" subtype discards the
# fractional part of each value - confirm that is intended for this demo.
sparse_int_dtype = pd.SparseDtype("int", 0)
sdf = df.astype(sparse_int_dtype)

sdf.head(5)

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [42]:
# Show the dtype of each column of sdf after the sparse cast.
sdf.dtypes

0    Sparse[int32, 0]
1    Sparse[int32, 0]
2    Sparse[int32, 0]
3    Sparse[int32, 0]
dtype: object

In [48]:
# Ratio of explicitly stored (non-fill) points to the total number of points.
sdf.sparse.density

0.2883

In [49]:
# Memory usage in bytes of sdf's index and of each sparse column.
sdf.memory_usage(deep=True)

Index      128
0        23320
1        23304
2        22976
3        22656
dtype: int64