# Pandas: scaling to large datasets

In [1]:
import random
import string
import numpy as np
import pandas as pd
from datetime import datetime
import pathlib
%load_ext memory_profiler

Create a large dataset

In [2]:
%%time
def gen_random_string(length: int = 32) -> str:
    """Return a random string made of ``length`` characters.

    Characters are drawn with replacement from the uppercase ASCII letters
    and the decimal digits, using the module-level ``random`` generator.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)
    
def make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
    """Build a synthetic, time-indexed DataFrame with mixed column types.

    Parameters
    ----------
    start, end : str or datetime-like
        Bounds passed to ``pd.date_range``. When ``end`` falls exactly on the
        generated index, that last row is dropped, so ``end`` is exclusive.
    freq : str
        Frequency string passed to ``pd.date_range``.
    seed : int or None
        Seed applied to both the NumPy global generator (numeric and category
        columns) and the ``random`` module (string columns). ``None`` reseeds
        both from fresh entropy.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``timestamp`` with columns ``a``, ``b`` (floats in [0, 1)),
        ``c`` (integers in [1, 99]), ``cat`` (one of ``cat1``..``cat5``) and
        ``str1``, ``str2`` (random strings), in sorted column order.
    """
    index = pd.date_range(start=start, end=end, freq=freq, name="timestamp")
    n = len(index)
    # Call the seeding functions. Assigning to `np.random.seed` would replace
    # the function itself and leave the generator unseeded.
    np.random.seed(seed)
    random.seed(seed)  # gen_random_string draws from the `random` module
    columns = {
        'cat': np.random.choice(['cat1','cat2','cat3','cat4','cat5'],n),
        'str1':[gen_random_string() for _ in range(n)],
        'str2':[gen_random_string() for _ in range(n)],
        'a': np.random.rand(n),
        'b': np.random.rand(n),
        'c': np.random.randint(1,100,n),
    }

    df = pd.DataFrame(columns, index=index, columns=sorted(columns))
    # Normalise `end` to a Timestamp: comparing a Timestamp with a plain string
    # is always False, which would skip the trim for string inputs.
    # The `n` guard avoids indexing into an empty index.
    if n and df.index[-1] == pd.Timestamp(end):
        df = df.iloc[:-1]
    return df

# Build five one-minute series covering 2020-2023 and place them side by side.
# Each series gets its own seed (10, 11, ...): passing the same seed to every
# call would produce identical data whenever the generator honours the seed.
timeseries = [
    make_timeseries(
        start=datetime(2020, 1, 1), end=datetime(2023, 12, 31), freq="1min", seed=10 + i
    ).add_suffix(f"_{i}")  # columns become a_0, b_0, ..., str2_4
    for i in range(5)
]
df = pd.concat(timeseries, axis=1)

CPU times: user 36 s, sys: 771 ms, total: 36.8 s
Wall time: 36.9 s


Print the first rows to see what the data looks like.

In [3]:
df.head()

Unnamed: 0_level_0,a_0,b_0,c_0,cat_0,str1_0,str2_0,a_1,b_1,c_1,cat_1,...,c_3,cat_3,str1_3,str2_3,a_4,b_4,c_4,cat_4,str1_4,str2_4
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01 00:00:00,0.854266,0.671644,44,cat1,6R1E999P0TL1SKCWNPXYUY5AITHFAXFZ,FHQZ2ZFD08BSR8G1P2NJWFP35JWDY8FJ,0.157047,0.929876,39,cat3,...,2,cat4,Q1T5OQB4KH234BAROWOSK9S6W47P6FL9,ORX2EACYKDOSMAYJB4GVMVMLRWPM4P78,0.709936,0.10304,54,cat4,DAUFFHO6XWU43XADFKZ59OYFO21J2WQD,DGT6A3NDB3DFLQOU4VAZ84ULVDNSCUBA
2020-01-01 00:01:00,0.344747,0.662463,3,cat4,72X3AHYFNXLX94EKYUV0MJX8RAAGJFDS,9A81L97QOZVCINW6AZBM2XA2X20Y1XG3,0.257334,0.157614,47,cat4,...,44,cat5,S4T95VXIVK8R3LWSBIGSAMY3ZYCWDL2N,LBSCWH7QRTGOWRK8JRIZ703Y6O2BLWKH,0.684248,0.405418,7,cat5,G81HNSPSOTYKOLNJF18B1TVR5SSL3ZMB,9SW42YS8MI7A64Q81XPYDA73GQ81T8ZN
2020-01-01 00:02:00,0.224605,0.389421,47,cat4,8MAS68NWXQJ9GWUJVN10I5BG45IBUCLU,S4Z2WBA8D9WD90KAIKQ020DEJZSUIJTE,0.114079,0.707929,15,cat2,...,95,cat4,S6VCF0K7J5KPB8D4UE4ZVYKVTKPHGJ51,W0F7BGV63ZBLJ0YHKM4B8QR90VOLRXW7,0.412482,0.760345,91,cat1,PD6KAX83I53Z9AU52K3FMY44HVPRDRGL,RUFF9P11YMY4GI0GVGYI9AIJC3CKAQDL
2020-01-01 00:03:00,0.572477,0.333768,34,cat2,8MD8U5ZR0SVA5P3SZ942QMFEPBOFM38A,JUYUCIXUUQ6TZCK90L42IJERQQ06SP5T,0.884055,0.848133,52,cat4,...,90,cat2,44480PC2Q6JX0O3ZLMA26F0A672ZSR21,M388UMRWI0N4RCLSD5RQHODGOA9W9U7N,0.081455,0.074969,87,cat1,P2XCEOAGV32B4D2CD2TJNHX5GWXCXFXY,GCVRLJOGU7TOAQSKCRIZG9ESC20KRJ5J
2020-01-01 00:04:00,0.343264,0.209907,18,cat1,A0BXOIK5D2TPLS72KUZGJGOD9YJZPWEB,GGW2WO1YW6JAZ5FFFWZHESX4HDHSP0N8,0.884601,0.136262,55,cat3,...,68,cat5,MYL3UKH2UD44O36SI1IG52DWIZ3U4H73,2RTZLWF2G5NHXKHDZSD2CZ2KB8ZQWH83,0.065517,0.118362,91,cat4,5CUFP6T50BYUAS92O7KX7O2XNXULAZES,ZV2XS7YEFJR5QFVI25H87H4WH4D1MUYJ


The method `info(memory_usage='deep')` returns the column types and also gives the memory usage of the dataframe.

In [4]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2102400 entries, 2020-01-01 00:00:00 to 2023-12-30 23:59:00
Freq: min
Data columns (total 30 columns):
 #   Column  Dtype  
---  ------  -----  
 0   a_0     float64
 1   b_0     float64
 2   c_0     int64  
 3   cat_0   object 
 4   str1_0  object 
 5   str2_0  object 
 6   a_1     float64
 7   b_1     float64
 8   c_1     int64  
 9   cat_1   object 
 10  str1_1  object 
 11  str2_1  object 
 12  a_2     float64
 13  b_2     float64
 14  c_2     int64  
 15  cat_2   object 
 16  str1_2  object 
 17  str2_2  object 
 18  a_3     float64
 19  b_3     float64
 20  c_3     int64  
 21  cat_3   object 
 22  str1_3  object 
 23  str2_3  object 
 24  a_4     float64
 25  b_4     float64
 26  c_4     int64  
 27  cat_4   object 
 28  str1_4  object 
 29  str2_4  object 
dtypes: float64(10), int64(5), object(15)
memory usage: 2.4 GB


Write the dataframe to a Parquet file.

In [5]:
# Make sure a local "data" directory exists (no error if it is already there).
pathlib.Path("data").mkdir(parents=True,exist_ok=True)
# NOTE(review): the Parquet file is written to the current working directory,
# not into "data/"; the later read_parquet calls use this same relative path.
df.to_parquet("timeseries.parquet")

## Load only useful data

Imagine you're only interested in a subset of the dataset's columns `['a_0','a_1','cat_0','str1_0','str1_1']`. Then there are two ways to proceed: 
 * either load the entire dataset and then filter out the columns you're interested in
 * or read only the columns you're interested in

Compare the two loading methods.

Look at the `read_parquet` method.

In [6]:
?pd.read_parquet

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mread_parquet[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m:[0m [0;34m'FilePath | ReadBuffer[bytes]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mengine[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m:[0m [0;34m'list[str] | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstorage_options[0m[0;34m:[0m [0;34m'StorageOptions | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_nullable_dtypes[0m[0;34m:[0m [0;34m'bool | lib.NoDefault'[0m [0;34m=[0m [0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype_backend[0m[0;34m:[0m [0;34m'DtypeBackend | lib.NoDefault'[0m [0;34m=[0m [0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilesystem[0m[0;34m:[0m [0;34m'Any'[0m [0;34m=[0m [0;32mNone[0m[

In [7]:
# Subset of columns used to compare the two loading strategies below.
columns = ['a_0','a_1','cat_0','str1_0','str1_1']

**Option 1**: Load the entire dataset and then filter out the columns you're interested in

In [8]:
# TODO

In [9]:
%memit df_filter = pd.read_parquet("timeseries.parquet")[columns]

peak memory: 7442.13 MiB, increment: 3820.29 MiB


In [10]:
%%time
# Option 1: load every column from the Parquet file, then keep only `columns`.
df_filter = pd.read_parquet("timeseries.parquet")[columns]
df_filter.head()

CPU times: user 5.33 s, sys: 3.23 s, total: 8.56 s
Wall time: 5.17 s


Unnamed: 0_level_0,a_0,a_1,cat_0,str1_0,str1_1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01 00:00:00,0.854266,0.157047,cat1,6R1E999P0TL1SKCWNPXYUY5AITHFAXFZ,OZZKGDPVJ9EIGA818JFM9IRTOUSW863M
2020-01-01 00:01:00,0.344747,0.257334,cat4,72X3AHYFNXLX94EKYUV0MJX8RAAGJFDS,3NQ8PAHSRG8TQFJVYZHLY3KKQ5LXWO3P
2020-01-01 00:02:00,0.224605,0.114079,cat4,8MAS68NWXQJ9GWUJVN10I5BG45IBUCLU,PFARPVHAU0XL3UZYMAR2H00OBEMVMNEO
2020-01-01 00:03:00,0.572477,0.884055,cat2,8MD8U5ZR0SVA5P3SZ942QMFEPBOFM38A,9KDPQ6AZACNLCUQMTQUBJLRI5AK26GKY
2020-01-01 00:04:00,0.343264,0.884601,cat1,A0BXOIK5D2TPLS72KUZGJGOD9YJZPWEB,TB4M4CPKJPO35DTS0MQ525RJCK2KZTKS


**Option 2**: Read only the columns you're interested in. 

In [11]:
# TODO

In [12]:
%memit df_filter = pd.read_parquet("timeseries.parquet",columns=columns)

peak memory: 5967.68 MiB, increment: 754.82 MiB


In [13]:
%%time
# Option 2: ask the reader for just `columns` instead of filtering afterwards.
df_filter = pd.read_parquet("timeseries.parquet",columns=columns)
df_filter.head()

CPU times: user 789 ms, sys: 529 ms, total: 1.32 s
Wall time: 917 ms


Unnamed: 0_level_0,a_0,a_1,cat_0,str1_0,str1_1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01 00:00:00,0.854266,0.157047,cat1,6R1E999P0TL1SKCWNPXYUY5AITHFAXFZ,OZZKGDPVJ9EIGA818JFM9IRTOUSW863M
2020-01-01 00:01:00,0.344747,0.257334,cat4,72X3AHYFNXLX94EKYUV0MJX8RAAGJFDS,3NQ8PAHSRG8TQFJVYZHLY3KKQ5LXWO3P
2020-01-01 00:02:00,0.224605,0.114079,cat4,8MAS68NWXQJ9GWUJVN10I5BG45IBUCLU,PFARPVHAU0XL3UZYMAR2H00OBEMVMNEO
2020-01-01 00:03:00,0.572477,0.884055,cat2,8MD8U5ZR0SVA5P3SZ942QMFEPBOFM38A,9KDPQ6AZACNLCUQMTQUBJLRI5AK26GKY
2020-01-01 00:04:00,0.343264,0.884601,cat1,A0BXOIK5D2TPLS72KUZGJGOD9YJZPWEB,TB4M4CPKJPO35DTS0MQ525RJCK2KZTKS


You can use the magic commands `%time` and `%memit` to compare the time and the memory usage of the two calls.

Not all of the reading methods in pandas have an option to read a subset of columns.

## Use efficient datatypes

The default pandas data types are not the most memory efficient. This is especially true for text data columns with relatively few unique values (commonly referred to as “low-cardinality” data). 

Using more efficient data types reduces the memory size of a dataframe, so you can store larger datasets in memory.

In [14]:
# Reload only the six "_0" columns for the dtype experiments that follow.
df = pd.read_parquet("timeseries.parquet",columns=['a_0','b_0','c_0','cat_0','str1_0','str2_0'])

Look at the data types of each column

In [15]:
df.dtypes

a_0       float64
b_0       float64
c_0         int64
cat_0      object
str1_0     object
str2_0     object
dtype: object

Look at the memory usage of the dataframe. The `memory_usage()` method returns the memory usage of each column in bytes.

In [16]:
df.memory_usage(deep=True)

Index      16819200
a_0        16819200
b_0        16819200
c_0        16819200
cat_0     111427200
str1_0    170294400
str2_0    170294400
dtype: int64

Compute the size of the dataframe. You should get the same result with the `info(memory_usage='deep')` method.

In [17]:
# TODO

In [18]:
# Per-column byte counts (index included), summed and converted to MiB.
mem = df.memory_usage(deep=True)
total_bytes = mem.sum()
total_bytes / 1024**2

np.float64(495.2362060546875)

In [19]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2102400 entries, 2020-01-01 00:00:00 to 2023-12-30 23:59:00
Data columns (total 6 columns):
 #   Column  Dtype  
---  ------  -----  
 0   a_0     float64
 1   b_0     float64
 2   c_0     int64  
 3   cat_0   object 
 4   str1_0  object 
 5   str2_0  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 495.2 MB


The result of `memory_usage` shows that the columns taking up the most memory are 'str1_0', 'str2_0' and 'cat_0'. This is expected for 'str1_0' and 'str2_0', because those columns contain random strings. But the 'cat_0' column has just a few unique values, so it’s a good candidate for conversion to a `pandas.Categorical`. With a `pandas.Categorical`, each unique value is stored once and space-efficient integers record which value is used in each row.

First, we copy our dataframe to a new one.

In [20]:
# Work on a copy so `df` keeps its original dtypes for the later size comparison.
df2 = df.copy()

Try to change the column type to a pandas category using the `astype()` method.

In [21]:
# TODO

In [22]:
# Replace the object-dtype "cat_0" column with its categorical equivalent.
df2["cat_0"] = df2["cat_0"].astype("category")

Check with dtypes that the column type has changed

In [23]:
# TODO

In [24]:
df2.dtypes

a_0        float64
b_0        float64
c_0          int64
cat_0     category
str1_0      object
str2_0      object
dtype: object

Compute the memory usage of each column for this new dataframe.

In [25]:
# TODO

In [26]:
df2.memory_usage(deep=True)

Index      16819200
a_0        16819200
b_0        16819200
c_0        16819200
cat_0       2102837
str1_0    170294400
str2_0    170294400
dtype: int64

We can go a bit further and downcast the numeric columns to their smallest types using `pandas.to_numeric()`. The "c_0" column only contains integers between 1 and 99, so it can be downcast to an unsigned integer type. If single (float32) precision is sufficient for columns 'a_0' and 'b_0', they can be downcast as well. Be careful when you downcast: you lose precision, and the resulting errors can propagate through later processing.

In [27]:
# TODO

In [28]:
# Downcast each numeric column to the smallest dtype pd.to_numeric selects
# for the requested family ("unsigned" integers or "float").
downcast_plan = {"c_0": "unsigned", "a_0": "float", "b_0": "float"}
for col_name, target in downcast_plan.items():
    df2[col_name] = pd.to_numeric(df2[col_name], downcast=target)

Check the types and the memory usage of the columns

In [29]:
# TODO

In [30]:
df2.dtypes

a_0        float32
b_0        float32
c_0          uint8
cat_0     category
str1_0      object
str2_0      object
dtype: object

In [31]:
df2.memory_usage(deep=True)

Index      16819200
a_0         8409600
b_0         8409600
c_0         2102400
cat_0       2102837
str1_0    170294400
str2_0    170294400
dtype: int64

Compute the memory reduction

In [32]:
# TODO

In [33]:
# Ratio of the optimised frame's footprint to the original's. This is a size
# ratio, not a saving: a value below 1 means `df2` is smaller than `df`.
optimized_bytes = df2.memory_usage(deep=True).sum()
original_bytes = df.memory_usage(deep=True).sum()
reduction = optimized_bytes / original_bytes
print(f"{reduction:0.2f}")

0.73


## Use chunking

Some problems are embarrassingly parallel and can therefore be processed with chunking, which means splitting a large problem into a bunch of small problems. 
For example, converting a big file into several smaller files and repeating the processing for each file in a directory. 
As long as each chunk fits in memory, you can work with datasets that are much larger than memory.

In [34]:
# Write N one-year, one-minute-frequency series (years 2000 onwards) as
# separate Parquet files, one file per year.
N = 12
out_dir = pathlib.Path("data/timeseries")
out_dir.mkdir(parents=True, exist_ok=True)
for i in range(N):
    start = f"20{i:>02d}-01-01"
    end = f"20{i:>02d}-12-31"
    ts = make_timeseries(start=start, end=end, freq="1min", seed=i)
    ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet")

Count the occurrences of the values in the "c" column for all the files.

In [35]:
# TODO

In [36]:
%%time

# Tally the values of column "c" across every file under data/timeseries,
# holding a single file's data in memory at a time.
files = sorted(pathlib.Path("data/timeseries/").glob("ts*.parquet"))
counts = pd.Series(dtype=int)

for path in files:
    # Only "c" is needed, so read just that column rather than the whole file;
    # a local name is used so the notebook-wide `df` is not overwritten.
    c_values = pd.read_parquet(path, columns=["c"])["c"]
    # fill_value=0 treats a value absent from one side as a zero count.
    counts = counts.add(c_values.value_counts(), fill_value=0)

# Show the accumulated totals as integers.
counts.astype(int)

CPU times: user 2.19 s, sys: 894 ms, total: 3.08 s
Wall time: 2.47 s


c
1     63728
2     63489
3     63196
4     63221
5     63105
      ...  
95    63325
96    63756
97    63388
98    63617
99    63683
Length: 99, dtype: int64

Some readers, like pandas.read_csv(), offer parameters to control the chunksize when reading a single file. 
In that case, it is possible to read a file chunk by chunk in order to process it.

In [37]:
# One-minute series for 2023, saved as CSV for the chunked-reading cells below.
df = make_timeseries(start="2023-01-01", end="2023-12-31", freq="1min", seed=10)
df.to_csv("data/timeseries.csv")

Try to count the occurrences of the values in the "c" column for the CSV file by processing it chunk by chunk. You need to use the `chunksize` parameter of the `read_csv` method. 

In [38]:
# TODO

In [39]:
# Stream the CSV in 1000-row pieces and accumulate the per-value counts of "c".
counts = pd.Series(dtype=int)
with pd.read_csv("data/timeseries.csv", chunksize=1000) as csv_chunks:
    for part in csv_chunks:
        part_counts = part["c"].value_counts()
        # fill_value=0 treats a value absent from one side as a zero count.
        counts = counts.add(part_counts, fill_value=0)

counts.astype(int)

c
1     5284
2     5426
3     5400
4     5392
5     5207
      ... 
95    5243
96    5333
97    5287
98    5404
99    5335
Length: 99, dtype: int64

In [40]:
%%memit
# Same chunked count as the previous cell, run here under the memory profiler.
counts = pd.Series(dtype=int)
with pd.read_csv("data/timeseries.csv", chunksize=1000) as csv_chunks:
    for part in csv_chunks:
        part_counts = part["c"].value_counts()
        # fill_value=0 treats a value absent from one side as a zero count.
        counts = counts.add(part_counts, fill_value=0)

counts.astype(int)

peak memory: 6483.00 MiB, increment: 0.00 MiB


In [41]:
%%memit
# Baseline for the memory comparison: parse the whole CSV into one DataFrame,
# then count the values of "c".
df = pd.read_csv("data/timeseries.csv")
df["c"].value_counts().astype(int)

peak memory: 6642.98 MiB, increment: 159.99 MiB
