# Convert datatypes

In [40]:
import pandas as pd

# Tiny seed frame: an integer column "A" and a float column "B".
seed_values = {
    "A": [1, 2, 3, 4],
    "B": [5.0, 6.0, 7.0, 8.0],
}
df = pd.DataFrame(seed_values)
df

Unnamed: 0,A,B
0,1,5.0
1,2,6.0
2,3,7.0
3,4,8.0


In [41]:
# Stack whole copies of the seed frame until the target row count is reached,
# renumbering the index so it runs 0..N-1 instead of repeating the seed index.
num_rows = 1_000_000
repeat_count = num_rows // len(df)
df_large = pd.concat([df for _ in range(repeat_count)], ignore_index=True)
df_large

Unnamed: 0,A,B
0,1,5.0
1,2,6.0
2,3,7.0
3,4,8.0
4,1,5.0
...,...,...
999995,4,8.0
999996,1,5.0
999997,2,6.0
999998,3,7.0


In [42]:
# Memory usage before conversion: per-column byte counts, collapsed to one total.
bytes_per_column = df_large.memory_usage()
bytes_per_column.sum()

np.int64(16000132)

In [43]:
# Downcast each column within its own numeric family: "A" among the integer
# dtypes, "B" among the float dtypes.
downcast_by_column = {"A": "integer", "B": "float"}
for column_name, downcast_kind in downcast_by_column.items():
    df_large[column_name] = pd.to_numeric(df_large[column_name], downcast=downcast_kind)
df_large

Unnamed: 0,A,B
0,1,5.0
1,2,6.0
2,3,7.0
3,4,8.0
4,1,5.0
...,...,...
999995,4,8.0
999996,1,5.0
999997,2,6.0
999998,3,7.0


In [44]:
# Memory usage after conversion: per-column byte counts, collapsed to one total.
bytes_per_column = df_large.memory_usage()
bytes_per_column.sum()

np.int64(5000132)

# Load less data

In [45]:
# Four identical columns holding 0..999.
column_names = ["A", "B", "C", "D"]
df = pd.DataFrame({name: range(1000) for name in column_names})
df

Unnamed: 0,A,B,C,D
0,0,0,0,0
1,1,1,1,1
2,2,2,2,2
3,3,3,3,3
4,4,4,4,4
...,...,...,...,...
995,995,995,995,995
996,996,996,996,996
997,997,997,997,997
998,998,998,998,998


In [46]:
# Keep only columns "A" and "D"; "B" and "C" are left out of the subset.
wanted_columns = ["A", "D"]
df_subset = df.loc[:, wanted_columns]
df_subset

Unnamed: 0,A,D
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
...,...,...
995,995,995
996,996,996
997,997,997
998,998,998


# Sampling

In [47]:
# Display the full four-column frame again as the baseline for sampling.
df

Unnamed: 0,A,B,C,D
0,0,0,0,0
1,1,1,1,1
2,2,2,2,2
3,3,3,3,3
4,4,4,4,4
...,...,...,...,...
995,995,995,995,995
996,996,996,996,996
997,997,997,997,997
998,998,998,998,998


In [48]:
# Draw a random 10% of the rows; the fixed seed makes the draw repeatable.
sample_fraction = 0.1
seed = 42
df_sample = df.sample(frac=sample_fraction, random_state=seed)
df_sample

Unnamed: 0,A,B,C,D
521,521,521,521,521
737,737,737,737,737
740,740,740,740,740
660,660,660,660,660
411,411,411,411,411
...,...,...,...,...
436,436,436,436,436
764,764,764,764,764
88,88,88,88,88
63,63,63,63,63


# Chunking

In [49]:
# Two identical columns of 0..9999, used by the chunking loop in the next cell.
n_rows = 10_000
df = pd.DataFrame({"A": range(n_rows), "B": range(n_rows)})

In [50]:
# Bucket the rows by integer-dividing the index by the chunk size, so each
# group holds up to `chunk_size` consecutive rows.
chunk_size = 1000
chunk_ids = df.index // chunk_size
# Iterating a groupby yields (group key, sub-frame) pairs; print each pair as a tuple.
for chunk_id, chunk_frame in df.groupby(chunk_ids):
    print((chunk_id, chunk_frame))

(0,        A    B
0      0    0
1      1    1
2      2    2
3      3    3
4      4    4
..   ...  ...
995  995  995
996  996  996
997  997  997
998  998  998
999  999  999

[1000 rows x 2 columns])
(1,          A     B
1000  1000  1000
1001  1001  1001
1002  1002  1002
1003  1003  1003
1004  1004  1004
...    ...   ...
1995  1995  1995
1996  1996  1996
1997  1997  1997
1998  1998  1998
1999  1999  1999

[1000 rows x 2 columns])
(2,          A     B
2000  2000  2000
2001  2001  2001
2002  2002  2002
2003  2003  2003
2004  2004  2004
...    ...   ...
2995  2995  2995
2996  2996  2996
2997  2997  2997
2998  2998  2998
2999  2999  2999

[1000 rows x 2 columns])
(3,          A     B
3000  3000  3000
3001  3001  3001
3002  3002  3002
3003  3003  3003
3004  3004  3004
...    ...   ...
3995  3995  3995
3996  3996  3996
3997  3997  3997
3998  3998  3998
3999  3999  3999

[1000 rows x 2 columns])
(4,          A     B
4000  4000  4000
4001  4001  4001
4002  4002  4002
4003  4003  4003
4004  4004 

# Pandas dtypes

In [51]:
# Example frame whose dates are plain strings and whose numbers are floats.
column_names = ["date_column", "numeric_column"]
rows = [
    ("2022-01-01", 1.234),
    ("2022-01-02", 2.345),
    ("2022-01-03", 3.456),
]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,date_column,numeric_column
0,2022-01-01,1.234
1,2022-01-02,2.345
2,2022-01-03,3.456


In [52]:
# Inspect the inferred dtypes before converting anything.
df.dtypes

date_column        object
numeric_column    float64
dtype: object

In [53]:
# Parse the date strings into real datetimes. Passing the format explicitly
# makes parsing strict: a value that is not YYYY-MM-DD raises instead of
# being guessed at by format inference.
df["date_column"] = pd.to_datetime(df["date_column"], format="%Y-%m-%d")
# Downcast the float column to a smaller float dtype where the values allow it.
df["numeric_column"] = pd.to_numeric(df["numeric_column"], downcast="float")
df

Unnamed: 0,date_column,numeric_column
0,2022-01-01,1.234
1,2022-01-02,2.345
2,2022-01-03,3.456


In [54]:
# Re-check the dtypes to confirm both conversions took effect.
df.dtypes

date_column       datetime64[ns]
numeric_column           float32
dtype: object

# Parallelize

In [61]:
# Install dask with the %conda line magic.
# NOTE(review): no version is pinned, so a fresh run may resolve a different dask release.
%conda install dask

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/main/win-64/current_repodata.json HTTP/1.1" 304 0
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/r/win-64/current_repodata.json HTTP/1.1" 304 0
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/r/noarch/current_repodata.json HTTP/1.1" 304 0
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/main/noarch/current_repodata.json HTTP/1.1" 304 0
DEBUG:urll

In [62]:
# Request pyarrow 10.x from the conda-forge channel.
# NOTE(review): in the recorded run this solve did not succeed (the solver
# reported conflicts and ended with "failed"), and the later dask cell raises
# "pyarrow>=10.0.1 is required". Resolve the environment conflict before
# relying on the cells that follow.
%conda install -c conda-forge pyarrow=10

Collecting package metadata (current_repodata.json): ...working... done
Note: you may need to restart the kernel to use updated packages.


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): conda.anaconda.org:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): conda.anaconda.org:443
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/main/win-64/current_repodata.json HTTP/1.1" 304 0
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/r/noarch/current_repodata.json HTTP/1.1" 304 0
DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/msy


Solving environment: ...working... unsuccessful initial attempt using frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... unsuccessful initial attempt using frozen solve. Retrying with flexible solve.
Solving environment: ...working... 
Found conflicts! Looking for incompatible packages.
This can take several minutes.  Press CTRL-C to abort.
failed


In [63]:
# Fresh 10,000-row frame with two identical columns.
row_values = range(10000)
df = pd.DataFrame({"A": row_values, "B": row_values})
df

Unnamed: 0,A,B
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
...,...,...
9995,9995,9995
9996,9996,9996
9997,9997,9997
9998,9998,9998


In [64]:
# NOTE(review): in the recorded run this cell raised
# "ImportError: pyarrow>=10.0.1 is required ...", so `ddf` was never created.
import dask.dataframe as dd

# Wrap the pandas frame as a dask DataFrame split into 4 partitions.
ddf = dd.from_pandas(df, npartitions=4)
ddf

ImportError: pyarrow>=10.0.1 is required for PyArrow backed StringArray.

In [None]:
# Build the per-"A" mean first, then materialize it with .compute().
grouped_mean = ddf.groupby("A").mean()
result = grouped_mean.compute()
result

NameError: name 'ddf' is not defined