https://pythonspeed.com/articles/pandas-load-less-data/

# Don’t load all the columns

In [None]:
import pandas as pd

# Baseline: parse every column of the CSV and report the frame's true
# ("deep") memory footprint.
df = pd.read_csv("voters.csv")
df.info(memory_usage="deep", verbose=False)  # memory usage: 71.2 MB

# Keep only the two name columns (the header names carry a trailing space)
# and measure again. The full file was still parsed first.
df = df.loc[:, ["First Name ", "Last Name "]]
df.info(memory_usage="deep", verbose=False)  # memory usage: 8.3 MB

In [None]:
# Ask the parser for just the two name columns, so the other columns are
# never loaded into the frame in the first place.
df = pd.read_csv(
    "voters.csv",
    usecols=["First Name ", "Last Name "],
)
df.info(memory_usage="deep", verbose=False)  # memory usage: 8.3 MB

# Shrink numerical columns with smaller dtypes

int8 can store integers from -128 to 127.<br>
int16 can store integers from -32768 to 32767.<br>
int64 can store integers from -9223372036854775808 to 9223372036854775807.

In [None]:
# Load with pandas' default dtypes and measure the ward-number column.
# Recorded results are kept as trailing comments rather than bare literals,
# so the cell displays what it actually computes.
df = pd.read_csv("voters.csv")
print(df["Ward Number "].memory_usage(index=False, deep=True))  # recorded: 550688 (current memory usage)

# Check the value range to see which integer width is sufficient.
print(df["Ward Number "].max())  # recorded: 11
print(df["Ward Number "].min())  # recorded: 1

# Values 1..11 fit comfortably in int8 (-128..127), so request that dtype
# at parse time instead of converting afterwards.
df = pd.read_csv("voters.csv", dtype={"Ward Number ": "int8"})
df["Ward Number "].memory_usage(index=False, deep=True)  # recorded: 68836

# Shrink categorical data using Categorical dtypes

Sometimes a column contains only a small, fixed set of distinct values (for example, a gender column).

In [None]:
# Every entry is only a one- or two-character code, but each of them is
# still taking up the size of a full string.
# Relies on `df` as loaded by the preceding cell.
print(set(df["Party Affiliation "]))
# recorded:
# {'Q ', 'S ', 'L ', 'R ', 'H ', 'BB', 'D ', 'K ', 'O ', 'X ', 'A ',
#  'Z ', 'EE', 'F ', 'P ', 'G ', 'T ', 'CC', 'J ', 'AA', 'Y ', 'U '}

# Memory used by the column before switching to a categorical dtype.
df["Party Affiliation "].memory_usage(index=False, deep=True)  # recorded: 4061324

In [None]:
# Parse the party column directly as a categorical dtype.
df = pd.read_csv("voters.csv", dtype={"Party Affiliation ": "category"})
print(df["Party Affiliation "].head())
# recorded output (kept as comments; pasted verbatim it is not valid Python):
# 0    U
# 1    U
# 2    U
# 3    R
# 4    U
# Name: Party Affiliation , dtype: category
# Categories (22, object): [A, AA, CC, D, ..., Y, Z, BB, EE]

# Memory used by the same column once it is categorical.
df["Party Affiliation "].memory_usage(index=False, deep=True)  # recorded: 70774

https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html

# Sparse series

If you have a column with lots of empty values, usually represented as NaNs, you can save memory by using a sparse column representation. It won’t waste memory storing all those empty values.

In [None]:
# Reload with default dtypes and pick a column that is mostly empty.
df = pd.read_csv("voters.csv")
series = df["Mailing Address - Apartment Number "]

# Memory used by the dense (regular) representation.
print(series.memory_usage(index=False, deep=True))  # recorded: 2623975

# Total rows versus rows that actually hold a value.
print(len(series))  # recorded: 68836
len(series.dropna())  # recorded: 13721

In [None]:
# Convert the mostly-empty column to a sparse string representation.
# Relies on `series` as defined by the preceding cell.
sparse_series = series.astype("Sparse[str]")

# The logical length is unchanged by the conversion.
print(len(sparse_series))  # recorded: 68836

# Memory used by the sparse representation.
sparse_series.memory_usage(index=False, deep=True)  # recorded: 2237939

# IF READY TO LOSE SOME DATA

## Changing numeric representations

We can change the data type from "float64" to "float32".

In [None]:
# Fractional values as originally stored.
data = [0.40, 0.56, 0.30, 0.85, 0.71, 0.89, 0.30, 0.24]

# We can save this data in percent format instead, as small whole numbers.
# round() is used rather than int() because binary floating point can land
# just below the intended integer (e.g. 0.29 * 100 == 28.999999999999996),
# which int() would truncate to the wrong value.
data_percent = [round(value * 100) for value in data]
data_percent  # [40, 56, 30, 85, 71, 89, 30, 24]

If we only need part of a file, load only that part.

<br>
Use separate functions to load different parts of the data, so that when each function returns, its local variables are destroyed and their memory can be released.