# 20. How do I make my pandas DataFrame smaller and faster?

In [1]:
import pandas as pd
# Load the drinks-by-country dataset from the local CSV file and preview the first rows.
drinks = pd.read_csv('data/drinksbycountry.csv')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [2]:
# 'info()': Print a concise summary of a DataFrame.

# The summary includes the index dtype, each column's dtype and non-null count,
# and the memory usage of the DataFrame.
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 9.1+ KB


In [3]:
# By default 'info()' reports only a lower bound for memory (note the '+' suffix in its output).
# To show the actual memory used by the DataFrame, we pass 'deep' to the parameter 'memory_usage'.
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 30.4 KB


In [4]:
# 'memory_usage': Return the memory usage of each column in bytes.

# The result can optionally include the contribution of the index and,
# with deep=True, of the elements of `object` dtype columns.

# The memory usage is displayed in `DataFrame.info` by default. That display can be
# suppressed by setting ``pandas.options.display.memory_usage`` to False.

drinks.memory_usage(deep=True)

Index                              80
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [5]:
# Total memory used by the DataFrame in bytes: the sum of the index and per-column values.
drinks.memory_usage(deep=True).sum()

31176

In [6]:
# List the distinct values of the 'continent' column in sorted order.
sorted(drinks['continent'].unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [7]:
# To reduce the size of our data, we look for 'object' columns that hold only a few distinct values.
# 'continent' has just six, so we convert it to the 'category' dtype: every row is then
# represented by an integer code for its label (0 -> Africa, 1 -> Asia, ...).
# Converting a column to category not only reduces space, it can also make operations on it faster.
drinks['continent'] = drinks['continent'].astype('category')

In [8]:
# Check the column dtypes: 'continent' is now reported as 'category'.
drinks.dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [9]:
# The 'continent' column still displays the same string labels, but under the hood
# each value is stored as an integer code.
drinks['continent'].head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [10]:
# '.cat.codes' exposes the integer code behind each category value;
# 'unique()' lists the distinct codes in order of first appearance.
drinks['continent'].cat.codes.unique()

array([1, 2, 0, 3, 5, 4], dtype=int64)

In [11]:
# The total size of the DataFrame has decreased after converting 'continent' to category.
drinks.memory_usage(deep=True).sum()

19588

In [12]:
# If we do the same for 'country', the memory used by that column increases instead,
# because the conversion creates a large number of categories.
# NOTE: this permanently changes the dtype of 'country' in 'drinks'.
drinks['country'] = drinks['country'].astype('category')
drinks.memory_usage(deep=True)

Index                              80
country                         18094
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [13]:
# BONUS:
# A category column can follow a custom ordering. The ordering is described by a
# CategoricalDtype object, built from the list of categories in the order we want.
from pandas.api.types import CategoricalDtype

# Small example frame. 'quality' is still a plain string column at this point,
# so sorting by it is alphabetical.
df = pd.DataFrame({
    'ID': [100, 101, 102, 103],
    'quality': ['good', 'very good', 'good', 'excellent'],
})
df.sort_values(by='quality')

Unnamed: 0,ID,quality
3,103,excellent
0,100,good
2,102,good
1,101,very good


In [14]:
# 'CategoricalDtype()': Type for categorical data, holding the categories and
# whether they are ordered.
# The categories are listed from lowest to highest rank; ordered=True makes
# that order meaningful for sorting and comparisons.
cat_type = CategoricalDtype(
    categories=['good', 'very good', 'excellent'],
    ordered=True,
)
df['quality'] = df['quality'].astype(cat_type)

In [15]:
# Inspect the converted column: its dtype is 'category' and the categories are shown in rank order.
df['quality']

0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]

In [16]:
# Now sorting the DataFrame by 'quality' follows the category order rather than alphabetical order.
df.sort_values('quality')

Unnamed: 0,ID,quality
0,100,good
2,102,good
1,101,very good
3,103,excellent


In [17]:
# We can also filter the DataFrame by quality: an ordered category column can be
# compared against one of its categories, here keeping rows ranked above 'good'.
is_above_good = df['quality'] > 'good'
df.loc[is_above_good]

Unnamed: 0,ID,quality
1,101,very good
3,103,excellent
