# 11 Useful Pandas Methods You Might Have Overlooked

## Setup

In [2]:
import pandas as pd
import numpy as np

## `nth`

In [None]:
# Toy frame: three groups ("a", "b", "c") whose "y" values contain missing
# entries, so the way each groupby selector treats NaN is visible.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
df = pd.DataFrame(
    data={
        "x": ["a", "a", "a", "b", "b", "b", "c"],
        "y": [np.nan, 2, 3, 1, np.nan, 3, np.nan],
    }
)
# Take the row at position 0 of every group; missing values are kept as-is.
df.groupby("x").nth(0)

Unnamed: 0_level_0,y
x,Unnamed: 1_level_1
a,
b,1.0
c,


In [None]:
# A list of positions selects several rows per group (here the first and the
# third); a group with fewer rows contributes only the rows it has.
df.groupby("x").nth([0, 2])

Unnamed: 0_level_0,y
x,Unnamed: 1_level_1
a,
a,3.0
b,1.0
b,3.0
c,


In [None]:
# Unlike nth(0), first() returns the first non-null value of each group.
df.groupby("x").first()

Unnamed: 0_level_0,y
x,Unnamed: 1_level_1
a,2.0
b,1.0
c,


## `pop`

In [None]:
# Feature frame with a target column "y" that is split off below.
# `np.nan` replaces the `np.NaN` alias, which NumPy 2.0 removed.
X = pd.DataFrame(
    data={
        "x": ["a", "a", "a", "b", "b", "b", "c"],
        "y": [np.nan, 2, 3, 1, np.nan, 3, np.nan],
    }
)

# pop() removes the column from X in place and returns it as a Series.
y = X.pop("y")

print(X.shape)
print(y.shape)


(7, 1)
(7,)


## `compare`

In [None]:
# Two frames of identical shape that differ in a single cell (row 3, "col_1").
a = pd.DataFrame({"col_1": [1, 2, 3, 4], "col_2": [5, 6, 7, 8]})
b = pd.DataFrame({"col_1": [1, 2, 3, 9], "col_2": [5, 6, 7, 8]})

# compare() reports only the cells that differ, labelled "self" and "other".
a.compare(b)

Unnamed: 0_level_0,col_1,col_1
Unnamed: 0_level_1,self,other
3,4.0,9.0


In [None]:
# Comparing a frame with itself: there are no differing cells to report.
a.compare(a)

In [None]:
# keep_shape=True keeps every row and column; unchanged cells show up as NaN.
a.compare(b, keep_shape=True)

Unnamed: 0_level_0,col_1,col_1,col_2,col_2
Unnamed: 0_level_1,self,other,self,other
0,,,,
1,,,,
2,,,,
3,4.0,9.0,,


In [None]:
# Adding keep_equal=True shows the actual values of unchanged cells instead of NaN.
a.compare(b, keep_shape=True, keep_equal=True)

Unnamed: 0_level_0,col_1,col_1,col_2,col_2
Unnamed: 0_level_1,self,other,self,other
0,1,1,5,5
1,2,2,6,6
2,3,3,7,7
3,4,9,8,8


## `align`

In [None]:
# Build a small frame, split the target column off, then keep only a subset of
# the feature rows so that X and y no longer share the same index.
X = pd.DataFrame(
    {
        "a": [1, 2, 3, 4, 5, 6],
        "b": [2, 3, 4, 5, 6, 7],
        "y": [3, 4, 5, 6, 7, 8],
    }
)

y = X.pop("y")
X = X.iloc[[0, 3, 5]]  # positional row selection; every column is kept
X

Unnamed: 0,a,b
0,1,2
3,4,5
5,6,7


In [None]:
# align() returns both objects re-indexed to a common index; with an inner
# join only the labels present in both are kept, so y shrinks to X's rows.
y, X = y.align(X, join="inner")
# Element-wise comparison of the two indexes; all True means they match.
X.index == y.index

array([ True,  True,  True])

## `to_markdown`

In [None]:
df = pd.DataFrame(
    data={
        "x": ["a", "a", "a", "b", "b", "b", "c"],
        "y": [np.NaN, 2, 3, 1, np.NaN, 3, np.NaN],
    }
)
print(df.to_markdown())

|    | x   |   y |
|---:|:----|----:|
|  0 | a   | nan |
|  1 | a   |   2 |
|  2 | a   |   3 |
|  3 | b   |   1 |
|  4 | b   | nan |
|  5 | b   |   3 |
|  6 | c   | nan |


In [None]:
# tablefmt selects the output format; "latex" produces a LaTeX tabular block.
print(df.to_markdown(tablefmt="latex"))

\begin{tabular}{rlr}
\hline
    & x   &   y \\
\hline
  0 & a   & nan \\
  1 & a   &   2 \\
  2 & a   &   3 \\
  3 & b   &   1 \\
  4 & b   & nan \\
  5 & b   &   3 \\
  6 & c   & nan \\
\hline
\end{tabular}


In [None]:
# "fancy_grid" draws the table with Unicode box-drawing characters.
print(df.to_markdown(tablefmt="fancy_grid"))

╒════╤═════╤═════╕
│    │ x   │   y │
╞════╪═════╪═════╡
│  0 │ a   │ nan │
├────┼─────┼─────┤
│  1 │ a   │   2 │
├────┼─────┼─────┤
│  2 │ a   │   3 │
├────┼─────┼─────┤
│  3 │ b   │   1 │
├────┼─────┼─────┤
│  4 │ b   │ nan │
├────┼─────┼─────┤
│  5 │ b   │   3 │
├────┼─────┼─────┤
│  6 │ c   │ nan │
╘════╧═════╧═════╛


## `convert_dtypes`

In [None]:
# One column per situation that convert_dtypes() handles differently.
df = pd.DataFrame(
    dict(
        a=[1, 2, 3, 4, 5],  # integers, nothing missing
        b=[1, 2, np.nan, 4, 5],  # integers with one missing value
        c=["x", "y", np.nan, "x", "y"],  # strings with one missing value
        d=pd.Series([True, False, True, True, False], dtype="object"),  # booleans stored as object
        e=[np.nan, 100.5, 200, 200, 100],  # genuine floats with one missing value
        f=["a", "b", "c", "a", "c"],  # strings, nothing missing
    )
)
df

Unnamed: 0,a,b,c,d,e,f
0,1,1.0,x,True,,a
1,2,2.0,y,False,100.5,b
2,3,,,True,200.0,c
3,4,4.0,x,True,200.0,a
4,5,5.0,y,False,100.0,c


In [None]:
# Dtypes as inferred at construction: the integer column with a missing value
# ("b") is float64, and the text / boolean columns are plain object.
df.dtypes

a      int64
b    float64
c     object
d     object
e    float64
f     object
dtype: object

In [None]:
# convert_dtypes() returns a new frame; the displayed values look the same,
# only the column dtypes change (inspected in the next cell).
df_2 = df.convert_dtypes()
df_2

Unnamed: 0,a,b,c,d,e,f
0,1,1.0,x,True,,a
1,2,2.0,y,False,100.5,b
2,3,,,True,200.0,c
3,4,4.0,x,True,200.0,a
4,5,5.0,y,False,100.0,c


In [None]:
# After conversion: "b" is Int64, text columns are string and "d" is boolean.
df_2.dtypes

a      Int64
b      Int64
c     string
d    boolean
e    Float64
f     string
dtype: object

In [None]:
# With convert_boolean=False the object column "d" is not converted to the
# boolean dtype; in the output shown below it ends up as integers instead.
df_3 = df.convert_dtypes(convert_boolean=False)
df_3

Unnamed: 0,a,b,c,d,e,f
0,1,1.0,x,1,,a
1,2,2.0,y,0,100.5,b
2,3,,,1,200.0,c
3,4,4.0,x,1,200.0,a
4,5,5.0,y,0,100.0,c


In [None]:
# "d" is reported as Int64 here rather than boolean; the other columns match df_2.
df_3.dtypes

a      Int64
b      Int64
c     string
d      Int64
e    Float64
f     string
dtype: object

## `CategoricalDtype`

In [None]:
from pandas.api.types import CategoricalDtype

# Shirt sizes arrive as plain strings, in no meaningful order.
df = pd.DataFrame(
    {
        "size": ["XL", "S", "M", "XS", "L"],
        "sales": [50, 10, 20, 90, 100],
    }
)

# An ordered categorical dtype encodes the logical order XS < S < M < L < XL.
categories = CategoricalDtype(categories=["XS", "S", "M", "L", "XL"], ordered=True)

df["size"] = df["size"].astype(categories)
df

Unnamed: 0,size,sales
0,XL,50
1,S,10
2,M,20
3,XS,90
4,L,100


In [None]:
# Sorting follows the declared category order, not alphabetical order.
df.sort_values(by="size")

Unnamed: 0,size,sales
3,XS,90
1,S,10
2,M,20
4,L,100
0,XL,50


In [None]:
# Ordered categories support comparisons: keep the rows whose size ranks above "M".
df[df["size"] > "M"]

Unnamed: 0,size,sales
0,XL,50
4,L,100


## `SparseDtype`

In [None]:
# 10,000,000 x 5 frame of random integers drawn from [0, 100).
# NOTE(review): no seed is set in this cell, so the contents differ between runs.
df = pd.DataFrame(np.random.randint(0, 100, size=(10000000, 5)))
# Zero out every value <= 90, leaving a frame that is mostly zeros.
df[df <= 90] = 0

In [None]:
def memory_usage(df):
    """Return the total memory footprint of ``df`` in MiB, rounded to 2 decimals.

    The per-column (and index) byte counts come from
    ``df.memory_usage(deep=True)`` and are summed before conversion.
    """
    total_bytes = df.memory_usage(deep=True).sum()
    bytes_per_mib = 1024 ** 2
    return round(total_bytes / bytes_per_mib, 2)

In [None]:
# Baseline: size in MiB of the frame exactly as it was created.
memory_usage(df)

381.47

In [None]:
# Every remaining value is below 100 and therefore fits in one byte:
# downcast to uint8 and measure again.
df_1 = df.astype("uint8")
memory_usage(df_1)

47.68

In [None]:
# Relative saving of the uint8 frame over the original frame, computed from
# the frames themselves instead of numbers copied by hand from earlier outputs.
(memory_usage(df) - memory_usage(df_1)) / memory_usage(df)

0.8750098303929535

In [None]:
# Sparse uint8 storage with 0 declared as the fill value, then measure again.
df_2 = df.astype(pd.SparseDtype("uint8", 0))
memory_usage(df_2)

21.45

In [None]:
# Relative saving of the sparse frame over the uint8 frame. Computed from the
# frames themselves: the sparse size depends on the random data, so a number
# copied from an earlier output would go stale on re-run.
(memory_usage(df_1) - memory_usage(df_2)) / memory_usage(df_1)

0.5501258389261745

In [None]:
# Relative saving of the sparse frame over the original frame. Computed from
# the frames themselves: the sparse size depends on the random data, so a
# number copied from an earlier output would go stale on re-run.
(memory_usage(df) - memory_usage(df_2)) / memory_usage(df)

0.9437701523055548

## `crosstab`

In [None]:
N = 1000  # number of synthetic rows

# Random categorical columns (with the given probabilities) plus a normally
# distributed sales figure. No seed is set in this cell, so values vary by run.
df = pd.DataFrame(
    {
        "group": np.random.choice(["AA", "BB"], size=N),
        "region": np.random.choice(["a", "b", "c"], size=N, p=[0.5, 0.3, 0.2]),
        "category": np.random.choice(["x", "y", "z"], size=N, p=[0.3, 0.3, 0.4]),
        "sales": np.random.normal(loc=1000, scale=50, size=N),
    }
)
df

Unnamed: 0,group,region,category,sales
0,BB,a,x,1033.471104
1,BB,a,x,997.237076
2,BB,a,x,1011.422801
3,BB,b,z,990.585378
4,AA,b,x,926.914578
...,...,...,...,...
995,AA,b,x,1002.886769
996,AA,a,y,1004.405438
997,BB,c,y,964.330562
998,AA,a,x,960.256207


In [None]:
# Contingency table: number of rows for every (region, category) pair.
pd.crosstab(index=df["region"], columns=df["category"])

category,x,y,z
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,142,158,211
b,97,74,119
c,56,60,83


In [None]:
# Passing a list of columns yields two column levels (group, then category).
pd.crosstab(index=df["region"], columns=[df["group"], df["category"]])

group,AA,AA,AA,BB,BB,BB
category,x,y,z,x,y,z
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,79,70,115,63,88,96
b,48,40,50,49,34,69
c,25,32,39,31,28,44


In [None]:
# margins=True appends an "All" row and column holding the totals.
pd.crosstab(index=df["region"], columns=df["category"], margins=True)

category,x,y,z,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,142,158,211,511
b,97,74,119,290
c,56,60,83,199
All,295,292,413,1000


In [None]:
# normalize=True divides every count by the grand total (fractions of all rows).
pd.crosstab(index=df["region"], columns=df["category"], normalize=True)

category,x,y,z
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0.142,0.158,0.211
b,0.097,0.074,0.119
c,0.056,0.06,0.083


In [None]:
# Fractions of the grand total, together with the "All" totals row and column.
pd.crosstab(
    index=df["region"],
    columns=df["category"],
    margins=True,
    normalize=True,
)

category,x,y,z,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,0.142,0.158,0.211,0.511
b,0.097,0.074,0.119,0.29
c,0.056,0.06,0.083,0.199
All,0.295,0.292,0.413,1.0


In [None]:
# normalize="columns" makes every column sum to 1; the margin is an "All" column.
pd.crosstab(
    index=df["region"],
    columns=df["category"],
    margins=True,
    normalize="columns",
)

category,x,y,z,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,0.481356,0.541096,0.510896,0.511
b,0.328814,0.253425,0.288136,0.29
c,0.189831,0.205479,0.200969,0.199


In [None]:
# normalize="index" makes every row sum to 1; the margin is an "All" row.
pd.crosstab(
    index=df["region"],
    columns=df["category"],
    margins=True,
    normalize="index",
)

category,x,y,z
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0.277886,0.309198,0.412916
b,0.334483,0.255172,0.410345
c,0.281407,0.301508,0.417085
All,0.295,0.292,0.413


In [None]:
# Aggregate instead of counting: mean sales per (region, category) pair,
# rounded to two decimals for display.
pd.crosstab(
    index=df["region"],
    columns=df["category"],
    values=df["sales"],
    aggfunc="mean",
).round(2)

category,x,y,z
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1004.81,995.85,1001.18
b,996.97,991.5,999.7
c,993.64,998.95,992.47


## `swaplevel`

In [None]:
# Counts per region with a two-level row index (group, category).
df_agg = pd.crosstab(
    index=[df["group"], df["category"]],
    columns=df["region"],
)
df_agg

Unnamed: 0_level_0,region,a,b,c
group,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,x,79,48,25
AA,y,70,40,32
AA,z,115,50,39
BB,x,63,49,31
BB,y,88,34,28
BB,z,96,69,44


In [None]:
# swaplevel() exchanges the two row-index levels (group <-> category);
# the rows themselves stay in their original order.
df_agg.swaplevel()

Unnamed: 0_level_0,region,a,b,c
category,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
x,AA,79,48,25
y,AA,70,40,32
z,AA,115,50,39
x,BB,63,49,31
y,BB,88,34,28
z,BB,96,69,44


## `resample`

In [3]:
# Daily time series covering the whole of 2023; "value" is simply the day's
# zero-based position in the year.
df = pd.DataFrame(index=pd.date_range(start="2023-01-01", end="2023-12-31"))
df["value"] = list(range(len(df)))
df

Unnamed: 0,value
2023-01-01,0
2023-01-02,1
2023-01-03,2
2023-01-04,3
2023-01-05,4
...,...
2023-12-27,360
2023-12-28,361
2023-12-29,362
2023-12-30,363


In [4]:
# Weekly bins: count the daily rows that fall into each week and show the
# first five bins (the first bin only contains a single day).
df.resample("W").count().head()

Unnamed: 0,value
2023-01-01,1
2023-01-08,7
2023-01-15,7
2023-01-22,7
2023-01-29,7


In [5]:
# "MS" bins by calendar month and labels each bin with the month's first day.
df.resample("MS").sum()

Unnamed: 0,value
2023-01-01,465
2023-02-01,1246
2023-03-01,2294
2023-04-01,3135
2023-05-01,4185
2023-06-01,4965
2023-07-01,6076
2023-08-01,7037
2023-09-01,7725
2023-10-01,8928


In [7]:
# Four-month bins labelled by month end; the label of the last bin extends
# into 2024 even though the data stops on 2023-12-31.
# NOTE(review): newer pandas releases deprecate the "M" alias in favour of
# "ME" - confirm against the pandas version this notebook targets.
df.resample("4M").max()

Unnamed: 0,value
2023-01-31,30
2023-05-31,150
2023-09-30,272
2024-01-31,364


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0fb8b8c3-20f4-42b6-8571-a2968c4d72c2' target='_blank'>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>