In [1]:
import pandas as pd
import numpy as np

In [2]:
index = pd.date_range("2000-01-01", periods=8)
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])

In [3]:
df

Unnamed: 0,A,B,C
2000-01-01,-0.002829,-1.692554,0.852427
2000-01-02,-0.11541,-0.182124,0.657204
2000-01-03,0.986057,-0.721962,-0.116275
2000-01-04,-0.509161,-0.688499,-0.793598
2000-01-05,1.019723,1.841177,-0.365587
2000-01-06,-1.006188,-1.361479,-0.14083
2000-01-07,0.726349,-0.380965,0.436182
2000-01-08,1.118958,0.0876,0.850253


In [7]:
long_series = pd.Series(np.random.randn(1000))

In [8]:
long_series.head()

0    1.191629
1    0.208134
2   -1.986044
3   -0.491365
4   -0.172582
dtype: float64

In [9]:
long_series.tail(3)

997    0.201001
998    1.307864
999   -0.811038
dtype: float64

In [10]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,1.366167,0.523499,0.825231
2000-01-02,0.639035,1.889445,-1.043195


In [11]:
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,a,b,c
2000-01-01,1.366167,0.523499,0.825231
2000-01-02,0.639035,1.889445,-1.043195
2000-01-03,-1.19463,-0.016866,3.180865
2000-01-04,-0.746064,1.348725,0.002827
2000-01-05,-0.805664,1.235599,-0.871683
2000-01-06,0.585431,-0.910343,0.449982
2000-01-07,-0.110199,-1.443689,-1.657107
2000-01-08,-0.00285,-1.941527,-1.596266


In [12]:
s.array

<PandasArray>
[ 0.5258261669918581,  0.2937286098945143,  0.4139620255587831,
 -0.8211425490266008, -0.8592098211962584]
Length: 5, dtype: float64

In [13]:
s.index.array

<PandasArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [14]:
s.to_numpy()

array([ 0.52582617,  0.29372861,  0.41396203, -0.82114255, -0.85920982])

In [15]:
np.asarray(s)

array([ 0.52582617,  0.29372861,  0.41396203, -0.82114255, -0.85920982])

In [16]:
ser = pd.Series(pd.date_range("2000", periods=2, tz="CET"))
ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET')], dtype=object)

In [17]:
ser.to_numpy(dtype="datetime64[ns]")

array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [21]:
df.to_numpy()

array([[ 1.36616745e+00,  5.23499452e-01,  8.25230678e-01],
       [ 6.39035181e-01,  1.88944533e+00, -1.04319511e+00],
       [-1.19462983e+00, -1.68663677e-02,  3.18086471e+00],
       [-7.46063534e-01,  1.34872461e+00,  2.82724847e-03],
       [-8.05663935e-01,  1.23559946e+00, -8.71682680e-01],
       [ 5.85431160e-01, -9.10343081e-01,  4.49981721e-01],
       [-1.10199154e-01, -1.44368890e+00, -1.65710745e+00],
       [-2.85008397e-03, -1.94152729e+00, -1.59626569e+00]])

In [2]:
df = pd.DataFrame(
    {
        "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]),
        "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]),
        "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]),
    }
)
df

Unnamed: 0,one,two,three
a,0.26731,-0.155084,
b,-0.277125,-0.360525,-1.056927
c,0.701973,1.079157,-0.621042
d,,-0.194711,-0.003709


In [5]:
row = df.iloc[1]
row

one      1.190047
two      0.957195
three    2.121506
Name: b, dtype: float64

In [6]:
column = df["two"]
column

a    0.048997
b    0.957195
c    0.588479
d    0.974571
Name: two, dtype: float64

In [7]:
df.sub(row, axis="columns")

Unnamed: 0,one,two,three
a,-1.730114,-0.908198,
b,0.0,0.0,0.0
c,-0.570109,-0.368716,-2.906507
d,,0.017376,-3.129463


In [37]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,-0.389835,-0.491962,
b,0.0,0.0,0.0
c,1.060337,-0.194208,0.830623
d,,-1.526401,1.009642


In [38]:
df.sub(column, axis="index")

Unnamed: 0,one,two,three
a,-1.015534,0.0,
b,-1.117662,0.0,-2.204822
c,0.136883,0.0,-1.179992
d,,0.0,0.331221


In [39]:
df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,-1.015534,0.0,
b,-1.117662,0.0,-2.204822
c,0.136883,0.0,-1.179992
d,,0.0,0.331221


In [40]:
dfmi = df.copy()
dfmi.index = pd.MultiIndex.from_tuples(
    [(1, "a"), (1, "b"), (1, "c"), (2, "a")], names=["first", "second"]
)
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,-0.810801,0.204733,
1,b,-0.420966,0.696696,-1.508126
1,c,0.639371,0.502488,-0.677504
2,a,,-0.829705,-0.498484


In [42]:
dfmi.sub(column, axis=0, level="second")

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,-1.015534,0.0,
1,b,-1.117662,0.0,-2.204822
1,c,0.136883,0.0,-1.179992
2,a,,-1.034438,-0.703217


In [43]:
s = pd.Series(np.arange(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [44]:
div, rem = divmod(s, 3)

In [45]:
div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int64

In [46]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int64

In [47]:
idx = pd.Index(np.arange(10))
idx

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [48]:
div, rem = divmod(idx, 3)

In [49]:
div

Int64Index([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype='int64')

In [50]:
rem

Int64Index([0, 1, 2, 0, 1, 2, 0, 1, 2, 0], dtype='int64')

In [52]:
div, rem = divmod(s, [2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

In [53]:
div

0    0
1    0
2    0
3    1
4    1
5    1
6    1
7    1
8    1
9    1
dtype: int64

In [54]:
rem

0    0
1    1
2    2
3    0
4    0
5    1
6    1
7    2
8    2
9    3
dtype: int64

In [55]:
df

Unnamed: 0,one,two,three
a,-0.810801,0.204733,
b,-0.420966,0.696696,-1.508126
c,0.639371,0.502488,-0.677504
d,,-0.829705,-0.498484


In [13]:
df2 = df.copy()
df2["three"] = df2["three"].fillna(1.0)
df2

Unnamed: 0,one,two,three
a,0.26731,-0.155084,1.0
b,-0.277125,-0.360525,-1.056927
c,0.701973,1.079157,-0.621042
d,,-0.194711,-0.003709


In [68]:
df + df2

Unnamed: 0,one,two,three
a,-1.621601,0.409466,
b,-0.841932,1.393391,-3.016253
c,1.278742,1.004976,-1.355007
d,,-1.65941,-0.996968


In [73]:
# Fill 0 to df's NaN
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,-1.621601,0.409466,1.0
b,-0.841932,1.393391,-3.016253
c,1.278742,1.004976,-1.355007
d,,-1.65941,-0.996968


In [74]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [75]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [76]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [80]:
(df > 0).any()

one       True
two       True
three    False
dtype: bool

In [84]:
(df > 0).any().any()

True

In [85]:
df.empty

False

In [87]:
pd.DataFrame(columns=list("ABC")).empty

True

In [88]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [89]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [90]:
np.nan == np.nan

False

In [91]:
(df + df).equals(df * 2)

True

In [92]:
df1 = pd.DataFrame({"col": ["foo", 0, np.nan]})
df1

Unnamed: 0,col
0,foo
1,0
2,


In [93]:
df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0])
df2

Unnamed: 0,col
2,
1,0
0,foo


In [95]:
df1.equals(df2)

False

In [96]:
df1.equals(df2.sort_index())

True

In [98]:
pd.Series(["foo", "bar", "baz"]) == "foo"

0     True
1    False
2    False
dtype: bool

In [99]:
pd.Index(["foo", "bar", "baz"]) == "foo"

array([ True, False, False])

In [100]:
pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [102]:
# Trying to compare Index or Series objects of different lengths will raise a ValueError
pd.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [103]:
np.array([1, 2, 3]) == np.array([2])

array([False,  True, False])

In [106]:
np.array([1, 2, 3]) == np.array([1, 2])

  np.array([1, 2, 3]) == np.array([1, 2])


False

In [107]:
df1 = pd.DataFrame(
    {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
)


df2 = pd.DataFrame(
    {
        "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
        "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
    }
)

In [108]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [109]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [110]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


In [111]:
def combiner(x, y):
    return np.where(pd.isna(x), y, x)

df1.combine(df2, combiner)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


In [113]:
df.mean(0)

one     -0.197465
two      0.143553
three   -0.894705
dtype: float64

In [114]:
df.mean(1)

a   -0.303034
b   -0.410799
c    0.154785
d   -0.664094
dtype: float64

In [115]:
df.sum(0, skipna=False)

one           NaN
two      0.574212
three         NaN
dtype: float64

In [116]:
df.sum(axis=1, skipna=True)

a   -0.606068
b   -1.232397
c    0.464355
d   -1.328189
dtype: float64

In [120]:
ts_stand = (df - df.mean()) / df.std()
ts_stand

Unnamed: 0,one,two,three
a,-0.817262,0.090017,
b,-0.297812,0.813863,-1.138651
c,1.115074,0.528117,0.403174
d,,-1.431997,0.735476


In [119]:
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

In [124]:
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)
xs_stand

Unnamed: 0,one,two,three
a,-0.707107,0.707107,
b,-0.009222,1.004579,-0.995357
c,0.669294,0.480236,-1.14953
d,,-0.707107,0.707107


In [126]:
xs_stand.std(1)

a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64

In [8]:
df.cumsum()

Unnamed: 0,one,two,three
a,-0.540066,0.048997,
b,0.649981,1.006192,2.121506
c,1.269919,1.594671,1.336505
d,,2.569242,0.328547


In [9]:
np.mean(df["one"])

0.42330644912981236

In [11]:
np.mean(df["one"].to_numpy())

nan

In [15]:
series = pd.Series(np.random.randn(500))
series[20:500] = np.nan
series[10:20] = 5
series

0      1.008167
1     -0.357753
2      0.789963
3     -0.123419
4     -0.729559
         ...   
495         NaN
496         NaN
497         NaN
498         NaN
499         NaN
Length: 500, dtype: float64

In [13]:
series.nunique()

11

In [16]:
series = pd.Series(np.random.randn(1000))
series

0     -0.368786
1     -0.338375
2     -2.060840
3      1.429603
4     -0.492523
         ...   
995   -0.283425
996    0.580248
997   -0.514491
998    0.456198
999   -1.291802
Length: 1000, dtype: float64

In [17]:
series[::2] = np.nan
series

0           NaN
1     -0.338375
2           NaN
3      1.429603
4           NaN
         ...   
995   -0.283425
996         NaN
997   -0.514491
998         NaN
999   -1.291802
Length: 1000, dtype: float64

In [19]:
series.describe()

count    500.000000
mean      -0.027825
std        1.058667
min       -3.638832
25%       -0.743970
50%       -0.014372
75%        0.648840
max        3.745826
dtype: float64

In [20]:
frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"])
frame.iloc[::2] = np.nan
frame

Unnamed: 0,a,b,c,d,e
0,,,,,
1,-0.617318,-0.302644,0.211979,1.605303,0.299476
2,,,,,
3,0.541672,0.541521,1.284402,0.262259,-0.711248
4,,,,,
...,...,...,...,...,...
995,0.037218,0.257484,0.717048,-1.212263,1.775367
996,,,,,
997,-0.945999,0.839212,1.417866,-0.692126,0.719172
998,,,,,


In [21]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,-0.04567,0.095824,0.00211,-0.031497,0.022232
std,1.035827,0.962578,1.023269,0.997566,0.984461
min,-3.092032,-2.770135,-3.372082,-3.475272,-2.909579
25%,-0.759708,-0.563228,-0.746362,-0.719054,-0.632752
50%,-0.050865,0.15295,0.026079,-0.053476,0.026856
75%,0.614389,0.697814,0.744785,0.617348,0.680644
max,2.901256,3.316428,2.981892,2.822832,3.053538


In [23]:
series.describe(percentiles=[0.05, 0.25, 0.75, 0.95])

count    500.000000
mean      -0.027825
std        1.058667
min       -3.638832
5%        -1.813087
25%       -0.743970
50%       -0.014372
75%        0.648840
95%        1.749538
max        3.745826
dtype: float64

In [25]:
s = pd.Series(["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"])
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [26]:
frame = pd.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)})
frame

Unnamed: 0,a,b
0,Yes,0
1,Yes,1
2,No,2
3,No,3


In [27]:
frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [28]:
frame.describe(include=["object"])

Unnamed: 0,a
count,4
unique,2
top,Yes
freq,2


In [32]:
# same then describe()
frame.describe(include=["number"])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [33]:
frame.describe(include="all")

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


In [35]:
s1 = pd.Series(np.random.randn(5))
s1

0    1.279049
1   -1.574451
2    0.343426
3   -1.188270
4    1.334404
dtype: float64

In [36]:
s1.idxmin(), s1.idxmax()

(1, 4)

In [37]:
df1 = pd.DataFrame(np.random.rand(5, 3), columns=["A", "B", "C"])
df1

Unnamed: 0,A,B,C
0,0.075371,0.639129,0.257608
1,0.798764,0.589435,0.410403
2,0.158976,0.754243,0.246718
3,0.063774,0.031912,0.562732
4,0.687316,0.745642,0.498192


In [38]:
df1.idxmin(axis=0)

A    3
B    3
C    2
dtype: int64

In [39]:
df1.idxmax(axis=1)

0    B
1    A
2    B
3    C
4    B
dtype: object

In [40]:
df3 =pd.DataFrame([2, 1, 1, 3, np.nan], columns=["A"], index=list("edcba"))
df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [41]:
df3["A"].idxmin()

'd'

In [42]:
data = np.random.randint(0, 7, size=50)
data

array([2, 1, 0, 4, 6, 6, 3, 1, 0, 6, 6, 4, 6, 0, 1, 6, 5, 2, 3, 5, 2, 3,
       0, 6, 0, 5, 5, 6, 6, 5, 6, 5, 3, 6, 6, 3, 1, 1, 4, 0, 3, 0, 2, 0,
       2, 0, 0, 5, 4, 5])

In [43]:
s = pd.Series(data)
s.value_counts()

6    12
0    10
5     8
3     6
2     5
1     5
4     4
dtype: int64

In [44]:
pd.value_counts(data)

6    12
0    10
5     8
3     6
2     5
1     5
4     4
dtype: int64

In [45]:
data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}
frame = pd.DataFrame(data)
frame.value_counts()

a  b
1  x    1
2  x    1
3  y    1
4  y    1
dtype: int64

In [48]:
# et the most frequently occurring value(s)
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])
s5.mode()

0    3
1    7
dtype: int64

In [49]:
df5 = pd.DataFrame(
    {
        "A": np.random.randint(0, 7, size=50),
        "B": np.random.randint(-10, 15, size=50),
    }
)
df5.mode()

Unnamed: 0,A,B
0,6.0,-8
1,,3
2,,12


In [51]:
arr = np.random.randn(20)
factor = pd.cut(arr, 4)
factor

[(-0.729, 0.317], (0.317, 1.363], (-1.78, -0.729], (0.317, 1.363], (-1.78, -0.729], ..., (-1.78, -0.729], (-0.729, 0.317], (-0.729, 0.317], (0.317, 1.363], (-0.729, 0.317]]
Length: 20
Categories (4, interval[float64, right]): [(-1.78, -0.729] < (-0.729, 0.317] < (0.317, 1.363] < (1.363, 2.409]]

In [53]:
factor = pd.cut(arr, [-5, -1, 0, 1, 5])
factor

[(-1, 0], (0, 1], (-5, -1], (0, 1], (-5, -1], ..., (-1, 0], (0, 1], (-1, 0], (1, 5], (0, 1]]
Length: 20
Categories (4, interval[int64, right]): [(-5, -1] < (-1, 0] < (0, 1] < (1, 5]]

In [54]:
arr = np.random.randn(30)
factor = pd.qcut(arr, [0, 0.25, 0.5, 0.75, 1])
factor

[(-1.952, -0.373], (-1.952, -0.373], (0.347, 2.009], (0.347, 2.009], (0.347, 2.009], ..., (-1.952, -0.373], (0.347, 2.009], (-0.0858, 0.347], (0.347, 2.009], (-1.952, -0.373]]
Length: 30
Categories (4, interval[float64, right]): [(-1.952, -0.373] < (-0.373, -0.0858] < (-0.0858, 0.347] < (0.347, 2.009]]

In [55]:
pd.value_counts(factor)

(-1.952, -0.373]     8
(0.347, 2.009]       8
(-0.373, -0.0858]    7
(-0.0858, 0.347]     7
dtype: int64

In [57]:
arr = np.random.randn(20)
factor = pd.cut(arr, [-np.inf, 0, np.inf])
factor

[(0.0, inf], (-inf, 0.0], (-inf, 0.0], (-inf, 0.0], (0.0, inf], ..., (-inf, 0.0], (-inf, 0.0], (-inf, 0.0], (-inf, 0.0], (0.0, inf]]
Length: 20
Categories (2, interval[float64, right]): [(-inf, 0.0] < (0.0, inf]]

In [7]:
def extract_city_name(df):
    """
    Chicago, IL -> Chicago for city_name column
    """
    df["city_name"] = df["city_and_code"].str.split(",").str.get(0)
    return df

def add_country_name(df, country_name=None):
    """
    Chicago -> Chicago-US for city_name column
    """
    col = "city_name"
    df["city_and_country"] = df[col] + country_name
    return df

df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]})

df_p

Unnamed: 0,city_and_code
0,"Chicago, IL"


In [8]:
add_country_name(extract_city_name(df_p), country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [13]:
# equivalent to
df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [14]:
import statsmodels.formula.api as sm

In [16]:
bb = pd.read_csv("data/baseball.csv", index_col="id")
bb

Unnamed: 0_level_0,player,year,stint,team,lg,g,ab,r,h,X2b,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
88641,womacto01,2006,2,CHN,NL,19,50,6,14,1,...,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0
88643,schilcu01,2006,1,BOS,AL,31,2,0,1,0,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
88645,myersmi01,2006,1,NYA,AL,62,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
88649,helliri01,2006,1,MIL,NL,20,3,0,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
88650,johnsra05,2006,1,NYA,AL,33,6,0,1,0,...,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89525,benitar01,2007,2,FLO,NL,34,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
89526,benitar01,2007,1,SFN,NL,19,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
89530,ausmubr01,2007,1,HOU,NL,117,349,38,82,16,...,25.0,6.0,1.0,37,74.0,3.0,6.0,4.0,1.0,11.0
89533,aloumo01,2007,1,NYN,NL,87,328,51,112,19,...,49.0,3.0,0.0,27,30.0,5.0,2.0,0.0,3.0,13.0


In [36]:
# TypeError: cannot convert the series to <class 'float'>
(
    bb.query("h > 0")
    .assign(ln_h=lambda df: np.long(df.h))
    .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)")
    .fit()
    .summary()
)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  .assign(ln_h=lambda df: np.long(df.h))


TypeError: cannot convert the series to <class 'int'>

In [40]:
df

Unnamed: 0,one,two,three
a,0.568993,0.612676,
b,1.135524,0.13708,-0.363783
c,-0.952731,-0.702573,-1.1998
d,,-0.940072,-0.503477


In [38]:
df.apply(np.mean)

one      0.250595
two     -0.223222
three   -0.689020
dtype: float64

In [39]:
df.apply(np.mean, axis=1)

a    0.590835
b    0.302940
c   -0.951701
d   -0.721774
dtype: float64

In [42]:
df.apply(lambda x: x.max() - x.min())

one      2.088255
two      1.552748
three    0.836017
dtype: float64

In [43]:
df.apply(np.cumsum)

Unnamed: 0,one,two,three
a,0.568993,0.612676,
b,1.704517,0.749756,-0.363783
c,0.751785,0.047183,-1.563583
d,,-0.892889,-2.067059


In [44]:
df.apply(np.exp)

Unnamed: 0,one,two,three
a,1.766488,1.845363,
b,3.112803,1.14692,0.695042
c,0.385686,0.495309,0.301255
d,,0.3906,0.604425


In [45]:
df.apply("mean")

one      0.250595
two     -0.223222
three   -0.689020
dtype: float64

In [46]:
df.apply("mean", axis=1)

a    0.590835
b    0.302940
c   -0.951701
d   -0.721774
dtype: float64

In [47]:
tsdf = pd.DataFrame(
    np.random.randn(1000, 3),
    columns=["A", "B", "C"],
    index=pd.date_range("1/1/2000", periods=1000),
)
tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.731615,0.260175,-1.645775
2000-01-02,0.358248,0.672934,-1.165767
2000-01-03,0.599483,0.340191,-1.606369
2000-01-04,-0.447228,-0.619150,-0.746472
2000-01-05,0.380651,0.620072,-0.145698
...,...,...,...
2002-09-22,-0.565984,0.283167,-0.431209
2002-09-23,0.139395,0.318289,1.515953
2002-09-24,0.290673,0.398629,-0.692182
2002-09-25,1.139847,-0.610552,-0.733604


In [48]:
tsdf.apply(lambda x: x.idxmax())

A   2001-03-22
B   2002-03-02
C   2000-01-11
dtype: datetime64[ns]

In [49]:
def subtract_and_divide(x, sub, divide=1):
    return (x - sub) / divide

In [50]:
df.apply(subtract_and_divide, args=(5,), divide=3)

Unnamed: 0,one,two,three
a,-1.477002,-1.462441,
b,-1.288159,-1.620973,-1.787928
c,-1.984244,-1.900858,-2.0666
d,,-1.980024,-1.834492


In [52]:
tsdf = pd.DataFrame(
    np.random.randn(10, 3),
    columns=["A", "B", "C"],
    index=pd.date_range("2021-01-01", periods=10),
)
tsdf

Unnamed: 0,A,B,C
2021-01-01,-0.233102,-2.231653,1.365148
2021-01-02,0.550046,-0.564479,-0.68493
2021-01-03,0.81401,-0.792941,-0.064929
2021-01-04,-0.898401,-1.008068,-1.63427
2021-01-05,-0.183274,-0.645887,-0.520968
2021-01-06,-0.376074,1.019671,-0.860777
2021-01-07,-0.584339,0.987945,0.099723
2021-01-08,-0.576051,-1.031796,-0.042856
2021-01-09,-0.996406,-0.910311,1.592586
2021-01-10,-0.363847,0.11265,-0.981771


In [54]:
tsdf.iloc[3:7] = np.nan
tsdf

Unnamed: 0,A,B,C
2021-01-01,-0.233102,-2.231653,1.365148
2021-01-02,0.550046,-0.564479,-0.68493
2021-01-03,0.81401,-0.792941,-0.064929
2021-01-04,,,
2021-01-05,,,
2021-01-06,,,
2021-01-07,,,
2021-01-08,-0.576051,-1.031796,-0.042856
2021-01-09,-0.996406,-0.910311,1.592586
2021-01-10,-0.363847,0.11265,-0.981771


In [56]:
tsdf.agg(np.sum)

A   -0.805350
B   -5.418529
C    1.183247
dtype: float64

In [57]:
tsdf.agg("sum")

A   -0.805350
B   -5.418529
C    1.183247
dtype: float64

In [58]:
tsdf.sum()

A   -0.805350
B   -5.418529
C    1.183247
dtype: float64

In [59]:
# Aggregation with multiple function
tsdf.agg(["sum"])

Unnamed: 0,A,B,C
sum,-0.80535,-5.418529,1.183247


In [60]:
tsdf.agg(["sum", "mean"])

Unnamed: 0,A,B,C
sum,-0.80535,-5.418529,1.183247
mean,-0.134225,-0.903088,0.197208


In [61]:
tsdf["A"].agg(["sum", "mean"])

sum    -0.805350
mean   -0.134225
Name: A, dtype: float64

In [62]:
tsdf["A"].agg(["sum", lambda x: x.mean()])

sum        -0.805350
<lambda>   -0.134225
Name: A, dtype: float64

In [63]:
def mymean(x):
    return x.mean()

In [64]:
tsdf["A"].agg(["sum", mymean])

sum      -0.805350
mymean   -0.134225
Name: A, dtype: float64

In [65]:
# Aggregation with dict
tsdf.agg({"A": "mean", "B": "sum"})

A   -0.134225
B   -5.418529
dtype: float64

In [66]:
tsdf.agg({"A": ["mean", "min"], "B": "sum"})

Unnamed: 0,A,B
mean,-0.134225,
min,-0.996406,
sum,,-5.418529


In [67]:
# Mixed dtypes
mdf = pd.DataFrame(
    {
        "A": [1, 2, 3],
        "B": [1.0, 2.0, 3.0],
        "C": ["foo", "bar", "baz"],
        "D": pd.date_range("20130101", periods=3),
    }
)
mdf.dtypes

A             int64
B           float64
C            object
D    datetime64[ns]
dtype: object

In [68]:
mdf.agg(["min", "sum"])

Unnamed: 0,A,B,C,D
min,1,1.0,bar,2013-01-01
sum,6,6.0,foobarbaz,NaT


In [69]:
# Custom describe
from functools import partial

In [70]:
q_25 = partial(pd.Series.quantile, q=0.25)
q_25.__name__ = "25%"
q_75 = partial(pd.Series.quantile, q=0.75)
q_75.__name__ = "75%"
tsdf.agg(["count", "mean", "std", "min", q_25, "median", q_75, "max"])

Unnamed: 0,A,B,C
count,6.0,6.0,6.0
mean,-0.134225,-0.903088,0.197208
std,0.688251,0.76684,1.058836
min,-0.996406,-2.231653,-0.981771
25%,-0.523,-1.001425,-0.52993
median,-0.298475,-0.851626,-0.053893
75%,0.354259,-0.621594,1.013147
max,0.81401,0.11265,1.592586


# Transform API

In [71]:
tsdf = pd.DataFrame(
    np.random.randn(10, 3),
    columns=["A", "B", "C"],
    index=pd.date_range("1/1/2000", periods=10),
)
tsdf.iloc[3:7] = np.nan
tsdf

Unnamed: 0,A,B,C
2000-01-01,1.335675,0.280823,-2.21404
2000-01-02,-0.653946,0.87182,1.611063
2000-01-03,1.478118,-0.324888,0.261084
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,0.596282,0.101502,-1.027672
2000-01-09,1.263444,-0.650917,0.720393
2000-01-10,-0.128905,0.38411,-0.218166


In [72]:
tsdf.transform(np.abs)

Unnamed: 0,A,B,C
2000-01-01,1.335675,0.280823,2.21404
2000-01-02,0.653946,0.87182,1.611063
2000-01-03,1.478118,0.324888,0.261084
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,0.596282,0.101502,1.027672
2000-01-09,1.263444,0.650917,0.720393
2000-01-10,0.128905,0.38411,0.218166


In [73]:
tsdf.transform("abs")

Unnamed: 0,A,B,C
2000-01-01,1.335675,0.280823,2.21404
2000-01-02,0.653946,0.87182,1.611063
2000-01-03,1.478118,0.324888,0.261084
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,0.596282,0.101502,1.027672
2000-01-09,1.263444,0.650917,0.720393
2000-01-10,0.128905,0.38411,0.218166


In [74]:
tsdf.transform(lambda x: x.abs())

Unnamed: 0,A,B,C
2000-01-01,1.335675,0.280823,2.21404
2000-01-02,0.653946,0.87182,1.611063
2000-01-03,1.478118,0.324888,0.261084
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,0.596282,0.101502,1.027672
2000-01-09,1.263444,0.650917,0.720393
2000-01-10,0.128905,0.38411,0.218166


In [75]:
np.abs(tsdf)

Unnamed: 0,A,B,C
2000-01-01,1.335675,0.280823,2.21404
2000-01-02,0.653946,0.87182,1.611063
2000-01-03,1.478118,0.324888,0.261084
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,0.596282,0.101502,1.027672
2000-01-09,1.263444,0.650917,0.720393
2000-01-10,0.128905,0.38411,0.218166


In [77]:
tsdf["A"].transform(np.abs)

2000-01-01    1.335675
2000-01-02    0.653946
2000-01-03    1.478118
2000-01-04         NaN
2000-01-05         NaN
2000-01-06         NaN
2000-01-07         NaN
2000-01-08    0.596282
2000-01-09    1.263444
2000-01-10    0.128905
Freq: D, Name: A, dtype: float64

# Transform with multiple functions

In [79]:
tsdf.transform([np.abs, lambda x: x + 1])

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,absolute,<lambda>,absolute,<lambda>,absolute,<lambda>
2000-01-01,1.335675,2.335675,0.280823,1.280823,2.21404,-1.21404
2000-01-02,0.653946,0.346054,0.87182,1.87182,1.611063,2.611063
2000-01-03,1.478118,2.478118,0.324888,0.675112,0.261084,1.261084
2000-01-04,,,,,,
2000-01-05,,,,,,
2000-01-06,,,,,,
2000-01-07,,,,,,
2000-01-08,0.596282,1.596282,0.101502,1.101502,1.027672,-0.027672
2000-01-09,1.263444,2.263444,0.650917,0.349083,0.720393,1.720393
2000-01-10,0.128905,0.871095,0.38411,1.38411,0.218166,0.781834


In [80]:
tsdf["A"].transform([np.abs, lambda x: x+1])

Unnamed: 0,absolute,<lambda>
2000-01-01,1.335675,2.335675
2000-01-02,0.653946,0.346054
2000-01-03,1.478118,2.478118
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,0.596282,1.596282
2000-01-09,1.263444,2.263444
2000-01-10,0.128905,0.871095


# Transforming with a dict

In [82]:
tsdf.transform({"A": np.abs, "B": lambda x: x+1})

Unnamed: 0,A,B
2000-01-01,1.335675,1.280823
2000-01-02,0.653946,1.87182
2000-01-03,1.478118,0.675112
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,0.596282,1.101502
2000-01-09,1.263444,0.349083
2000-01-10,0.128905,1.38411


In [83]:
tsdf.transform({"A": np.abs, "B": [lambda x: x+1, "sqrt"]})

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,A,B,B
Unnamed: 0_level_1,absolute,<lambda>,sqrt
2000-01-01,1.335675,1.280823,0.529927
2000-01-02,0.653946,1.87182,0.933713
2000-01-03,1.478118,0.675112,
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,0.596282,1.101502,0.318593
2000-01-09,1.263444,0.349083,
2000-01-10,0.128905,1.38411,0.619766


In [84]:
# Applying elementwise functions
df4 = df.copy()
df4

Unnamed: 0,one,two,three
a,0.568993,0.612676,
b,1.135524,0.13708,-0.363783
c,-0.952731,-0.702573,-1.1998
d,,-0.940072,-0.503477


In [85]:
def f(x):
    return len(str(x))

In [86]:
df4["one"].map(f)

a    18
b    18
c    19
d     3
Name: one, dtype: int64

In [87]:
df4.applymap(f)

Unnamed: 0,one,two,three
a,18,18,3
b,18,19,20
c,19,19,18
d,3,18,19


In [88]:
s = pd.Series(
    ["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"]
)
s

a      six
b    seven
c      six
d    seven
e      six
dtype: object

In [89]:
t = pd.Series({"six": 6.0, "seven": 7.0})
s.map(t)

a    6.0
b    7.0
c    6.0
d    7.0
e    6.0
dtype: float64

# Reindexing and altering labels

In [3]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a    2.263296
b    0.431162
c    0.785294
d    0.322235
e   -0.313682
dtype: float64

In [4]:
s.reindex(["e", "b", "f", "d"])

e   -0.313682
b    0.431162
f         NaN
d    0.322235
dtype: float64

In [5]:
df

Unnamed: 0,one,two,three
a,0.26731,-0.155084,
b,-0.277125,-0.360525,-1.056927
c,0.701973,1.079157,-0.621042
d,,-0.194711,-0.003709


In [6]:
df.reindex(index=["c", "f", "b"], columns=["three", "two", "one"])

Unnamed: 0,three,two,one
c,-0.621042,1.079157,0.701973
f,,,
b,-1.056927,-0.360525,-0.277125


In [7]:
df.reindex(["c", "f", "b"], axis="index")

Unnamed: 0,one,two,three
c,0.701973,1.079157,-0.621042
f,,,
b,-0.277125,-0.360525,-1.056927


In [8]:
rs = s.reindex(df.index)
rs

a    2.263296
b    0.431162
c    0.785294
d    0.322235
dtype: float64

In [9]:
rs.index is df.index

True

In [11]:
df.reindex(["c", "f", "b"], axis="index")

Unnamed: 0,one,two,three
c,0.701973,1.079157,-0.621042
f,,,
b,-0.277125,-0.360525,-1.056927


In [12]:
df.reindex(["three", "two", "one"] ,axis="columns")

Unnamed: 0,three,two,one
a,,-0.155084,0.26731
b,-1.056927,-0.360525,-0.277125
c,-0.621042,1.079157,0.701973
d,-0.003709,-0.194711,


In [18]:
df2 = df[["one", "two"]].iloc[:3]
df.reindex_like(df2)

Unnamed: 0,one,two
a,0.26731,-0.155084
b,-0.277125,-0.360525
c,0.701973,1.079157


In [20]:
s = pd.Series(np.random.randn(5), index=list("abcde"))
s

a   -2.199156
b    0.200774
c    0.042291
d    0.199900
e    0.578596
dtype: float64

In [21]:
s1 = s[:4]
s2 = s1[1:]
s2

b    0.200774
c    0.042291
d    0.199900
dtype: float64

In [23]:
# The align() method is the fastest way to simultaneously align two objects. It supports a join argument
s1.align(s2)

(a   -2.199156
 b    0.200774
 c    0.042291
 d    0.199900
 dtype: float64,
 a         NaN
 b    0.200774
 c    0.042291
 d    0.199900
 dtype: float64)

In [25]:
s1.align(s2, join="inner")

(b    0.200774
 c    0.042291
 d    0.199900
 dtype: float64,
 b    0.200774
 c    0.042291
 d    0.199900
 dtype: float64)

In [26]:
s1.align(s2, join="left")

(a   -2.199156
 b    0.200774
 c    0.042291
 d    0.199900
 dtype: float64,
 a         NaN
 b    0.200774
 c    0.042291
 d    0.199900
 dtype: float64)

In [27]:
df.align(df2, join="inner")

(        one       two
 a  0.267310 -0.155084
 b -0.277125 -0.360525
 c  0.701973  1.079157,
         one       two
 a  0.267310 -0.155084
 b -0.277125 -0.360525
 c  0.701973  1.079157)

In [28]:
df.align(df2, join="inner", axis=0)

(        one       two     three
 a  0.267310 -0.155084       NaN
 b -0.277125 -0.360525 -1.056927
 c  0.701973  1.079157 -0.621042,
         one       two
 a  0.267310 -0.155084
 b -0.277125 -0.360525
 c  0.701973  1.079157)

In [29]:
df.align(df2.iloc[0], axis=1)

(        one     three       two
 a  0.267310       NaN -0.155084
 b -0.277125 -1.056927 -0.360525
 c  0.701973 -0.621042  1.079157
 d       NaN -0.003709 -0.194711,
 one      0.267310
 three         NaN
 two     -0.155084
 Name: a, dtype: float64)

# Filling while reindexing

In [31]:
rng = pd.date_range("2021-03-01", periods=8)
rng

DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06', '2021-03-07', '2021-03-08'],
              dtype='datetime64[ns]', freq='D')

In [32]:
ts = pd.Series(np.random.randn(8), index=rng)
ts

2021-03-01   -0.109026
2021-03-02   -0.906217
2021-03-03    1.666415
2021-03-04   -1.420859
2021-03-05   -1.720513
2021-03-06   -0.903269
2021-03-07   -0.339864
2021-03-08    0.246095
Freq: D, dtype: float64

In [33]:
ts2 = ts[[0, 3, 6]]
ts2

2021-03-01   -0.109026
2021-03-04   -1.420859
2021-03-07   -0.339864
Freq: 3D, dtype: float64

In [34]:
ts2.reindex(ts.index)

2021-03-01   -0.109026
2021-03-02         NaN
2021-03-03         NaN
2021-03-04   -1.420859
2021-03-05         NaN
2021-03-06         NaN
2021-03-07   -0.339864
2021-03-08         NaN
Freq: D, dtype: float64

In [35]:
ts2.reindex(ts.index, method="ffill")

2021-03-01   -0.109026
2021-03-02   -0.109026
2021-03-03   -0.109026
2021-03-04   -1.420859
2021-03-05   -1.420859
2021-03-06   -1.420859
2021-03-07   -0.339864
2021-03-08   -0.339864
Freq: D, dtype: float64

In [36]:
ts2.reindex(ts.index, method="bfill")

2021-03-01   -0.109026
2021-03-02   -1.420859
2021-03-03   -1.420859
2021-03-04   -1.420859
2021-03-05   -0.339864
2021-03-06   -0.339864
2021-03-07   -0.339864
2021-03-08         NaN
Freq: D, dtype: float64

In [37]:
ts2.reindex(ts.index, method="nearest")

2021-03-01   -0.109026
2021-03-02   -0.109026
2021-03-03   -1.420859
2021-03-04   -1.420859
2021-03-05   -1.420859
2021-03-06   -0.339864
2021-03-07   -0.339864
2021-03-08   -0.339864
Freq: D, dtype: float64

In [39]:
ts2.reindex(ts.index).fillna(method="ffill")

2021-03-01   -0.109026
2021-03-02   -0.109026
2021-03-03   -0.109026
2021-03-04   -1.420859
2021-03-05   -1.420859
2021-03-06   -1.420859
2021-03-07   -0.339864
2021-03-08   -0.339864
Freq: D, dtype: float64

In [40]:
ts2.reindex(ts.index, method="ffill", limit=1)

2021-03-01   -0.109026
2021-03-02   -0.109026
2021-03-03         NaN
2021-03-04   -1.420859
2021-03-05   -1.420859
2021-03-06         NaN
2021-03-07   -0.339864
2021-03-08   -0.339864
Freq: D, dtype: float64

In [43]:
ts2.reindex(ts.index, method="ffill", tolerance=" 1 day")

2021-03-01   -0.109026
2021-03-02   -0.109026
2021-03-03         NaN
2021-03-04   -1.420859
2021-03-05   -1.420859
2021-03-06         NaN
2021-03-07   -0.339864
2021-03-08   -0.339864
Freq: D, dtype: float64

In [44]:
df

Unnamed: 0,one,two,three
a,0.26731,-0.155084,
b,-0.277125,-0.360525,-1.056927
c,0.701973,1.079157,-0.621042
d,,-0.194711,-0.003709


In [45]:
df.drop(["a", "b"], axis=0)

Unnamed: 0,one,two,three
c,0.701973,1.079157,-0.621042
d,,-0.194711,-0.003709


In [46]:
df.drop(["one"], axis=1)

Unnamed: 0,two,three
a,-0.155084,
b,-0.360525,-1.056927
c,1.079157,-0.621042
d,-0.194711,-0.003709


In [50]:
df.reindex(df.index.difference(["a", "d"]))

Unnamed: 0,one,two,three
b,-0.277125,-0.360525,-1.056927
c,0.701973,1.079157,-0.621042


In [51]:
s

a   -2.199156
b    0.200774
c    0.042291
d    0.199900
e    0.578596
dtype: float64

In [52]:
s.rename(str.upper)

A   -2.199156
B    0.200774
C    0.042291
D    0.199900
E    0.578596
dtype: float64

In [53]:
df.rename(
    columns={"one": "foo", "two": "bar"},
    index={"a": "apple", "b": "banana", "d": "durian"},
)

Unnamed: 0,foo,bar,three
apple,0.26731,-0.155084,
banana,-0.277125,-0.360525,-1.056927
c,0.701973,1.079157,-0.621042
durian,,-0.194711,-0.003709


In [54]:
df.rename({"one": "foo", "two": "bar"}, axis="columns")

Unnamed: 0,foo,bar,three
a,0.26731,-0.155084,
b,-0.277125,-0.360525,-1.056927
c,0.701973,1.079157,-0.621042
d,,-0.194711,-0.003709


In [59]:
df.rename({"a": "apple", "b": "banana", "d": "durian"}, axis="index")

Unnamed: 0,one,two,three
apple,0.26731,-0.155084,
banana,-0.277125,-0.360525,-1.056927
c,0.701973,1.079157,-0.621042
durian,,-0.194711,-0.003709


In [60]:
s.rename("scalar-name")

a   -2.199156
b    0.200774
c    0.042291
d    0.199900
e    0.578596
Name: scalar-name, dtype: float64

In [61]:
df = pd.DataFrame(
    {"x": [1, 2, 3, 4, 5, 6], "y": [10, 20, 30, 40, 50, 60]},
    index=pd.MultiIndex.from_product(
        [["a", "b", "c"], [1, 2]], names=["let", "num"]
    ),
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
let,num,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,10
a,2,2,20
b,1,3,30
b,2,4,40
c,1,5,50
c,2,6,60


In [63]:
df.rename_axis(index={"let": "abc"})

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
abc,num,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,10
a,2,2,20
b,1,3,30
b,2,4,40
c,1,5,50
c,2,6,60


In [64]:
df.rename_axis(index=str.upper)

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
LET,NUM,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,10
a,2,2,20
b,1,3,30
b,2,4,40
c,1,5,50
c,2,6,60


# Iteration

In [65]:
df = pd.DataFrame(
    {"col1": np.random.randn(3), "col2": np.random.randn(3)}, index=list("abc")
)
df

Unnamed: 0,col1,col2
a,-0.615132,-0.174063
b,1.268454,-1.566799
c,-0.424321,1.16062


In [66]:
for col in df:
    print(col)

col1
col2


In [67]:
df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
df

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


In [69]:
# Depending on the data types, the iterator returns a copy and not a view, and writing to it will have no effect!
for index, row in df.iterrows():
    row["a"] = 10
    
df

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


In [72]:
for label, ser in df.items():
    print(label)
    print(ser)

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


In [73]:
for row_index, row in df.iterrows():
    print(row_index, row, sep="\n")

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


In [76]:
df_orig = pd.DataFrame([[1, 1.5]], columns=["int", "float"])
df_orig.dtypes

int        int64
float    float64
dtype: object

In [77]:
row = next(df_orig.iterrows())[1]
row

int      1.0
float    1.5
Name: 0, dtype: float64

In [78]:
row["int"].dtype

dtype('float64')

In [79]:
df_orig["int"].dtype

dtype('int64')

In [82]:
df2 = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
df2

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,6


In [83]:
print(df2.T)

   0  1  2
x  1  2  3
y  4  5  6


In [87]:
df2_T = pd.DataFrame({idx: values for idx, values in df2.iterrows()})
print(df2_T)

   0  1  2
x  1  2  3
y  4  5  6


In [89]:
# The itertuples() method will return an iterator yielding a namedtuple for each row in the DataFrame.
for row in df.itertuples():
    print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')
