In [31]:
import numpy as np
import pandas as pd

# Series

In [32]:
# Five random floats, labelled 'a' through 'e'.
labels = list('abcde')
s = pd.Series(np.random.randn(len(labels)), index=labels)
s

a    0.034207
b   -0.736391
c    0.807280
d   -1.644674
e    0.562610
dtype: float64

In [33]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [34]:
pd.Series(np.random.randn(5))

0    0.334459
1   -0.025513
2   -0.185767
3   -0.536953
4   -1.972781
dtype: float64

In [35]:
d = {'b': 1, 'a': 0, 'c': 2}
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [36]:
d = {'a': 0.0, 'b': 1.0, 'c': 2.0}
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [37]:
# Pass the index as an ordered list: a set has no defined order, so the row
# order would change from one kernel run to the next. 'd' is not a key of
# the dict, so its value is NaN.
pd.Series(d, index=['d', 'c', 'a', 'b'])

d    NaN
c    2.0
a    0.0
b    1.0
dtype: float64

In [38]:
pd.Series(d, index=['b', 'd'])

b    1.0
d    NaN
dtype: float64

In [39]:
pd.Series(5, index=['a', 'b', 'c', 'd'])

a    5
b    5
c    5
d    5
dtype: int64

### Series is ndarray-like

In [40]:
s

a    0.034207
b   -0.736391
c    0.807280
d   -1.644674
e    0.562610
dtype: float64

In [41]:
s.iloc[0]

np.float64(0.03420744970591772)

In [42]:
s.iloc[:3]

a    0.034207
b   -0.736391
c    0.807280
dtype: float64

In [43]:
s.median()

np.float64(0.03420744970591772)

In [44]:
s[s > s.median()]

c    0.80728
e    0.56261
dtype: float64

In [45]:
s.iloc[[4, 3, 1]]

e    0.562610
d   -1.644674
b   -0.736391
dtype: float64

In [46]:
np.exp(s)

a    1.034799
b    0.478839
c    2.241802
d    0.193076
e    1.755248
dtype: float64

In [47]:
s.dtype

dtype('float64')

In [48]:
s.array

<NumpyExtensionArray>
[np.float64(0.03420744970591772), np.float64(-0.7363914779896635),
  np.float64(0.8072801177619391), np.float64(-1.6446736968622606),
  np.float64(0.5626099621567902)]
Length: 5, dtype: float64

In [49]:
s.to_numpy()

array([ 0.03420745, -0.73639148,  0.80728012, -1.6446737 ,  0.56260996])

### Series is dict-like

In [50]:
s

a    0.034207
b   -0.736391
c    0.807280
d   -1.644674
e    0.562610
dtype: float64

In [51]:
s['a']

np.float64(0.03420744970591772)

In [52]:
s['e'] = 12.0

In [53]:
s

a     0.034207
b    -0.736391
c     0.807280
d    -1.644674
e    12.000000
dtype: float64

In [54]:
s['f'] = 1.0

In [55]:
s

a     0.034207
b    -0.736391
c     0.807280
d    -1.644674
e    12.000000
f     1.000000
dtype: float64

In [56]:
'e' in s

True

In [57]:
'f' in s

True

In [58]:
'g' in s

False

In [59]:
# s['g']  # left commented out on purpose: 'g' is not a label of s ('g' in s is False), so this lookup raises KeyError

In [60]:
s.get('g')

In [61]:
s.get('g', np.nan)

nan

### Vectorized operations and label alignment with Series

In [62]:
s + s

a     0.068415
b    -1.472783
c     1.614560
d    -3.289347
e    24.000000
f     2.000000
dtype: float64

In [63]:
s * 2

a     0.068415
b    -1.472783
c     1.614560
d    -3.289347
e    24.000000
f     2.000000
dtype: float64

In [64]:
np.exp(s)

a         1.034799
b         0.478839
c         2.241802
d         0.193076
e    162754.791419
f         2.718282
dtype: float64

In [65]:
s.iloc[1:] + s.iloc[:-1]

a          NaN
b    -1.472783
c     1.614560
d    -3.289347
e    24.000000
f          NaN
dtype: float64

### Name attribute

In [66]:
s = pd.Series(np.random.randn(5), name='something')
s

0   -0.803860
1   -1.362852
2    0.114333
3   -0.528221
4   -0.127785
Name: something, dtype: float64

In [67]:
s.name

'something'

In [68]:
s2 = s.rename("different")
s2.name, s.name

('different', 'something')

In [69]:
# s and s2 refer to different objects: rename() handed back a new Series
# rather than renaming s itself. `is` is the idiomatic identity test.
s2 is s

False

# DataFrame

In [70]:
# Each column is a Series; the resulting frame is indexed by the union of
# their labels, so the shorter column gets NaN for the label it lacks.
d = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list('abc')),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
)
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [71]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [72]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [73]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['one', 'two', 'three'])

Unnamed: 0,one,two,three
d,,4.0,
b,2.0,2.0,
a,1.0,1.0,


In [74]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [75]:
df.columns

Index(['one', 'two'], dtype='object')

### From dict of ndarrays / lists

In [76]:
d = {
    "one": [1.0, 2.0, 3.0, 4.0],
    "two": [4.0, 3.0, 2.0, 1.0]
}
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [77]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


### From structured or record array

In [78]:
# Structured array with an int32, a float32 and a 10-byte string field.
# 'S10' replaces the 'a10' alias, which NumPy 2.0 deprecated (it emitted a
# DeprecationWarning here); the resulting dtype is the same.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
data


array([(0, 0., b''), (0, 0., b'')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [79]:
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [80]:
pd.DataFrame(data, index=['first', 'second'])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [81]:
pd.DataFrame(data, columns=["C", "A", "B"])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


### From a list of dicts

In [82]:
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [83]:
pd.DataFrame(data2, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [84]:
pd.DataFrame(data2, columns=['a', 'b'])

Unnamed: 0,a,b
0,1,2
1,5,10


### From a dict of tuples

In [85]:
# Tuple keys of the outer dict become the column MultiIndex; tuple keys of
# the inner dicts become the row MultiIndex. Row labels missing from a
# column's inner dict show up as NaN in that column.
df_mif = pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)
df_mif

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [86]:
df_mif.columns

MultiIndex([('a', 'b'),
            ('a', 'a'),
            ('a', 'c'),
            ('b', 'a'),
            ('b', 'b')],
           )

In [87]:
df_mif.index

MultiIndex([('A', 'B'),
            ('A', 'C'),
            ('A', 'D')],
           )

### From a series

In [88]:
ser = pd.Series(range(3), index=list('abc'), name='ser')
ser

a    0
b    1
c    2
Name: ser, dtype: int64

In [89]:
pd.DataFrame(ser)

Unnamed: 0,ser
a,0
b,1
c,2


### From a list of namedtuples

In [90]:
from collections import namedtuple

In [91]:
Point = namedtuple("Point", "x y")

In [92]:
pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)])

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


In [93]:
Point3D = namedtuple("Point3D", "x y z")

In [94]:
pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)])

Unnamed: 0,x,y,z
0,0,0,0.0
1,0,3,5.0
2,2,3,


### From a list of dataclasses

In [95]:
from dataclasses import make_dataclass

In [96]:
Point = make_dataclass("Point", [("x", int), ("y", int)])

In [97]:
pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


### Alternate constructors

In [98]:
# Default orient='columns': each dict key becomes a column.
pd.DataFrame.from_dict({"A": [1, 2, 3], "B": [4, 5, 6]})

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [99]:
# orient='index' turns each dict key into a row label; `columns` then names
# the positions of each row's list.
pd.DataFrame.from_dict(
    {"A": [1, 2, 3], "B": [4, 5, 6]},
    orient='index',
    columns=['one', 'two', 'three'],
)

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


In [100]:
# Same frame as above, kept under a name; a dict literal replaces the
# dict([...pairs...]) construction with no change in the result.
dffd = pd.DataFrame.from_dict({"A": [1, 2, 3], "B": [4, 5, 6]})
dffd

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [101]:
data

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [102]:
pd.DataFrame.from_records(data, index='C')

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
b'Hello',1,2.0
b'World',2,3.0


# Column selection, addition, deletion

In [103]:
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [104]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [105]:
df['three'] = df['one'] * df['two']

In [106]:
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [107]:
del df['two']

In [108]:
df

Unnamed: 0,one,three,flag
a,1.0,1.0,False
b,2.0,4.0,False
c,3.0,9.0,True
d,,,False


In [109]:
three = df.pop('three')

In [110]:
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [111]:
three

a    1.0
b    4.0
c    9.0
d    NaN
Name: three, dtype: float64

In [112]:
# Take the first two rows by position explicitly with .iloc. On assignment
# the shorter Series is aligned to df's index, so the remaining rows get NaN.
df['one_trunc'] = df['one'].iloc[:2]

In [113]:
df

Unnamed: 0,one,flag,one_trunc
a,1.0,False,1.0
b,2.0,False,2.0
c,3.0,True,
d,,False,


In [114]:
df.insert(1, "bar", df['one'])

In [115]:
df

Unnamed: 0,one,bar,flag,one_trunc
a,1.0,1.0,False,1.0
b,2.0,2.0,False,2.0
c,3.0,3.0,True,
d,,,False,


### Assigning new columns in method chains

In [118]:
# Two equal-length integer columns: a = 0..2, b = 3..5.
data = dict(
    a=[*range(3)],
    b=[*range(3, 6)],
)

In [125]:
df = pd.DataFrame(data, index=list('ABC'), columns=['a', "b"])
df

Unnamed: 0,a,b
A,0,3
B,1,4
C,2,5


In [131]:
new_df = df.assign(c=lambda x: x['a'] + x['b'])
new_df

Unnamed: 0,a,b,c
A,0,3,3
B,1,4,5
C,2,5,7


In [136]:
# Filter rows first, then add two columns in a single assign() call; the
# lambda for `e` reads the `d` column created just before it in the same call.
new_df.query("c >= 5").assign(
    d=lambda x: x.b + x.c,
    e=lambda x: x.c + x.d
)

Unnamed: 0,a,b,c,d,e
B,1,4,5,9,14
C,2,5,7,12,19


### Indexing / Selection

In [140]:
new_df.loc['B']

a    1
b    4
c    5
Name: B, dtype: int64

In [144]:
new_df.iloc[1]

a    1
b    4
c    5
Name: B, dtype: int64

### Data alignment and arithmetic

In [145]:
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
0,-0.794312,-0.8407,-0.839243,0.810189
1,-1.168536,-0.370068,0.87959,-0.924624
2,0.244707,-0.058891,0.678046,-0.371833
3,1.684533,-1.536581,1.017067,-0.492584
4,-0.251937,-0.07362,-0.449863,-0.347182
5,0.63941,-1.379026,-1.764542,-1.755236
6,0.591524,-1.923847,-1.311872,-1.081182
7,-1.163186,0.553939,-1.621797,-0.092126
8,-1.139441,-0.183574,-1.537992,0.641635
9,-0.449804,2.235504,1.733431,0.854747


In [146]:
df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
df2

Unnamed: 0,A,B,C
0,-0.892333,1.169232,0.296356
1,-0.316371,0.457749,-1.19766
2,0.980434,-0.958346,0.867762
3,-1.408785,-0.206348,-1.982458
4,-0.087204,0.25797,-1.464354
5,-0.008501,1.644866,-1.232502
6,-0.465088,0.332615,0.396138


In [147]:
df + df2

Unnamed: 0,A,B,C,D
0,-1.686646,0.328533,-0.542887,
1,-1.484907,0.087681,-0.318071,
2,1.225141,-1.017237,1.545807,
3,0.275748,-1.742929,-0.965391,
4,-0.339142,0.18435,-1.914217,
5,0.630908,0.265841,-2.997044,
6,0.126435,-1.591232,-0.915733,
7,,,,
8,,,,
9,,,,


In [None]:
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-0.374224,0.470632,1.718833,-1.734813
2,1.039019,0.781808,1.517289,-1.182023
3,2.478846,-0.695881,1.85631,-1.302773
4,0.542375,0.767079,0.38938,-1.157372
5,1.433722,-0.538326,-0.925299,-2.565425
6,1.385836,-1.083147,-0.472629,-1.891372
7,-0.368874,1.394639,-0.782554,-0.902316
8,-0.345129,0.657125,-0.698749,-0.168554
9,0.344509,3.076204,2.572674,0.044558


In [150]:
df * 5 + 2

Unnamed: 0,A,B,C,D
0,-1.971561,-2.203499,-2.196215,6.050947
1,-3.842679,0.14966,6.397948,-2.62312
2,3.223535,1.705544,5.390229,0.140834
3,10.422667,-5.682905,7.085335,-0.46292
4,0.740313,1.631898,-0.249315,0.264089
5,5.19705,-4.895128,-6.82271,-6.776178
6,4.957618,-7.619234,-4.559358,-3.405912
7,-3.81593,4.769694,-6.108986,1.539368
8,-3.697206,1.082128,-5.689958,5.208176
9,-0.249018,13.177521,10.667156,6.273736


In [151]:
1 / df

Unnamed: 0,A,B,C,D
0,-1.258951,-1.189485,-1.19155,1.234279
1,-0.855772,-2.702205,1.136894,-1.081521
2,4.086519,-16.980438,1.474827,-2.689378
3,0.593636,-0.650796,0.983219,-2.030111
4,-3.96924,-13.583206,-2.222899,-2.880332
5,1.563942,-0.72515,-0.566719,-0.569724
6,1.69055,-0.519792,-0.76227,-0.924913
7,-0.859708,1.805254,-0.6166,-10.854645
8,-0.877623,-5.447381,-0.650199,1.558518
9,-2.223192,0.447326,0.576891,1.169937


In [152]:
df ** 4

Unnamed: 0,A,B,C,D
0,0.398075,0.499532,0.496079,0.43087
1,1.864524,0.018755,0.598577,0.730905
2,0.003586,1.2e-05,0.211366,0.019116
3,8.052274,5.574703,1.070036,0.058874
4,0.004029,2.9e-05,0.040956,0.014529
5,0.167154,3.616506,9.694559,9.491648
6,0.12243,13.698784,2.961866,1.366457
7,1.830614,0.094156,6.918091,7.2e-05
8,1.685652,0.001136,5.595202,0.169493
9,0.040935,24.974799,9.028725,0.533766


In [153]:
# Two small boolean frames used to demonstrate the element-wise logical operators.
df1 = pd.DataFrame({'a': [True, False, True], 'b': [False, True, True]})
df2 = pd.DataFrame({'a': [False, True, True], 'b': [True, True, False]})

In [154]:
df2 & df2

Unnamed: 0,a,b
0,False,True
1,True,True
2,True,False


In [155]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [157]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [158]:
# Element-wise logical NOT; `~` is the inversion operator for boolean data
# (unary minus is arithmetic negation).
~df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


### Transposing

In [159]:
df[:5].T

Unnamed: 0,0,1,2,3,4
A,-0.794312,-1.168536,0.244707,1.684533,-0.251937
B,-0.8407,-0.370068,-0.058891,-1.536581,-0.07362
C,-0.839243,0.87959,0.678046,1.017067,-0.449863
D,0.810189,-0.924624,-0.371833,-0.492584,-0.347182


### DataFrame interoperability with NumPy functions

In [160]:
np.exp(df)

Unnamed: 0,A,B,C,D
0,0.451892,0.431409,0.432037,2.248334
1,0.310822,0.690687,2.40991,0.396681
2,1.277247,0.942809,1.970024,0.689469
3,5.389936,0.215115,2.765073,0.611045
4,0.777293,0.929024,0.637716,0.706677
5,1.895362,0.251824,0.171265,0.172867
6,1.806739,0.146044,0.269316,0.339194
7,0.312489,1.740093,0.197543,0.91199
8,0.319998,0.83229,0.214812,1.899585
9,0.637753,9.351196,5.660041,2.35078


In [161]:
np.asarray(df)

array([[-0.79431219, -0.84069977, -0.83924295,  0.81018941],
       [-1.16853578, -0.37006809,  0.87958957, -0.92462402],
       [ 0.24470704, -0.0588913 ,  0.67804574, -0.37183312],
       [ 1.68453344, -1.53658092,  1.01706697, -0.49258391],
       [-0.25193738, -0.07362032, -0.4498629 , -0.34718225],
       [ 0.63940993, -1.37902552, -1.76454205, -1.75523555],
       [ 0.59152352, -1.92384688, -1.31187161, -1.08118241],
       [-1.16318604,  0.55393878, -1.62179727, -0.09212646],
       [-1.13944129, -0.18357447, -1.53799152,  0.64163529],
       [-0.44980369,  2.23550424,  1.73343119,  0.85474714]])

In [162]:
ser = pd.Series([1, 2, 3, 4])

In [163]:
np.exp(ser)

0     2.718282
1     7.389056
2    20.085537
3    54.598150
dtype: float64

In [167]:
# Same labels in a different order, so binary operations between the two
# have to match values by label rather than by position.
ser1 = pd.Series({'a': 1, 'b': 2, 'c': 3})
ser2 = pd.Series({'b': 1, 'a': 2, 'c': 5})
ser1

a    1
b    2
c    3
dtype: int64

In [168]:
ser2

b    1
a    2
c    5
dtype: int64

In [169]:
np.remainder(ser1, ser2)

a    1
b    0
c    3
dtype: int64

In [None]:
e