In [1]:
from IPython.core.interactiveshell import InteractiveShell
# display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
import xarray as xr
from numpy.random import default_rng

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same random frames.
SEED = 42
rng = default_rng(SEED)

In [3]:
# console display limits: allow wider lines and more columns before truncation
# (rows have analogous options)
for option_name, option_value in {'display.width': 100, 'display.max_columns': 10}.items():
    pd.set_option(option_name, option_value)

## array creation

In [4]:
# Series from an array: a default integer index is attached
pd.Series(np.arange(10))

# Series from a dict: on current pandas the keys keep their insertion order;
# older versions sorted them, so pass index=... explicitly when the order matters
pd.Series({'b': 5, 'a': 6, 'c': 7})

# DataFrame from a NumPy record (structured) array: field names become the columns.
# 'S10' is a 10-byte bytes field; the old 'a10' spelling of the same dtype is
# deprecated as of NumPy 2.0. The values come back as bytes (b'Hello'), not str.
data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])
data[:] = [(1, 2., 'Hello'), (2, 3., "World")]
pd.DataFrame(data)

# multi-index DataFrame from a dict of dicts keyed by tuples:
# outer tuple keys -> column MultiIndex, inner tuple keys -> row MultiIndex,
# (row, column) pairs that are never given are filled with NaN
pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
    ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
    ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
    ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
    ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

b    5
a    6
c    7
dtype: int64

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


## dataframe structure

In [5]:
# frame-level metadata: index, columns, index.name, columns.name
# a Series additionally carries a `name`; to_frame() uses it as the column label
s1 = pd.Series([1, 2, 3, 4], name='seriesName')
s1.to_frame()

Unnamed: 0,seriesName
0,1
1,2
2,3
3,4


In [6]:
df = pd.DataFrame(rng.standard_normal(size=(10, 5)), columns=list('ABCDE'))

# Replace column 'A' with its integer-truncated version.
# Plain column assignment swaps in the new int column; `df.loc[:, 'A'] = ...`
# writes into the existing float64 column on pandas >= 2.0, so 'A' would stay float.
df['A'] = df['A'].astype(int)
df.dtypes  # each column has its own dtype

arr = df.to_numpy()  # NumPy array of the whole frame (preferred over .values)

A      int64
B    float64
C    float64
D    float64
E    float64
dtype: object

### instead of .values it's now recommended to use .to_numpy() or .array
* .array will return ExtensionArray, while .to_numpy() will return the underlying numpy array
* .to_numpy() will preserve the columns dtypes unlike .values
* recommended to avoid .values

### recommended to use numexpr and bottleneck libs together with pandas to improve performance

### recommended to use method chaining via .pipe(..)
Also recommended to use .apply(), .pipe(), .assign() in chains

In [7]:
df.index  # pd.Index object (enhanced ndarray)
df.index.dtype  # also has dtype


# a standalone Index built from labels
idx = pd.Index(list('abcde'))

# set operations for index: labels missing from idx (here 'f') are simply ignored
idx.intersection(['a', 'd', 'f'])
idx.difference(['a', 'd', 'f'])

RangeIndex(start=0, stop=10, step=1)

dtype('int64')

Index(['a', 'd'], dtype='object')

Index(['b', 'c', 'e'], dtype='object')

## transforms

In [8]:
np.exp(df)  # numpy ufuncs can be applied directly to a DataFrame; the result keeps the index and columns

Unnamed: 0,A,B,C,D,E
0,1.0,0.284935,0.220616,0.379378,0.577788
1,1.0,0.734707,0.757932,3.639181,3.426461
2,1.0,0.641202,0.130685,0.056933,0.944066
3,0.367879,2.671077,0.17999,3.045466,1.199649
4,2.718282,0.406107,0.324216,1.335973,2.606068
5,1.0,5.743279,0.141706,2.388818,3.272712
6,0.367879,0.92409,1.347554,1.342144,0.877569
7,1.0,0.885883,1.026472,2.714126,0.773955
8,1.0,0.223127,0.716671,2.114368,2.342586
9,1.0,0.27675,0.650226,1.028805,5.080646


In [9]:
# chain callables with .pipe: each step receives the previous result (exp followed by log gives back the input values)
df.pipe(np.exp).pipe(np.log)

Unnamed: 0,A,B,C,D,E
0,0.0,-1.255496,-1.51133,-0.969223,-0.548549
1,0.0,-0.308283,-0.277161,1.291759,1.231528
2,0.0,-0.444412,-2.034968,-2.865882,-0.057559
3,-1.0,0.982482,-1.714855,1.113654,0.182029
4,1.0,-0.901139,-1.126345,0.28966,0.957842
5,0.0,1.74803,-1.954,0.870799,1.185619
6,-1.0,-0.078946,0.298291,0.294268,-0.130599
7,0.0,-0.121171,0.026128,0.99847,-0.256241
8,0.0,-1.500014,-0.333139,0.748756,0.851255
9,0.0,-1.28464,-0.430435,0.028398,1.625438


In [10]:
df.apply(np.sum, axis=1)  # apply a function along an axis; axis=1 passes each row, giving one value per row

0   -4.284597
1    1.937842
2   -5.402820
3   -0.436690
4    0.220019
5    1.850448
6   -0.616986
7    0.647186
8   -0.233141
9   -0.061239
dtype: float64

In [11]:
# apply a Python function to every element individually -- very slow, not recommended
# (here `df ** 2` is the vectorised equivalent).
# DataFrame.applymap was renamed DataFrame.map in pandas 2.1; use whichever this version provides.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
elementwise(lambda x: x**2)

Unnamed: 0,A,B,C,D,E
0,0,1.576269,2.284118,0.939392,0.300906
1,0,0.095039,0.076818,1.668641,1.516661
2,0,0.197502,4.141094,8.213278,0.003313
3,1,0.96527,2.940728,1.240225,0.033135
4,1,0.812051,1.268652,0.083903,0.917462
5,0,3.05561,3.818115,0.75829,1.405693
6,1,0.006232,0.088977,0.086594,0.017056
7,0,0.014682,0.000683,0.996942,0.06566
8,0,2.250042,0.110981,0.560636,0.724635
9,0,1.6503,0.185274,0.000806,2.64205


## indexing, queries, assignment

In [12]:
# plain labels for the flat frame
idx = np.arange(5)
cols = list('ABCDE')

# two-level labels: (row number, 'a'/'b') for rows and (column letter, 'X'/'Y') for columns
idx_mi = pd.MultiIndex.from_arrays([idx, list('ababb')])
cols_mi = pd.MultiIndex.from_arrays([cols, list('XXYYY')])

df = pd.DataFrame(rng.standard_normal(size=(5, 5)), index=idx, columns=cols)

# build with the letters as the inner level, then swap the two levels on both axes
# so that 'X'/'Y' and 'a'/'b' end up as the outer levels
dfmi = (
    pd.DataFrame(rng.standard_normal(size=(5, 5)), index=idx_mi, columns=cols_mi)
    .swaplevel(axis=1)
    .swaplevel(axis=0)
)

df
dfmi

Unnamed: 0,A,B,C,D,E
0,-0.436383,-1.641516,-0.88954,0.512324,0.181061
1,-0.348914,1.260242,-1.119045,0.008187,-1.117537
2,-0.751471,0.826254,-2.792355,-1.434212,0.743541
3,-0.536881,0.51785,-0.585159,-0.478154,-0.443618
4,1.204825,-0.027534,0.886986,0.884659,-0.367832


Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,-0.898994,1.954438,0.680617,-0.147608,-0.065124
b,1,-0.002318,-1.443408,-0.544832,0.381091,-0.01841
a,2,1.358589,0.75429,0.981196,0.928981,0.875559
b,3,-0.172053,-1.522566,-0.874787,-1.272389,-0.697787
b,4,-2.654524,-0.241441,1.261998,1.69109,-0.502061


### simple df

In [13]:
df['A']  # get a column as a Series
df['A'] = np.ones(5)  # overwrite an existing column
df

df['M'] = -1  # add a new column (the scalar is broadcast to every row)

df.loc[2:4, ['A', 'M']] = -2  # set (or get) a row/column selection; .loc label slices include both endpoints
df

0   -0.436383
1   -0.348914
2   -0.751471
3   -0.536881
4    1.204825
Name: A, dtype: float64

Unnamed: 0,A,B,C,D,E
0,1.0,-1.641516,-0.88954,0.512324,0.181061
1,1.0,1.260242,-1.119045,0.008187,-1.117537
2,1.0,0.826254,-2.792355,-1.434212,0.743541
3,1.0,0.51785,-0.585159,-0.478154,-0.443618
4,1.0,-0.027534,0.886986,0.884659,-0.367832


Unnamed: 0,A,B,C,D,E,M
0,1.0,-1.641516,-0.88954,0.512324,0.181061,-1
1,1.0,1.260242,-1.119045,0.008187,-1.117537,-1
2,-2.0,0.826254,-2.792355,-1.434212,0.743541,-2
3,-2.0,0.51785,-0.585159,-0.478154,-0.443618,-2
4,-2.0,-0.027534,0.886986,0.884659,-0.367832,-2


In [14]:
# reading with a label that does not exist fails:
#df['N']  # missing key
#df[['N', 'A']]  # one of the keys is missing
#df.loc[2, ['N', 'M']]  # missing key, all won't work

# ...while assigning through .loc creates the missing column
df.loc[2, ['M', 'N']] = -3  # assignment - any missing key will be added, extra positions extended with NaNs
df

# when assigning a Series : values are matched on the index; rows of df without a match become NaN,
# labels that exist only in the Series (here 5..15) are dropped
df['E'] = pd.Series(1, index=np.arange(2, 16))  
df

# when list or ndarray is assigned to columns : must have the same length as column

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.641516,-0.88954,0.512324,0.181061,-1,
1,1.0,1.260242,-1.119045,0.008187,-1.117537,-1,
2,-2.0,0.826254,-2.792355,-1.434212,0.743541,-3,-3.0
3,-2.0,0.51785,-0.585159,-0.478154,-0.443618,-2,
4,-2.0,-0.027534,0.886986,0.884659,-0.367832,-2,


Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.641516,-0.88954,0.512324,,-1,
1,1.0,1.260242,-1.119045,0.008187,,-1,
2,-2.0,0.826254,-2.792355,-1.434212,1.0,-3,-3.0
3,-2.0,0.51785,-0.585159,-0.478154,1.0,-2,
4,-2.0,-0.027534,0.886986,0.884659,1.0,-2,


In [15]:
df.reindex(index=range(6))  # reindex -- labels not present in df (here 5) are added as all-NaN rows
df.reindex(index=range(6), method='ffill')  # reindex + fill the new rows, here by copying the previous existing row

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.641516,-0.88954,0.512324,,-1.0,
1,1.0,1.260242,-1.119045,0.008187,,-1.0,
2,-2.0,0.826254,-2.792355,-1.434212,1.0,-3.0,-3.0
3,-2.0,0.51785,-0.585159,-0.478154,1.0,-2.0,
4,-2.0,-0.027534,0.886986,0.884659,1.0,-2.0,
5,,,,,,,


Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.641516,-0.88954,0.512324,,-1,
1,1.0,1.260242,-1.119045,0.008187,,-1,
2,-2.0,0.826254,-2.792355,-1.434212,1.0,-3,-3.0
3,-2.0,0.51785,-0.585159,-0.478154,1.0,-2,
4,-2.0,-0.027534,0.886986,0.884659,1.0,-2,
5,-2.0,-0.027534,0.886986,0.884659,1.0,-2,


In [16]:
# auto alignment to index union
s1 = pd.Series(np.arange(5), index=np.arange(5))
s2 = pd.Series(np.arange(5, 10), index=np.arange(1, 6))
s1 + s2  # auto NaNs for missing indices (resulting series has index union as index)
(s1 + s2).dropna()  # drop NaN values

0     NaN
1     6.0
2     8.0
3    10.0
4    12.0
5     NaN
dtype: float64

1     6.0
2     8.0
3    10.0
4    12.0
dtype: float64

In [17]:
# assign method : returns a new frame with the extra column(s); df itself is left unchanged.
# Values can be arrays, Series or a callable that receives the current frame.
df.assign(F=lambda x: x['E']+ x['D'])

Unnamed: 0,A,B,C,D,E,M,N,F
0,1.0,-1.641516,-0.88954,0.512324,,-1,,
1,1.0,1.260242,-1.119045,0.008187,,-1,,
2,-2.0,0.826254,-2.792355,-1.434212,1.0,-3,-3.0,-0.434212
3,-2.0,0.51785,-0.585159,-0.478154,1.0,-2,,0.521846
4,-2.0,-0.027534,0.886986,0.884659,1.0,-2,,1.884659


In [18]:
x = 0.5  # threshold defined outside the query string
df.query('E > @x')  # query : filter rows with a string expression over column names; @name refers to a Python variable

Unnamed: 0,A,B,C,D,E,M,N
2,-2.0,0.826254,-2.792355,-1.434212,1.0,-3,-3.0
3,-2.0,0.51785,-0.585159,-0.478154,1.0,-2,
4,-2.0,-0.027534,0.886986,0.884659,1.0,-2,


In [19]:
df.loc[2]  # select the row labelled 2 (returned as a Series indexed by the column names)
df.loc[2] = -4  # assigning a scalar to a row broadcasts it to every column
df

A   -2.000000
B    0.826254
C   -2.792355
D   -1.434212
E    1.000000
M   -3.000000
N   -3.000000
Name: 2, dtype: float64

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.641516,-0.88954,0.512324,,-1,
1,1.0,1.260242,-1.119045,0.008187,,-1,
2,-4.0,-4.0,-4.0,-4.0,-4.0,-4,-4.0
3,-2.0,0.51785,-0.585159,-0.478154,1.0,-2,
4,-2.0,-0.027534,0.886986,0.884659,1.0,-2,


In [20]:
df.loc[8] = 4  # assigning to a non-existent row label appends that row
df

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.641516,-0.88954,0.512324,,-1,
1,1.0,1.260242,-1.119045,0.008187,,-1,
2,-4.0,-4.0,-4.0,-4.0,-4.0,-4,-4.0
3,-2.0,0.51785,-0.585159,-0.478154,1.0,-2,
4,-2.0,-0.027534,0.886986,0.884659,1.0,-2,
8,4.0,4.0,4.0,4.0,4.0,4,4.0


In [21]:
df1 = df.copy()  # independent copy, so the edit below does not touch df
df1.loc[1] = 5
df1.combine_first(df)  # combine dfs, priority to the first df (df1); the second one only fills its gaps

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.641516,-0.88954,0.512324,,-1,
1,5.0,5.0,5.0,5.0,5.0,5,5.0
2,-4.0,-4.0,-4.0,-4.0,-4.0,-4,-4.0
3,-2.0,0.51785,-0.585159,-0.478154,1.0,-2,
4,-2.0,-0.027534,0.886986,0.884659,1.0,-2,
8,4.0,4.0,4.0,4.0,4.0,4,4.0


In [22]:
df['A'].idxmin()  # index label of the smallest value in column A

2

### multi-index indexing

In [23]:
dfmi['X']  # outer column level: every column under 'X'
dfmi.loc['a']  # outer row level: every row under 'a'
dfmi.loc['a', 'X']  # both at once; the selected outer levels are dropped from the result

Unnamed: 0,Unnamed: 1,A,B
a,0,-0.898994,1.954438
b,1,-0.002318,-1.443408
a,2,1.358589,0.75429
b,3,-0.172053,-1.522566
b,4,-2.654524,-0.241441


Unnamed: 0_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,A,B,C,D,E
0,-0.898994,1.954438,0.680617,-0.147608,-0.065124
2,1.358589,0.75429,0.981196,0.928981,0.875559


Unnamed: 0,A,B
0,-0.898994,1.954438
2,1.358589,0.75429


In [24]:
dfmi[('X', 'A')]  # a tuple addresses several levels at once: one column as a Series
dfmi.loc[('a', 2)]  # one row as a Series
dfmi.loc[('a', 2), ('X', 'A')]  # one cell as a scalar

#dfmi.loc[('b', [3,4])]  # won't work
# selecting level by level does work, with a list or a slice on the inner level
dfmi.loc['b'].loc[[3,4]]
dfmi.loc['b'].loc[slice(3,4)]

a  0   -0.898994
b  1   -0.002318
a  2    1.358589
b  3   -0.172053
   4   -2.654524
Name: (X, A), dtype: float64

X  A    1.358589
   B    0.754290
Y  C    0.981196
   D    0.928981
   E    0.875559
Name: (a, 2), dtype: float64

1.3585893986259112

Unnamed: 0_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,A,B,C,D,E
3,-0.172053,-1.522566,-0.874787,-1.272389,-0.697787
4,-2.654524,-0.241441,1.261998,1.69109,-0.502061


Unnamed: 0_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,A,B,C,D,E
3,-0.172053,-1.522566,-0.874787,-1.272389,-0.697787
4,-2.654524,-0.241441,1.261998,1.69109,-0.502061


In [25]:
# with an explicit column selector, a list on the inner level tolerates missing labels
# (behaviour as captured in the output below; it may differ between pandas versions)
dfmi.loc[('b', [3,10]), :]  # won't complain that 10 is missing (will simply skip)
# dfmi.loc['b'].loc[[3, 10]]  # won't work, 10 is missing key
# dfmi.loc[('b', [3,10])]  # also won't work

Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
b,3,-0.172053,-1.522566,-0.874787,-1.272389,-0.697787


In [26]:
# multi-index assignment

dfmi.loc[('a', 0), ('X', 'A')] = -5  # individual cell
dfmi

dfmi['X'] = rng.standard_normal(size=(5, 2))  # to an outer-level column: fills both columns under 'X'
dfmi[('X', 'A')] = rng.standard_normal(size=(5, 1))  # to a single column addressed through all column levels

dfmi.loc['a', 'X'] = rng.standard_normal(size=(2, 2))  # to col-row selection (outer levels on both axes)
dfmi.loc[('a', [0, 2]), 'X'] = rng.standard_normal(size=(2, 2))  # mixed: tuple for rows, outer level for columns
dfmi.loc['b', ('Y', slice('C', 'D'))] = rng.standard_normal(size=(3, 2))  # using slice on the inner column level
dfmi.loc[:, ('Y', slice('C', 'D'))] = rng.standard_normal(size=(5, 2))  # skipping the row levels (all rows)

Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,-5.0,1.954438,0.680617,-0.147608,-0.065124
b,1,-0.002318,-1.443408,-0.544832,0.381091,-0.01841
a,2,1.358589,0.75429,0.981196,0.928981,0.875559
b,3,-0.172053,-1.522566,-0.874787,-1.272389,-0.697787
b,4,-2.654524,-0.241441,1.261998,1.69109,-0.502061


## xarray and multi-index dataframes

In [27]:
dfmi.columns.get_level_values(1)  # labels of the inner column level, one entry per column

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [28]:
dfmi.to_xarray()  # convert the multi-index frame to an xarray object with pandas' own converter

In [29]:
xarr = df.to_xarray()  # same conversion for the flat frame, kept for later cells

In [30]:
# new frame with the outer ('a'/'b') row level removed; dfmi itself is untouched
dfmi_c = dfmi.droplevel(0, axis=0)
dfmi_c

Unnamed: 0_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,A,B,C,D,E
0,1.092035,1.154421,-1.869839,1.131734,-0.065124
1,-0.026844,-0.944534,0.870701,-0.546996,-0.01841
2,-1.982865,0.304443,0.499251,0.493117,0.875559
3,1.092661,-0.179011,-0.477309,0.20524,-0.697787
4,0.938481,0.754514,-0.2525,0.896321,-0.502061


In [60]:
# xr data array from simple pandas dataframe
# Start from dfmi rather than dfmi_c so the cell does not depend on which earlier cell
# last rebound dfmi_c: dfmi always has the two-level row index, whereas calling
# droplevel on an index that is already flat raises.
# rename_axis returns new axis objects, so the names are not written onto an Index
# that may be shared with the source frame.
df_tmp = (
    dfmi['X']
    .droplevel(0)  # keep only the numeric row level
    .rename_axis(index='index', columns='cols')  # axis names are picked up as the dimension names
)

xarr = xr.DataArray(df_tmp)
xarr

In [64]:
# xr dataset from MultiIndex dataframe
# Derived from dfmi (always two row levels) so the droplevel below cannot hit an
# already flattened index, whichever cell last assigned dfmi_c.
df_tmp = (
    dfmi
    .droplevel(0)  # keep only the numeric row level
    .rename_axis(index='index', columns=['outer', 'inner'])
)

# stack moves the 'outer' column level into the row index, so the converted object has
# dimensions ('index', 'outer') and one variable per remaining 'inner' column;
# (outer, inner) pairs that do not exist in dfmi become NaN
df_tmp = df_tmp.stack('outer')
df_tmp.to_xarray()

In [32]:
# Dataset with a single 2-D variable "X" on dims (index, cols), built from the 'X' block of dfmi_c
x_block = dfmi_c['X']
xds = xr.Dataset(
    data_vars={"X": (("index", "cols"), x_block.to_numpy())},
    coords={"index": x_block.index, "cols": x_block.columns},
)

In [33]:
# squared 'X' block, labelled with the same axis names as the dims of xds, added as variable Y
df_tmp = dfmi_c['X'].pow(2)
df_tmp.index.name, df_tmp.columns.name = 'index', 'cols'
xds = xds.assign(Y=df_tmp)

In [34]:
xds.to_dataframe().unstack('cols')  # back to pandas; unstack moves 'cols' from the row index into the columns

Unnamed: 0_level_0,X,X,Y,Y
cols,A,B,A,B
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1.092035,1.154421,1.192541,1.332687
1,-0.026844,-0.944534,0.000721,0.892144
2,-1.982865,0.304443,3.931753,0.092686
3,1.092661,-0.179011,1.193908,0.032045
4,0.938481,0.754514,0.880747,0.569292


In [35]:
# coordinate labels along the second dimension of the DataArray xarr; looked up through
# xarr.dims so it works whether that dimension is named 'dim_1' (unnamed pandas axes)
# or 'cols' (named axes)
xarr[xarr.dims[1]]

### more methods

In [36]:
s = pd.Series([1, 2, 2, 1, 3, 4, 1])
s.value_counts()  # count frequencies: unique values become the index, most frequent first

1    3
2    2
4    1
3    1
dtype: int64

In [37]:
# cut qcut
arr = rng.normal(1, size=100)  # 100 draws centred on 1

pd.cut(arr, [-2, -1, 0, 1, 2])  # specify bin for each value, arg = bin edges; values outside the edges become NaN

pd.qcut(arr, 10)  # same but bin edges are given by quantiles (here 10 quantile bins)

[(1.0, 2.0], (0.0, 1.0], (1.0, 2.0], (0.0, 1.0], NaN, ..., NaN, (0.0, 1.0], (0.0, 1.0], (-1.0, 0.0], NaN]
Length: 100
Categories (4, interval[int64]): [(-2, -1] < (-1, 0] < (0, 1] < (1, 2]]

[(1.154, 1.493], (0.643, 0.768], (1.154, 1.493], (0.463, 0.643], (2.574, 3.34], ..., (1.998, 2.574], (0.463, 0.643], (0.643, 0.768], (-1.581, -0.251], (2.574, 3.34]]
Length: 100
Categories (10, interval[float64]): [(-1.581, -0.251] < (-0.251, 0.463] < (0.463, 0.643] < (0.643, 0.768] ... (1.493, 1.682] < (1.682, 1.998] < (1.998, 2.574] < (2.574, 3.34]]

In [38]:
df = pd.DataFrame(rng.normal(1, size=(10, 5)))  # columns are labelled 0..4

# one result row per aggregation, one result column per input column
df.agg(['sum', 'mean', lambda x: x.sum()/(x**2).sum()])  # aggregation using specification or function

# one aggregation per selected column; the result is a Series indexed by those columns
df.agg({2: 'mean', 4: lambda x: x.sum()/(x**2).sum()})  # aggregation with dict (keys are some columns)

Unnamed: 0,0,1,2,3,4
sum,13.025109,8.411693,9.925012,14.756594,9.220737
mean,1.302511,0.841169,0.992501,1.475659,0.922074
<lambda_0>,0.544793,0.634045,0.445146,0.373499,0.68665


2    0.992501
4    0.686650
dtype: float64

In [39]:
# transform : with 1 function it behaves like a ufunc (same shape out as in);
# with several functions the columns become a MultiIndex of (original column, function name)
df.transform([np.abs, np.sign])  

Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4
Unnamed: 0_level_1,absolute,sign,absolute,sign,absolute,sign,absolute,sign,absolute,sign
0,0.358144,1.0,2.110127,1.0,0.358621,1.0,3.299538,1.0,0.790954,1.0
1,2.097706,1.0,1.900722,1.0,0.6608,-1.0,0.400776,1.0,1.841167,1.0
2,2.84353,1.0,0.685563,1.0,2.74878,1.0,3.584302,1.0,0.055578,-1.0
3,0.780657,1.0,1.384621,1.0,2.406934,1.0,0.693069,1.0,0.902938,1.0
4,1.702281,1.0,0.458253,1.0,1.852313,1.0,1.356744,1.0,1.750373,1.0
5,1.039405,1.0,1.505367,1.0,1.967069,1.0,0.190454,1.0,0.497382,-1.0
6,0.19502,1.0,0.056863,1.0,0.143962,1.0,0.7592,-1.0,1.411572,1.0
7,1.39405,1.0,0.33431,1.0,0.335509,1.0,1.958699,1.0,0.964674,1.0
8,2.11579,1.0,0.345799,-1.0,0.183553,-1.0,2.613877,1.0,1.311022,1.0
9,0.498525,1.0,0.321666,1.0,0.956176,1.0,1.418336,1.0,0.800997,1.0


In [40]:
s = pd.Series(rng.standard_normal(size=5), index=['a', 'b', 'c', 'd', 'e'])

s.reindex(['e', 'b', 'f', 'd'])  # reindex accepts unknown labels ('f') and fills them with NaN
#s.loc[['e', 'b', 'f', 'd']]  # this won't work

# replace the flat index by a two-level one: (letter, position)
s.index = pd.MultiIndex.from_tuples(zip(s.index, range(5)))
s
# NOTE(review): the leniency described below is what the captured output shows; confirm on the pandas version in use
s.loc[(['e', 'b', 'f', 'd'], 3)]  # for multiindex loc - this call won't complain about missing value 'f', would return empty Series if just 'f' passed

e    0.721702
b   -0.102135
f         NaN
d    1.505692
dtype: float64

a  0   -1.764375
b  1   -0.102135
c  2    0.582604
d  3    1.505692
e  4    0.721702
dtype: float64

d  3    1.505692
dtype: float64

In [41]:
s.reindex_like(s.iloc[:3])  # reindex s to the index of another object (here its own first three rows)

a  0   -1.764375
b  1   -0.102135
c  2    0.582604
dtype: float64

In [42]:
# aligning

# drawn from the notebook-wide generator `rng` rather than the legacy global np.random state
s = pd.Series(rng.standard_normal(size=5), index=['a', 'b', 'c', 'd', 'e'])
s1 = s.iloc[:4]  # positional slices, spelled with .iloc to make that explicit
s2 = s.iloc[1:]

# align returns a tuple (s1 reindexed, s2 reindexed) sharing one index
s1.align(s2)  # default: union of both indexes, gaps are NaN
s1.align(s2, join='inner')  # index intersection
s1.align(s2, join='left')  # index from s1

(a   -0.276867
 b    1.574290
 c    1.221033
 d    0.137324
 e         NaN
 dtype: float64,
 a         NaN
 b    1.574290
 c    1.221033
 d    0.137324
 e    0.332072
 dtype: float64)

(b    1.574290
 c    1.221033
 d    0.137324
 dtype: float64,
 b    1.574290
 c    1.221033
 d    0.137324
 dtype: float64)

(a   -0.276867
 b    1.574290
 c    1.221033
 d    0.137324
 dtype: float64,
 a         NaN
 b    1.574290
 c    1.221033
 d    0.137324
 dtype: float64)

In [43]:
gap = [np.nan] * 3  # run of three consecutive missing values
s = pd.Series([1, 2, 3, *gap, 3, 2, np.nan])
s

# forward-fill, carrying a value across at most `limit` consecutive NaNs
s.ffill(limit=2)
# (when filling via reindex, the tolerance argument caps the index distance instead)

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    NaN
6    3.0
7    2.0
8    NaN
dtype: float64

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    NaN
6    3.0
7    2.0
8    2.0
dtype: float64

Index(['a', 'd'], dtype='object')

Index(['b', 'c', 'e'], dtype='object')

In [44]:
s = pd.Series(0, index=['a', 'b', 'c', 'd', 'e'])
s.rename(index=str.upper)  # a function is applied to every index label
s.rename(index={'a': 1, 'b': 2})  # a dict relabels only the keys it lists

A    0
B    0
C    0
D    0
E    0
dtype: int64

1    0
2    0
c    0
d    0
e    0
dtype: int64

* iterating through pandas rows or columns is generally quite slow
* never modify a df while iterating over it

In [45]:
df = pd.DataFrame(rng.standard_normal(size=(10, 5)), columns=list('ABCDE'))

# row-wise iteration: yields (index label, row as a Series)
for idx, row in df.iterrows():
    print(idx, row['A'])

print('\n')

# column-wise iteration: yields (column label, column as a Series)
for col, ser in df.items():
    print(col, ser[0])


0 -0.14679375166518555
1 0.36812396565892314
2 -1.548684853154456
3 0.33190554799829286
4 -0.9787317539279442
5 0.6452847078690606
6 1.4936863077795022
7 1.1510284517405411
8 -0.2843230817504125
9 -0.7222785609058526


A -0.14679375166518555
B -0.25484794018001033
C 0.01978932911149114
D 0.4975885345607593
E 0.5686599430480725


In [46]:
# special accessors: .str on string Series, .dt on datetime Series
s = pd.Series(list('abc'))
s
s.str.upper()  # vectorised string methods

s = pd.Series(pd.date_range(start='2020-01-01', periods=3, freq='D'))
s.dt.hour  # datetime components and methods

0    a
1    b
2    c
dtype: object

0    A
1    B
2    C
dtype: object

0    0
1    0
2    0
dtype: int64

In [47]:
s.dt.tz_localize('UTC')  # returns a timezone-aware Series; s itself is not modified
s.to_numpy()  # s is still timezone-naive datetime64[ns]

0   2020-01-01 00:00:00+00:00
1   2020-01-02 00:00:00+00:00
2   2020-01-03 00:00:00+00:00
dtype: datetime64[ns, UTC]

array(['2020-01-01T00:00:00.000000000', '2020-01-02T00:00:00.000000000',
       '2020-01-03T00:00:00.000000000'], dtype='datetime64[ns]')

pandas mostly uses dtype from numpy

In [48]:
# one column per dtype; all draws come from the notebook-wide generator `rng`
df2 = pd.DataFrame({
    'A': pd.Series(rng.standard_normal(size=8), dtype='float16'),
    'B': pd.Series(rng.standard_normal(size=8)),
    # draw integers that fit in uint8 directly: casting negative normal draws
    # to an unsigned type is not a well-defined conversion
    'C': pd.Series(rng.integers(0, 256, size=8, dtype='uint8')),
})

df2.dtypes  # dtypes are preserved

A    float16
B    float64
C      uint8
dtype: object

### indexing

### merge, join, concatenate

Unnamed: 0,A,B,C,D,E
3,-0.425278,0.340483,-0.59525,0.140818,-1.760717
4,0.81294,-1.669278,1.258363,-1.730505,0.11899
1,-0.040219,-2.567358,0.248036,-0.6074,0.261512
2,-0.878274,0.131925,1.054028,-1.37073,-0.605432
0,-0.867174,0.097179,1.101313,0.612201,-0.338587


Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
3,0,0.313713,-0.110327,-0.629369,-0.745841,-0.007229
4,1,-0.069496,-1.452713,1.624539,0.185257,0.413964
1,2,1.961134,0.228735,0.195359,-0.305363,-0.12892
2,3,0.980989,0.296938,1.368777,-0.626953,0.24813
0,4,-0.338303,-2.006355,0.740023,-0.123068,0.25991


In [49]:
df['A']  # [] selects a column
df.loc[2]  # .loc selects a row by its index label

0   -0.146794
1    0.368124
2   -1.548685
3    0.331906
4   -0.978732
5    0.645285
6    1.493686
7    1.151028
8   -0.284323
9   -0.722279
Name: A, dtype: float64

A   -1.548685
B    0.402558
C    1.252560
D   -1.290878
E   -0.356688
Name: 2, dtype: float64

In [50]:
dfmi['X']  # existing outer-level key

# dfmi['W']  # key error

dfmi[('X', 'A')]  
#dfmi[('W', 'A')]  # won't work, key error

# NOTE(review): leniency as captured in the output below; confirm on the pandas version in use
dfmi.loc[:, (['W'], 'A')]  # returns empty df, despite missing col 'W', won't complain

Unnamed: 0,Unnamed: 1,A,B
a,0,1.092035,1.154421
b,1,-0.026844,-0.944534
a,2,-1.982865,0.304443
b,3,1.092661,-0.179011
b,4,0.938481,0.754514


a  0    1.092035
b  1   -0.026844
a  2   -1.982865
b  3    1.092661
   4    0.938481
Name: (X, A), dtype: float64

a,0
b,1
a,2
b,3
b,4


In [51]:
# each variant starts from a fresh copy so that dfmi itself is never modified
dfmi_c = dfmi.copy()
dfmi_c.loc[:, ('X', slice('A', 'C'))] = -1  # label slice on the inner level; only the columns that exist under 'X' are set
dfmi_c

dfmi_c = dfmi.copy()
dfmi_c.loc[:, ('X', ['A', 'B'])] = np.ones((5, 2))  # explicit list of inner labels to assign to multiple MI cols
dfmi_c

dfmi_c = dfmi.copy()
dfmi_c.loc[:, ('X', slice(None))] = np.ones((5, 2))  # slice(None) : assign to all columns under 'X'
dfmi_c

Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,-1,-1,-1.869839,1.131734,-0.065124
b,1,-1,-1,0.870701,-0.546996,-0.01841
a,2,-1,-1,0.499251,0.493117,0.875559
b,3,-1,-1,-0.477309,0.20524,-0.697787
b,4,-1,-1,-0.2525,0.896321,-0.502061


Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,1.0,1.0,-1.869839,1.131734,-0.065124
b,1,1.0,1.0,0.870701,-0.546996,-0.01841
a,2,1.0,1.0,0.499251,0.493117,0.875559
b,3,1.0,1.0,-0.477309,0.20524,-0.697787
b,4,1.0,1.0,-0.2525,0.896321,-0.502061


Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,1.0,1.0,-1.869839,1.131734,-0.065124
b,1,1.0,1.0,0.870701,-0.546996,-0.01841
a,2,1.0,1.0,0.499251,0.493117,0.875559
b,3,1.0,1.0,-0.477309,0.20524,-0.697787
b,4,1.0,1.0,-0.2525,0.896321,-0.502061


Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
4,0,1.0,1.0,0.850048,-0.288182,-0.239323
0,1,1.0,1.0,-0.728966,-0.397659,-0.016662
3,2,1.0,1.0,0.268244,-1.219105,1.102786
1,3,1.0,1.0,1.484307,0.49707,1.877421
2,4,1.0,1.0,0.585966,1.385612,1.5152
