In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
import xarray as xr
from numpy.random import default_rng
rng = default_rng()

In [3]:
# to show more columns (same for height and rows)
pd.set_option('display.width', 100)
pd.set_option('display.max_columns', 10)

## array creation

In [4]:
pd.Series(np.arange(10))

# in the latest pandas version dict keys has order and are not sorted when pd.Series is created; but better to control for the order (since older versions might be used)
pd.Series({'b': 5, 'a': 6, 'c': 7})

# from record array
data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')])
data[:] = [(1, 2., 'Hello'), (2, 3., "World")]
pd.DataFrame(data)

# multiindex df from dict of tuples
pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
    ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
    ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
    ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
    ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

b    5
a    6
c    7
dtype: int64

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


## dataframe structure

In [5]:
# index, columns, index.name, columns.name  

s1 = pd.Series([1, 2, 3, 4])
s1.name = 'seriesName'  # series name attribute
s1.to_frame()  # name assigned to column name

Unnamed: 0,seriesName
0,1
1,2
2,3
3,4


In [6]:
df = pd.DataFrame(rng.standard_normal(size=(10, 5)), columns=list('ABCDE'))

df.loc[:, 'A'] = df['A'].astype(int)
df.dtypes  # each column has separate dtype

arr = df.to_numpy()  # map to numpy array (new instead of .values)

A      int64
B    float64
C    float64
D    float64
E    float64
dtype: object

### instead of .values it's now recommended to use .to_numpy() or .array
* .array will return ExtensionArray, while .to_numpy() will return the underlying numpy array
* .to_numpy() will preserve the columns dtypes unlike .values
* recommended to avoid .values

### recommended to use numexpr and bottleneck libs together with pandas to improve performance

### recommended to use method chaining via .pipe(..)
Also recommended to use .apply(), .pipe(), assing() in chains

In [7]:
df.index  # pd.Index object (enhanced ndarray)
df.index.dtype  # also has dtype


idx = pd.Index(list('abcde'))

# set operations for index
idx.intersection(['a', 'd', 'f'])
idx.difference(['a', 'd', 'f'])

RangeIndex(start=0, stop=10, step=1)

dtype('int64')

Index(['a', 'd'], dtype='object')

Index(['b', 'c', 'e'], dtype='object')

## transforms

In [8]:
np.exp(df)  # numpy ufunc can be directly applied to dfs

Unnamed: 0,A,B,C,D,E
0,2.718282,0.929357,4.35527,1.598802,0.466404
1,1.0,1.17619,11.429937,1.518266,3.442129
2,0.367879,5.436713,0.146708,2.523187,0.804224
3,1.0,0.723333,0.939666,12.573446,0.265868
4,0.367879,0.355141,2.845751,0.557776,3.923175
5,1.0,0.251928,0.705121,0.407826,0.765101
6,1.0,0.814742,7.157554,0.576092,3.609155
7,0.367879,10.78968,0.422208,1.91692,1.9256
8,0.367879,0.233544,0.683013,5.067936,1.356913
9,1.0,0.811393,0.616263,1.566291,1.668159


In [9]:
df.pipe(np.exp).pipe(np.log)

Unnamed: 0,A,B,C,D,E
0,1.0,-0.073262,1.471387,0.469255,-0.762704
1,0.0,0.16228,2.436236,0.417569,1.23609
2,-1.0,1.693175,-1.919308,0.925523,-0.217878
3,0.0,-0.323886,-0.062231,2.531587,-1.324755
4,-1.0,-1.035239,1.045827,-0.583797,1.366901
5,0.0,-1.378611,-0.349386,-0.896914,-0.267747
6,0.0,-0.204884,1.968168,-0.551488,1.283474
7,-1.0,2.37859,-0.862257,0.65072,0.655238
8,-1.0,-1.454385,-0.381242,1.622934,0.305212
9,0.0,-0.209003,-0.484081,0.448711,0.51172


In [10]:
df.apply(np.sum, axis=1)  # apply function to rows/columns

0    2.104676
1    4.252175
2   -0.518489
3    0.820716
4   -0.206308
5   -2.892658
6    2.495271
7    1.822291
8   -0.907480
9    0.267347
dtype: float64

In [11]:
df.applymap(lambda x: x**2)  # apply function individually to each element -- very slow! not recommended

Unnamed: 0,A,B,C,D,E
0,1,0.005367,2.164978,0.2202,0.581717
1,0,0.026335,5.935246,0.174364,1.527919
2,1,2.866841,3.683745,0.856592,0.047471
3,0,0.104902,0.003873,6.408933,1.754975
4,1,1.071721,1.093754,0.340819,1.868419
5,0,1.900568,0.122071,0.804455,0.071689
6,0,0.041977,3.873687,0.304139,1.647305
7,1,5.657691,0.743486,0.423436,0.429336
8,1,2.115234,0.145345,2.633913,0.093155
9,0,0.043682,0.234335,0.201341,0.261858


## indexing, queries, assignment

In [12]:
idx = np.arange(5)
cols = list('ABCDE')
idx_mi = pd.MultiIndex.from_tuples(zip(idx, list('ababb')))
cols_mi = pd.MultiIndex.from_tuples(zip(cols, list('XXYYY')))

df = pd.DataFrame(rng.standard_normal(size=(5, 5)), index=idx, columns=cols)
dfmi = pd.DataFrame(rng.standard_normal(size=(5, 5)), index=idx_mi, columns=cols_mi).reorder_levels([1, 0], axis=1).reorder_levels([1, 0], axis=0)

df
dfmi

Unnamed: 0,A,B,C,D,E
0,-0.587041,-1.421001,1.497859,1.027689,0.702617
1,-0.896502,1.230444,0.79299,-1.539267,0.681518
2,0.463917,-0.439513,1.29659,-0.584431,-0.026376
3,0.290353,-0.533546,-0.53082,-0.288116,-0.028181
4,-0.92981,1.313871,-0.028742,0.478155,0.784489


Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,1.609335,-0.42889,-0.874882,1.653949,-0.159041
b,1,1.522344,0.729334,-0.427182,1.439313,-0.989335
a,2,0.17901,-1.134496,0.000474,-0.325043,-0.536254
b,3,-1.82098,-1.81704,1.10203,0.220951,2.631492
b,4,0.04826,-0.515004,0.546239,0.495834,-0.609241


### simple df

In [13]:
df['A']  # get
df['A'] = np.ones(5)  # adjust column
df

df['M'] = -1  # add new column

df.loc[2:4, ['A', 'M']] = -2  # change or get specify elements (use slices)
df

0   -0.587041
1   -0.896502
2    0.463917
3    0.290353
4   -0.929810
Name: A, dtype: float64

Unnamed: 0,A,B,C,D,E
0,1.0,-1.421001,1.497859,1.027689,0.702617
1,1.0,1.230444,0.79299,-1.539267,0.681518
2,1.0,-0.439513,1.29659,-0.584431,-0.026376
3,1.0,-0.533546,-0.53082,-0.288116,-0.028181
4,1.0,1.313871,-0.028742,0.478155,0.784489


Unnamed: 0,A,B,C,D,E,M
0,1.0,-1.421001,1.497859,1.027689,0.702617,-1
1,1.0,1.230444,0.79299,-1.539267,0.681518,-1
2,-2.0,-0.439513,1.29659,-0.584431,-0.026376,-2
3,-2.0,-0.533546,-0.53082,-0.288116,-0.028181,-2
4,-2.0,1.313871,-0.028742,0.478155,0.784489,-2


In [14]:
#df['N']  # missing key
#df[['N', 'A']]  # one of the keys is missing
#df.loc[2, ['N', 'M']]  # missing key, all won't work

df.loc[2, ['M', 'N']] = -3  # assignment - any missing key will be added, extra positions extended with NaNs
df

# when assigning a Series : index intersection values will be assigned, others will be NaNs
df['E'] = pd.Series(1, index=np.arange(2, 16))  
df

# when list or ndarray is assigned to columns : must have the same length as column

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.421001,1.497859,1.027689,0.702617,-1,
1,1.0,1.230444,0.79299,-1.539267,0.681518,-1,
2,-2.0,-0.439513,1.29659,-0.584431,-0.026376,-3,-3.0
3,-2.0,-0.533546,-0.53082,-0.288116,-0.028181,-2,
4,-2.0,1.313871,-0.028742,0.478155,0.784489,-2,


Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.421001,1.497859,1.027689,,-1,
1,1.0,1.230444,0.79299,-1.539267,,-1,
2,-2.0,-0.439513,1.29659,-0.584431,1.0,-3,-3.0
3,-2.0,-0.533546,-0.53082,-0.288116,1.0,-2,
4,-2.0,1.313871,-0.028742,0.478155,1.0,-2,


In [15]:
df.reindex(index=range(6))  # reindex -- new keys added with all NaNs
df.reindex(index=range(6), method='ffill')  # reindex + fill NaN with different methods

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.421001,1.497859,1.027689,,-1.0,
1,1.0,1.230444,0.79299,-1.539267,,-1.0,
2,-2.0,-0.439513,1.29659,-0.584431,1.0,-3.0,-3.0
3,-2.0,-0.533546,-0.53082,-0.288116,1.0,-2.0,
4,-2.0,1.313871,-0.028742,0.478155,1.0,-2.0,
5,,,,,,,


Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.421001,1.497859,1.027689,,-1,
1,1.0,1.230444,0.79299,-1.539267,,-1,
2,-2.0,-0.439513,1.29659,-0.584431,1.0,-3,-3.0
3,-2.0,-0.533546,-0.53082,-0.288116,1.0,-2,
4,-2.0,1.313871,-0.028742,0.478155,1.0,-2,
5,-2.0,1.313871,-0.028742,0.478155,1.0,-2,


In [16]:
# auto alignment to index union
s1 = pd.Series(np.arange(5), index=np.arange(5))
s2 = pd.Series(np.arange(5, 10), index=np.arange(1, 6))
s1 + s2  # auto NaNs for missing indices (resulting series has index union as index)
(s1 + s2).dropna()  # drop NaN values

0     NaN
1     6.0
2     8.0
3    10.0
4    12.0
5     NaN
dtype: float64

1     6.0
2     8.0
3    10.0
4    12.0
dtype: float64

In [17]:
# assign method : can pass arrays, series or lambda function (on the current df)
df.assign(F=lambda x: x['E']+ x['D'])

Unnamed: 0,A,B,C,D,E,M,N,F
0,1.0,-1.421001,1.497859,1.027689,,-1,,
1,1.0,1.230444,0.79299,-1.539267,,-1,,
2,-2.0,-0.439513,1.29659,-0.584431,1.0,-3,-3.0,0.415569
3,-2.0,-0.533546,-0.53082,-0.288116,1.0,-2,,0.711884
4,-2.0,1.313871,-0.028742,0.478155,1.0,-2,,1.478155


In [18]:
x = 0.5
df.query('E > @x')  # query : pass str with col names, @ to unpack exterior variables

Unnamed: 0,A,B,C,D,E,M,N
2,-2.0,-0.439513,1.29659,-0.584431,1.0,-3,-3.0
3,-2.0,-0.533546,-0.53082,-0.288116,1.0,-2,
4,-2.0,1.313871,-0.028742,0.478155,1.0,-2,


In [19]:
df.loc[2]  # row selection
df.loc[2] = -4  # assignment also works
df

A   -2.000000
B   -0.439513
C    1.296590
D   -0.584431
E    1.000000
M   -3.000000
N   -3.000000
Name: 2, dtype: float64

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.421001,1.497859,1.027689,,-1,
1,1.0,1.230444,0.79299,-1.539267,,-1,
2,-4.0,-4.0,-4.0,-4.0,-4.0,-4,-4.0
3,-2.0,-0.533546,-0.53082,-0.288116,1.0,-2,
4,-2.0,1.313871,-0.028742,0.478155,1.0,-2,


In [20]:
df.loc[8] = 4  # assinging to non-existent row
df

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.421001,1.497859,1.027689,,-1,
1,1.0,1.230444,0.79299,-1.539267,,-1,
2,-4.0,-4.0,-4.0,-4.0,-4.0,-4,-4.0
3,-2.0,-0.533546,-0.53082,-0.288116,1.0,-2,
4,-2.0,1.313871,-0.028742,0.478155,1.0,-2,
8,4.0,4.0,4.0,4.0,4.0,4,4.0


In [21]:
df1 = df.copy()
df1.loc[1] = 5
df1.combine_first(df)  # combine dfs, priority to the first df

Unnamed: 0,A,B,C,D,E,M,N
0,1.0,-1.421001,1.497859,1.027689,,-1,
1,5.0,5.0,5.0,5.0,5.0,5,5.0
2,-4.0,-4.0,-4.0,-4.0,-4.0,-4,-4.0
3,-2.0,-0.533546,-0.53082,-0.288116,1.0,-2,
4,-2.0,1.313871,-0.028742,0.478155,1.0,-2,
8,4.0,4.0,4.0,4.0,4.0,4,4.0


In [22]:
df['A'].idxmin()  # idx min

2

### multi-index indexing

In [23]:
dfmi['X']  # outer level indexing
dfmi.loc['a']
dfmi.loc['a', 'X']

Unnamed: 0,Unnamed: 1,A,B
a,0,1.609335,-0.42889
b,1,1.522344,0.729334
a,2,0.17901,-1.134496
b,3,-1.82098,-1.81704
b,4,0.04826,-0.515004


Unnamed: 0_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,A,B,C,D,E
0,1.609335,-0.42889,-0.874882,1.653949,-0.159041
2,0.17901,-1.134496,0.000474,-0.325043,-0.536254


Unnamed: 0,A,B
0,1.609335,-0.42889
2,0.17901,-1.134496


In [24]:
dfmi[('X', 'A')]  # multiple level indexing
dfmi.loc[('a', 2)]
dfmi.loc[('a', 2), ('X', 'A')]

#dfmi.loc[('b', [3,4])]  # won't work
dfmi.loc['b'].loc[[3,4]]
dfmi.loc['b'].loc[slice(3,4)]

a  0    1.609335
b  1    1.522344
a  2    0.179010
b  3   -1.820980
   4    0.048260
Name: (X, A), dtype: float64

X  A    0.179010
   B   -1.134496
Y  C    0.000474
   D   -0.325043
   E   -0.536254
Name: (a, 2), dtype: float64

0.17900978920438212

Unnamed: 0_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,A,B,C,D,E
3,-1.82098,-1.81704,1.10203,0.220951,2.631492
4,0.04826,-0.515004,0.546239,0.495834,-0.609241


Unnamed: 0_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,A,B,C,D,E
3,-1.82098,-1.81704,1.10203,0.220951,2.631492
4,0.04826,-0.515004,0.546239,0.495834,-0.609241


In [25]:
dfmi.loc[('b', [3,10]), :]  # won't complain that 10 is missing (will simply skip)
# dfmi.loc['b'].loc[[3, 10]]  # won't work, 10 is missing key
# dfmi.loc[('b', [3,10])]  # also won't work

Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
b,3,-1.82098,-1.81704,1.10203,0.220951,2.631492


In [26]:
# multi-index assignment

dfmi.loc[('a', 0), ('X', 'A')] = -5  # individual
dfmi

dfmi['X'] = rng.standard_normal(size=(5, 2))  # to outter level column
dfmi[('X', 'A')] = rng.standard_normal(size=(5, 1))  # to multiple column levels

dfmi.loc['a', 'X'] = rng.standard_normal(size=(2, 2))  # to col-row selection (outer)
dfmi.loc[('a', [0, 2]), 'X'] = rng.standard_normal(size=(2, 2))  # mixed
dfmi.loc['b', ('Y', slice('C', 'D'))] = rng.standard_normal(size=(3, 2))  # using slice
dfmi.loc[:, ('Y', slice('C', 'D'))] = rng.standard_normal(size=(5, 2))  # skipping level

Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,-5.0,-0.42889,-0.874882,1.653949,-0.159041
b,1,1.522344,0.729334,-0.427182,1.439313,-0.989335
a,2,0.17901,-1.134496,0.000474,-0.325043,-0.536254
b,3,-1.82098,-1.81704,1.10203,0.220951,2.631492
b,4,0.04826,-0.515004,0.546239,0.495834,-0.609241


## xarray and multi-index dataframes

In [27]:
dfmi.columns.get_level_values(1)

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [28]:
dfmi.to_xarray()

In [29]:
xarr = df.to_xarray()

In [30]:
dfmi_c = dfmi.copy()
dfmi_c.index = dfmi_c.index.droplevel(0)
dfmi_c

Unnamed: 0_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,A,B,C,D,E
0,-1.041204,0.358264,0.253278,0.465019,-0.159041
1,0.737073,-0.961975,-0.24692,-0.082882,-0.989335
2,1.568744,1.113205,-0.368844,0.472108,-0.536254
3,-0.794228,-2.336303,0.578159,1.023879,2.631492
4,-0.585514,-0.978626,-1.653375,-0.271403,-0.609241


In [31]:
# xr data array from simple pandas dataframe
df_tmp = dfmi_c['X']
df_tmp.index = df_tmp.index.droplevel(0)
df_tmp.index.name = 'index'
df_tmp.columns.name = 'cols'

xarr = xr.DataArray(df_tmp)  
xarr

ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.

In [32]:
# xr data array from MultiIndex dataframe
df_tmp = dfmi_c.copy()
df_tmp.index = df_tmp.index.droplevel(0)
df_tmp.index.name = 'index'
df_tmp.columns.names = ['outer',  'inner']

df_tmp = df_tmp.stack('outer')  # outer columns level will be the fields inside dataset (not symbols)
df_tmp.to_xarray()

ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.

In [33]:
xds = xr.Dataset(
     {
         "X": (
             ("index", "cols"),
             dfmi_c['X'].to_numpy(),
         )
     },
     coords={"index": dfmi_c['X'].index, "cols": dfmi_c['X'].columns},
 )

In [34]:
df_tmp = dfmi_c['X']**2
df_tmp.index.name = 'index'
df_tmp.columns.name = 'cols'
xds = xds.assign(Y=df_tmp)

In [35]:
xds.to_dataframe().unstack('cols')

Unnamed: 0_level_0,X,X,Y,Y
cols,A,B,A,B
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,-1.041204,0.358264,1.084106,0.128353
1,0.737073,-0.961975,0.543276,0.925396
2,1.568744,1.113205,2.460958,1.239226
3,-0.794228,-2.336303,0.630798,5.458312
4,-0.585514,-0.978626,0.342827,0.95771


In [36]:
xarr.dim_1

AttributeError: 'Dataset' object has no attribute 'dim_1'

### more methods

In [37]:
s = pd.Series([1, 2, 2, 1, 3, 4, 1])
s.value_counts()  # count frequencies

1    3
2    2
4    1
3    1
dtype: int64

In [38]:
# cut qcut
arr = rng.normal(1, size=100)

pd.cut(arr, [-2, -1, 0, 1, 2])  # specify bin for each value, arg = bin edges

pd.qcut(arr, 10)  # same but bin edges are given by quantiles

[(-1, 0], (0, 1], (0, 1], (0, 1], (1, 2], ..., (1.0, 2.0], (1.0, 2.0], (1.0, 2.0], (1.0, 2.0], NaN]
Length: 100
Categories (4, interval[int64]): [(-2, -1] < (-1, 0] < (0, 1] < (1, 2]]

[(-0.546, 0.136], (-0.546, 0.136], (0.545, 0.953], (0.545, 0.953], (1.381, 1.531], ..., (1.381, 1.531], (1.205, 1.381], (1.381, 1.531], (1.205, 1.381], (2.191, 3.128]]
Length: 100
Categories (10, interval[float64]): [(-1.9649999999999999, -0.546] < (-0.546, 0.136] < (0.136, 0.545] < (0.545, 0.953] ... (1.381, 1.531] < (1.531, 1.71] < (1.71, 2.191] < (2.191, 3.128]]

In [39]:
df = pd.DataFrame(rng.normal(1, size=(10, 5)))

df.agg(['sum', 'mean', lambda x: x.sum()/(x**2).sum()])  # aggregation using specification or function

df.agg({2: 'mean', 4: lambda x: x.sum()/(x**2).sum()})  # aggregation with dict (keys are some columns)

Unnamed: 0,0,1,2,3,4
sum,9.458429,12.964575,7.242496,15.313474,9.422411
mean,0.945843,1.296457,0.72425,1.531347,0.942241
<lambda_0>,0.612693,0.523325,0.963734,0.401169,0.567772


2    0.724250
4    0.567772
dtype: float64

In [40]:
# transform : with 1 function similar to ufunc, with multiple functions : multiindex is returned
df.transform([np.abs, np.sign])  

Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4
Unnamed: 0_level_1,absolute,sign,absolute,sign,absolute,sign,absolute,sign,absolute,sign
0,0.283112,-1.0,0.528012,1.0,1.170206,1.0,3.060797,1.0,0.73659,1.0
1,2.021603,1.0,1.033614,1.0,1.129234,1.0,0.026813,-1.0,2.209747,1.0
2,1.536799,1.0,0.444733,1.0,0.010807,1.0,0.874342,1.0,0.642048,-1.0
3,0.404857,1.0,0.926538,1.0,0.426954,1.0,2.955144,1.0,1.23705,1.0
4,1.558058,1.0,3.550563,1.0,0.826359,1.0,1.244988,1.0,0.319824,1.0
5,1.059222,1.0,0.600856,1.0,0.169668,-1.0,2.885923,1.0,1.346565,1.0
6,0.856279,1.0,2.011052,1.0,1.040916,1.0,2.083728,1.0,2.350831,1.0
7,1.22711,1.0,0.949256,1.0,1.352407,1.0,0.062873,1.0,1.260604,1.0
8,1.62868,1.0,1.148498,1.0,0.777598,1.0,0.080225,-1.0,0.216971,1.0
9,0.551067,-1.0,1.771452,1.0,0.677683,1.0,2.252717,1.0,0.386277,1.0


In [41]:
s = pd.Series(rng.standard_normal(size=5), index=['a', 'b', 'c', 'd', 'e'])

s.reindex(['e', 'b', 'f', 'd'])
#s.loc[['e', 'b', 'f', 'd']]  # this won't work

s.index = pd.MultiIndex.from_tuples(zip(s.index, range(5)))
s
s.loc[(['e', 'b', 'f', 'd'], 3)]  # for multiindex loc - this call won't complain about missing value 'f', would return empty Series if just 'f' passed

e    0.992928
b    0.912008
f         NaN
d   -0.729259
dtype: float64

a  0   -0.909457
b  1    0.912008
c  2    0.477306
d  3   -0.729259
e  4    0.992928
dtype: float64

d  3   -0.729259
dtype: float64

In [42]:
s.reindex_like(s.iloc[:3])

a  0   -0.909457
b  1    0.912008
c  2    0.477306
dtype: float64

In [43]:
# aligning

s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s1 = s[:4]
s2 = s[1:]

s1.align(s2)
s1.align(s2, join='inner')  # index intersection
s1.align(s2, join='left')  # index from s1

(a    1.020944
 b    0.261951
 c    0.626301
 d   -0.329211
 e         NaN
 dtype: float64,
 a         NaN
 b    0.261951
 c    0.626301
 d   -0.329211
 e    0.810246
 dtype: float64)

(b    0.261951
 c    0.626301
 d   -0.329211
 dtype: float64,
 b    0.261951
 c    0.626301
 d   -0.329211
 dtype: float64)

(a    1.020944
 b    0.261951
 c    0.626301
 d   -0.329211
 dtype: float64,
 a         NaN
 b    0.261951
 c    0.626301
 d   -0.329211
 dtype: float64)

In [44]:
s = pd.Series([1, 2, 3, np.nan, np.nan, np.nan, 3, 2, np.nan])
s

s.ffill(limit=2)  # ffill max, limit specifies the max ffill horizon
# tolerance argument specifies the max index distance for ffill

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    NaN
6    3.0
7    2.0
8    NaN
dtype: float64

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    NaN
6    3.0
7    2.0
8    2.0
dtype: float64

Index(['a', 'd'], dtype='object')

Index(['b', 'c', 'e'], dtype='object')

In [45]:
s = pd.Series(0, index=list('abcde'))
s.rename(str.upper)  # rename take a function to be applied to index, cols, or a dict#
s.rename({'a': 1, 'b': 2})

A    0
B    0
C    0
D    0
E    0
dtype: int64

1    0
2    0
c    0
d    0
e    0
dtype: int64

* iterating through pandas rows or columns is generally quite slow
* never modify a df while iterating over it

In [46]:
df = pd.DataFrame(rng.standard_normal(size=(10, 5)), columns=list('ABCDE'))

for idx, row in df.iterrows():
    print(idx, row['A'])

print('\n')

for col, ser in df.items():
    print(col, ser[0])


0 -1.564875638841265
1 -0.00975275883130248
2 -0.7979255264982319
3 1.025141820130099
4 0.07116824414361307
5 0.007518299876764157
6 -0.43202814627277786
7 -0.1133377704752121
8 -1.7443064859078656
9 1.2221873366418237


A -1.564875638841265
B 0.6055413419407139
C -0.7650759762582486
D 1.3299661267888203
E 1.4840629313670912


In [47]:
# special accessors
s = pd.Series(['a', 'b', 'c'])
s
s.str.upper()  # str methods

s = pd.Series(pd.date_range('2020-01-01', '2020-01-03'))
s.dt.hour  # datetime methods

0    a
1    b
2    c
dtype: object

0    A
1    B
2    C
dtype: object

0    0
1    0
2    0
dtype: int64

In [48]:
s.dt.tz_localize('UTC')
s.to_numpy()

0   2020-01-01 00:00:00+00:00
1   2020-01-02 00:00:00+00:00
2   2020-01-03 00:00:00+00:00
dtype: datetime64[ns, UTC]

array(['2020-01-01T00:00:00.000000000', '2020-01-02T00:00:00.000000000',
       '2020-01-03T00:00:00.000000000'], dtype='datetime64[ns]')

pandas mostly uses dtype from numpy

In [49]:
df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'),
 'B': pd.Series(np.random.randn(8)),
 'C': pd.Series(np.array(np.random.randn(8),
 dtype='uint8'))})

df2.dtypes  # dtypes are preserved 

A    float16
B    float64
C      uint8
dtype: object

### indexing

In [50]:
df['A']
df.loc[2]

0   -1.564876
1   -0.009753
2   -0.797926
3    1.025142
4    0.071168
5    0.007518
6   -0.432028
7   -0.113338
8   -1.744306
9    1.222187
Name: A, dtype: float64

A   -0.797926
B   -2.035976
C    0.444523
D    0.884394
E    1.729274
Name: 2, dtype: float64

In [51]:
dfmi['X']

# dfmi['W']  # key error

dfmi[('X', 'A')]  
#dfmi[('W', 'A')]  # won't work, key error

dfmi.loc[:, (['W'], 'A')]  # returns empty df, despite missing col 'W', won't complain

Unnamed: 0,Unnamed: 1,A,B
a,0,-1.041204,0.358264
b,1,0.737073,-0.961975
a,2,1.568744,1.113205
b,3,-0.794228,-2.336303
b,4,-0.585514,-0.978626


a  0   -1.041204
b  1    0.737073
a  2    1.568744
b  3   -0.794228
   4   -0.585514
Name: (X, A), dtype: float64

a,0
b,1
a,2
b,3
b,4


In [52]:
dfmi_c = dfmi.copy()
dfmi_c.loc[:, ('X', slice('A', 'C'))] = -1  # use loc to assign
dfmi_c

dfmi_c = dfmi.copy()
dfmi_c.loc[:, ('X', ['A', 'B'])] = np.ones((5, 2))  # use slice to assign to multiple MI cols
dfmi_c

dfmi_c = dfmi.copy()
dfmi_c.loc[:, ('X', slice(None))] = np.ones((5, 2))  # slice(None) : assign to all
dfmi_c

Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,-1,-1,0.253278,0.465019,-0.159041
b,1,-1,-1,-0.24692,-0.082882,-0.989335
a,2,-1,-1,-0.368844,0.472108,-0.536254
b,3,-1,-1,0.578159,1.023879,2.631492
b,4,-1,-1,-1.653375,-0.271403,-0.609241


Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,1.0,1.0,0.253278,0.465019,-0.159041
b,1,1.0,1.0,-0.24692,-0.082882,-0.989335
a,2,1.0,1.0,-0.368844,0.472108,-0.536254
b,3,1.0,1.0,0.578159,1.023879,2.631492
b,4,1.0,1.0,-1.653375,-0.271403,-0.609241


Unnamed: 0_level_0,Unnamed: 1_level_0,X,X,Y,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,C,D,E
a,0,1.0,1.0,0.253278,0.465019,-0.159041
b,1,1.0,1.0,-0.24692,-0.082882,-0.989335
a,2,1.0,1.0,-0.368844,0.472108,-0.536254
b,3,1.0,1.0,0.578159,1.023879,2.631492
b,4,1.0,1.0,-1.653375,-0.271403,-0.609241


## concat, merge, join

### concat

In [53]:
df1 = pd.DataFrame(rng.standard_normal(size=(7, 3)), index=list('abcdefg'))
df2 = pd.DataFrame(rng.standard_normal(size=(5, 3)), index=list('fgxyz'))

In [54]:
# plain concat, indexes are duplicated if necessary
pd.concat([df1, df2])

Unnamed: 0,0,1,2
a,-0.09987,2.283947,-0.746802
b,0.561272,-0.758681,-1.464779
c,-0.676359,0.347864,1.157671
d,1.082971,-0.507684,0.068476
e,-0.053727,-0.46688,-2.188589
f,-1.687722,-0.453636,1.449944
g,0.418448,0.136925,-0.459506
f,1.356902,1.024216,1.158559
g,1.032039,0.085899,-1.775441
x,0.834134,1.078912,0.273977


In [55]:
# can specify axis
pd.concat([df1, df2], axis=1)

Unnamed: 0,0,1,2,0.1,1.1,2.1
a,-0.09987,2.283947,-0.746802,,,
b,0.561272,-0.758681,-1.464779,,,
c,-0.676359,0.347864,1.157671,,,
d,1.082971,-0.507684,0.068476,,,
e,-0.053727,-0.46688,-2.188589,,,
f,-1.687722,-0.453636,1.449944,1.356902,1.024216,1.158559
g,0.418448,0.136925,-0.459506,1.032039,0.085899,-1.775441
x,,,,0.834134,1.078912,0.273977
y,,,,-0.042586,1.973453,-1.165919
z,,,,-0.142268,1.289577,1.686377


In [56]:
# ignore index, replace with range
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,0,1,2
0,-0.09987,2.283947,-0.746802
1,0.561272,-0.758681,-1.464779
2,-0.676359,0.347864,1.157671
3,1.082971,-0.507684,0.068476
4,-0.053727,-0.46688,-2.188589
5,-1.687722,-0.453636,1.449944
6,0.418448,0.136925,-0.459506
7,1.356902,1.024216,1.158559
8,1.032039,0.085899,-1.775441
9,0.834134,1.078912,0.273977


In [57]:
pd.concat([df1, df2], axis=1, join='inner')  # default 'outer' (for union), 'inner' for intersection

Unnamed: 0,0,1,2,0.1,1.1,2.1
f,-1.687722,-0.453636,1.449944,1.356902,1.024216,1.158559
g,0.418448,0.136925,-0.459506,1.032039,0.085899,-1.775441


In [58]:
# add extract index level with given keys
pd.concat([df1, df2], keys=[1, 2])

Unnamed: 0,Unnamed: 1,0,1,2
1,a,-0.09987,2.283947,-0.746802
1,b,0.561272,-0.758681,-1.464779
1,c,-0.676359,0.347864,1.157671
1,d,1.082971,-0.507684,0.068476
1,e,-0.053727,-0.46688,-2.188589
1,f,-1.687722,-0.453636,1.449944
1,g,0.418448,0.136925,-0.459506
2,f,1.356902,1.024216,1.158559
2,g,1.032039,0.085899,-1.775441
2,x,0.834134,1.078912,0.273977


In [59]:
df1.append(df2)  # alias for simple concat
df1.append([df1, df2])  # can take a list of frames
# append creates a copy and does not modify df1

Unnamed: 0,0,1,2
a,-0.09987,2.283947,-0.746802
b,0.561272,-0.758681,-1.464779
c,-0.676359,0.347864,1.157671
d,1.082971,-0.507684,0.068476
e,-0.053727,-0.46688,-2.188589
f,-1.687722,-0.453636,1.449944
g,0.418448,0.136925,-0.459506
f,1.356902,1.024216,1.158559
g,1.032039,0.085899,-1.775441
x,0.834134,1.078912,0.273977


Unnamed: 0,0,1,2
a,-0.09987,2.283947,-0.746802
b,0.561272,-0.758681,-1.464779
c,-0.676359,0.347864,1.157671
d,1.082971,-0.507684,0.068476
e,-0.053727,-0.46688,-2.188589
f,-1.687722,-0.453636,1.449944
g,0.418448,0.136925,-0.459506
a,-0.09987,2.283947,-0.746802
b,0.561272,-0.758681,-1.464779
c,-0.676359,0.347864,1.157671


In [60]:
type(df2[0])
pd.concat([df1, df2[0]], axis=1)  # concat dataframe with a series

pandas.core.series.Series

Unnamed: 0,0,1,2,0.1
a,-0.09987,2.283947,-0.746802,
b,0.561272,-0.758681,-1.464779,
c,-0.676359,0.347864,1.157671,
d,1.082971,-0.507684,0.068476,
e,-0.053727,-0.46688,-2.188589,
f,-1.687722,-0.453636,1.449944,1.356902
g,0.418448,0.136925,-0.459506,1.032039
x,,,,0.834134
y,,,,-0.042586
z,,,,-0.142268


### merge

In [69]:
# merge : takes 2 args with are dataframe or series

pd.merge(df1, df2, how='outer', on=0, suffixes=('A', 'B'))

Unnamed: 0,0,1A,2A,1B,2B
0,-0.09987,2.283947,-0.746802,,
1,0.561272,-0.758681,-1.464779,,
2,-0.676359,0.347864,1.157671,,
3,1.082971,-0.507684,0.068476,,
4,-0.053727,-0.46688,-2.188589,,
5,-1.687722,-0.453636,1.449944,,
6,0.418448,0.136925,-0.459506,,
7,1.356902,,,1.024216,1.158559
8,1.032039,,,0.085899,-1.775441
9,0.834134,,,1.078912,0.273977


In [62]:
pd.merge(df1, df2, how='left')

Unnamed: 0,0,1,2
0,-0.09987,2.283947,-0.746802
1,0.561272,-0.758681,-1.464779
2,-0.676359,0.347864,1.157671
3,1.082971,-0.507684,0.068476
4,-0.053727,-0.46688,-2.188589
5,-1.687722,-0.453636,1.449944
6,0.418448,0.136925,-0.459506


In [63]:
df1

Unnamed: 0,0,1,2
a,-0.09987,2.283947,-0.746802
b,0.561272,-0.758681,-1.464779
c,-0.676359,0.347864,1.157671
d,1.082971,-0.507684,0.068476
e,-0.053727,-0.46688,-2.188589
f,-1.687722,-0.453636,1.449944
g,0.418448,0.136925,-0.459506


In [64]:
from marketdata.statics import get_country_statics, get_currency_statics
currency_statics = get_currency_statics()
country_statics = get_country_statics()

In [65]:
currency_statics

Unnamed: 0_level_0,country,name,number
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AFN,Afghanistan,Afghani,971.0
EUR,Aland Islands,Euro,978.0
ALL,Albania,Lek,8.0
DZD,Algeria,Algerian Dinar,12.0
USD,American Samoa,US Dollar,840.0
...,...,...,...
XPF,Wallis and Futuna,CFP Franc,953.0
MAD,Western Sahara,Moroccan Dirham,504.0
YER,Yemen,Yemeni Rial,886.0
ZMW,Zambia,Zambian Kwacha,967.0


In [66]:
country_statics

Unnamed: 0,iso_code2,iso_code3,numeric
Afghanistan,AF,AFG,4
Aland Islands,AX,ALA,248
Albania,AL,ALB,8
Algeria,DZ,DZA,12
American Samoa,AS,ASM,16
...,...,...,...
Wallis and Futuna,WF,WLF,876
Western Sahara,EH,ESH,732
Yemen,YE,YEM,887
Zambia,ZM,ZMB,894


In [68]:
pd.merge(country_statics, currency_statics, left_index=True, right_on='country', how='outer')

Unnamed: 0,iso_code2,iso_code3,numeric,country,name,number
AFN,AF,AFG,4.0,Afghanistan,Afghani,971.0
EUR,AX,ALA,248.0,Aland Islands,Euro,978.0
ALL,AL,ALB,8.0,Albania,Lek,8.0
DZD,DZ,DZA,12.0,Algeria,Algerian Dinar,12.0
USD,AS,ASM,16.0,American Samoa,US Dollar,840.0
...,...,...,...,...,...,...
MAD,EH,ESH,732.0,Western Sahara,Moroccan Dirham,504.0
YER,YE,YEM,887.0,Yemen,Yemeni Rial,886.0
ZMW,ZM,ZMB,894.0,Zambia,Zambian Kwacha,967.0
ZWL,ZW,ZWE,716.0,Zimbabwe,Zimbabwe Dollar,932.0


In [79]:
import sqlite3

try:
    sqliteConnection = sqlite3.connect('SQLite_Python.db')
    cursor = sqliteConnection.cursor()
    print("Database created and Successfully Connected to SQLite")

    sqlite_select_Query = "select sqlite_version();"
    cursor.execute(sqlite_select_Query)
    record = cursor.fetchall()
    print("SQLite Database Version is: ", record)
    cursor.close()

except sqlite3.Error as error:
    print("Error while connecting to sqlite", error)
finally:
    if (sqliteConnection):
        sqliteConnection.close()
        print("The SQLite connection is closed")

Database created and Successfully Connected to SQLite


<sqlite3.Cursor at 0x7f6ffcfcdc70>

SQLite Database Version is:  [('3.32.3',)]
The SQLite connection is closed


In [80]:
from marketdata.daily import get_daily_data

In [83]:
currencies = ['EUR', 'CAD', 'SEK', 'JPY']
fx_rates = get_daily_data(currencies, base='USD')

In [84]:
fx_rates

In [91]:
currency_statics

Unnamed: 0_level_0,country,name,number
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AFN,Afghanistan,Afghani,971.0
EUR,Aland Islands,Euro,978.0
ALL,Albania,Lek,8.0
DZD,Algeria,Algerian Dinar,12.0
USD,American Samoa,US Dollar,840.0
...,...,...,...
XPF,Wallis and Futuna,CFP Franc,953.0
MAD,Western Sahara,Moroccan Dirham,504.0
YER,Yemen,Yemeni Rial,886.0
ZMW,Zambia,Zambian Kwacha,967.0


In [111]:
fx_rates_df = pd.concat({key: fx_rates[key].to_pandas().T for key in list(fx_rates.keys())}, axis=1)
fx_rates_df.columns.names = ['field', 'currency']

In [113]:
fx_rates_df

field,close,close,close,close,high,...,low,open,open,open,open
currency,CAD,EUR,JPY,SEK,CAD,...,SEK,CAD,EUR,JPY,SEK
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
2020-04-13,0.7198,1.0915,0.0093,0.0995,0.7212,...,0.0993,0.7147,1.0937,0.0092,0.1001
2020-04-14,0.7202,1.0983,0.0093,0.1002,0.7209,...,0.0995,0.7198,1.0915,0.0093,0.0995
2020-04-15,0.7081,1.0901,0.0093,0.0992,0.7205,...,0.0989,0.7199,1.0984,0.0093,0.1002
2020-04-16,0.7120,1.0860,0.0093,0.0994,0.7131,...,0.0987,0.7081,1.0902,0.0093,0.0992
2020-04-17,0.7138,1.0874,0.0093,0.0997,0.7139,...,0.0990,0.7118,1.0860,0.0093,0.0994
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-24,0.7563,1.1794,0.0094,0.1133,0.7611,...,0.1131,0.7589,1.1801,0.0094,0.1133
2020-08-25,0.7587,1.1833,0.0094,0.1138,0.7592,...,0.1132,0.7563,1.1794,0.0094,0.1133
2020-08-26,0.7603,1.1835,0.0094,0.1142,0.7613,...,0.1133,0.7591,1.1833,0.0094,0.1137
2020-08-27,0.7616,1.1817,0.0094,0.1141,0.7630,...,0.1135,0.7608,1.1836,0.0094,0.1142


In [152]:
currency_to_country = currency_statics['country'].drop_duplicates()

In [154]:
df = pd.merge(currency_to_country, fx_rates_df.T, left_index=True, right_on='currency')
df = df.sort_index()
df = df.reset_index().set_index(df.index.names + ['country']).T

In [155]:
df

field,close,close,close,close,close,...,open,open,open,open,open
currency,CAD,EUR,EUR,EUR,EUR,...,EUR,EUR,EUR,JPY,SEK
country,Canada,Aland Islands,Andorra,Austria,Belgium,...,Slovakia,Slovenia,Spain,Japan,Sweden
2020-04-13,0.7198,1.0915,1.0915,1.0915,1.0915,...,1.0937,1.0937,1.0937,0.0092,0.1001
2020-04-14,0.7202,1.0983,1.0983,1.0983,1.0983,...,1.0915,1.0915,1.0915,0.0093,0.0995
2020-04-15,0.7081,1.0901,1.0901,1.0901,1.0901,...,1.0984,1.0984,1.0984,0.0093,0.1002
2020-04-16,0.7120,1.0860,1.0860,1.0860,1.0860,...,1.0902,1.0902,1.0902,0.0093,0.0992
2020-04-17,0.7138,1.0874,1.0874,1.0874,1.0874,...,1.0860,1.0860,1.0860,0.0093,0.0994
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-24,0.7563,1.1794,1.1794,1.1794,1.1794,...,1.1801,1.1801,1.1801,0.0094,0.1133
2020-08-25,0.7587,1.1833,1.1833,1.1833,1.1833,...,1.1794,1.1794,1.1794,0.0094,0.1133
2020-08-26,0.7603,1.1835,1.1835,1.1835,1.1835,...,1.1833,1.1833,1.1833,0.0094,0.1137
2020-08-27,0.7616,1.1817,1.1817,1.1817,1.1817,...,1.1836,1.1836,1.1836,0.0094,0.1142


In [172]:
tmp = pd.merge(country_statics['iso_code2'], df.T, left_index=True, right_on=df.columns.get_level_values('country')).drop('key_0', axis=1)
tmp = tmp.reset_index().set_index(tmp.index.names + ['iso_code2']).sort_index().T

In [174]:
tmp['close']

currency,CAD,EUR,EUR,EUR,EUR,EUR,EUR,EUR,EUR,JPY,SEK
country,Canada,Aland Islands,Andorra,Austria,Belgium,...,Slovakia,Slovenia,Spain,Japan,Sweden
iso_code2,CA,AX,AD,AT,BE,...,SK,SI,ES,JP,SE
2020-04-13,0.7198,1.0915,1.0915,1.0915,1.0915,...,1.0915,1.0915,1.0915,0.0093,0.0995
2020-04-14,0.7202,1.0983,1.0983,1.0983,1.0983,...,1.0983,1.0983,1.0983,0.0093,0.1002
2020-04-15,0.7081,1.0901,1.0901,1.0901,1.0901,...,1.0901,1.0901,1.0901,0.0093,0.0992
2020-04-16,0.7120,1.0860,1.0860,1.0860,1.0860,...,1.0860,1.0860,1.0860,0.0093,0.0994
2020-04-17,0.7138,1.0874,1.0874,1.0874,1.0874,...,1.0874,1.0874,1.0874,0.0093,0.0997
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-24,0.7563,1.1794,1.1794,1.1794,1.1794,...,1.1794,1.1794,1.1794,0.0094,0.1133
2020-08-25,0.7587,1.1833,1.1833,1.1833,1.1833,...,1.1833,1.1833,1.1833,0.0094,0.1138
2020-08-26,0.7603,1.1835,1.1835,1.1835,1.1835,...,1.1835,1.1835,1.1835,0.0094,0.1142
2020-08-27,0.7616,1.1817,1.1817,1.1817,1.1817,...,1.1817,1.1817,1.1817,0.0094,0.1141


In [176]:
dfc = tmp['close']

In [192]:
dfc2 = dfc.droplevel(['currency', 'country'], axis=1)
dfc2.index.name = 'date'

In [185]:
statics_df = dfc.columns.to_frame(index=False).set_index('iso_code2')

In [193]:
xr.DataArray(dfc2)

In [204]:
dst = xr.Dataset()
dst['close'] = xr.DataArray(dfc2)
dst = dst.assign(statics_df.to_xarray())

In [286]:
statics_df.to_xarray()  # create xarray.Dataset with columns as separate fields
# can add it to existing dataset using assign

xr.Dataset.from_dataframe(statics_df)  # another way to do the same thing

In [224]:
# select from dataset, pass dict with coordinates as keys and values/slices/boolean indexes as values
dst.sel(dict(iso_code2=dst.iso_code2.isin(['CA', 'AT', 'ES']))) 

In [235]:
open_df = tmp['open'].droplevel(['currency', 'country'], axis=1)
open_df.index.name = 'date'
dst['open'] = open_df

In [236]:
list(dst.keys())  # get variable list

['close', 'currency', 'country', 'open']

In [237]:
dst

In [239]:
dst.sel(dict(iso_code2=['CA', 'ES']))

In [250]:
dst['date'] = pd.to_datetime(dst['date'].values)
dst.sel(dict(date='2020-04'))  # pandas datetime indexing also works from within xarray

In [252]:
dst[['close', 'open', 'currency']]  # select varible subset

In [257]:
dst.dims  # dimensions

Frozen(SortedKeysDict({'date': 100, 'iso_code2': 35}))

In [262]:
country_statics.index.name = 'name'
cst =country_statics.reset_index().set_index('iso_code2')
cst.columns.name = 'static_field'

In [267]:
dst['static'] = cst

In [268]:
dst

In [269]:
dst.sel(dict(iso_code2=['CA', 'ES', 'AD']))

In [278]:
month_ser = dst.date.to_pandas().dt.month  # dt specifier for pandas to access datetime methods (similar to str)
year_ser = dst.date.to_pandas().dt.year

In [281]:
date_info = pd.concat([month_ser, year_ser], keys=['month', 'year'], axis=1)
date_info.index.name = 'date'
date_info.columns.name = 'component'

In [283]:
dst['date_info'] = date_info

In [284]:
dst

In [285]:
dst[['open', 'close']]  # only relevant coordinates are left

In [289]:
dst[['open', 'close']].to_dataframe().unstack('iso_code2')  # MI dataframe from xr.Dataset fields

Unnamed: 0_level_0,open,open,open,open,open,...,close,close,close,close,close
iso_code2,AD,AT,AX,BE,CA,...,SK,SM,TF,VA,YT
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
2020-04-13,1.0937,1.0937,1.0937,1.0937,0.7147,...,1.0915,1.0915,1.0915,1.0915,1.0915
2020-04-14,1.0915,1.0915,1.0915,1.0915,0.7198,...,1.0983,1.0983,1.0983,1.0983,1.0983
2020-04-15,1.0984,1.0984,1.0984,1.0984,0.7199,...,1.0901,1.0901,1.0901,1.0901,1.0901
2020-04-16,1.0902,1.0902,1.0902,1.0902,0.7081,...,1.0860,1.0860,1.0860,1.0860,1.0860
2020-04-17,1.0860,1.0860,1.0860,1.0860,0.7118,...,1.0874,1.0874,1.0874,1.0874,1.0874
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-24,1.1801,1.1801,1.1801,1.1801,0.7589,...,1.1794,1.1794,1.1794,1.1794,1.1794
2020-08-25,1.1794,1.1794,1.1794,1.1794,0.7563,...,1.1833,1.1833,1.1833,1.1833,1.1833
2020-08-26,1.1833,1.1833,1.1833,1.1833,0.7591,...,1.1835,1.1835,1.1835,1.1835,1.1835
2020-08-27,1.1836,1.1836,1.1836,1.1836,0.7608,...,1.1817,1.1817,1.1817,1.1817,1.1817


In [298]:
# selecting column level second field - want to create column multiindex
df_tmp = dst[['open', 'close', 'currency']].to_dataframe()
df_tmp.columns.name = 'field'  # add name to the unique column level formed from dataset variables
df2 = df_tmp.reset_index().set_index(df_tmp.index.names + ['currency']).unstack(['iso_code2', 'currency'])

In [299]:
df2

field,open,open,open,open,open,...,close,close,close,close,close
iso_code2,CA,AX,AD,AT,BE,...,SK,SI,ES,JP,SE
currency,CAD,EUR,EUR,EUR,EUR,...,EUR,EUR,EUR,JPY,SEK
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3
2020-04-13,0.7147,1.0937,1.0937,1.0937,1.0937,...,1.0915,1.0915,1.0915,0.0093,0.0995
2020-04-14,0.7198,1.0915,1.0915,1.0915,1.0915,...,1.0983,1.0983,1.0983,0.0093,0.1002
2020-04-15,0.7199,1.0984,1.0984,1.0984,1.0984,...,1.0901,1.0901,1.0901,0.0093,0.0992
2020-04-16,0.7081,1.0902,1.0902,1.0902,1.0902,...,1.0860,1.0860,1.0860,0.0093,0.0994
2020-04-17,0.7118,1.0860,1.0860,1.0860,1.0860,...,1.0874,1.0874,1.0874,0.0093,0.0997
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-24,0.7589,1.1801,1.1801,1.1801,1.1801,...,1.1794,1.1794,1.1794,0.0094,0.1133
2020-08-25,0.7563,1.1794,1.1794,1.1794,1.1794,...,1.1833,1.1833,1.1833,0.0094,0.1138
2020-08-26,0.7591,1.1833,1.1833,1.1833,1.1833,...,1.1835,1.1835,1.1835,0.0094,0.1142
2020-08-27,0.7608,1.1836,1.1836,1.1836,1.1836,...,1.1817,1.1817,1.1817,0.0094,0.1141


In [312]:
df_tmp = pd.merge(currency_statics['name'], df2.T, left_index=True, right_on=df2.T.reset_index()['currency']).drop('key_0', axis=1)

In [314]:
df_tmp = df_tmp.reset_index().set_index(df_tmp.index.names + ['name']).T
df_tmp

field,open,open,open,open,open,...,close,open,close,open,close
iso_code2,AX,AD,AT,BE,CY,...,CA,JP,JP,SE,SE
currency,EUR,EUR,EUR,EUR,EUR,...,CAD,JPY,JPY,SEK,SEK
name,Euro,Euro,Euro,Euro,Euro,...,Canadian Dollar,Yen,Yen,Swedish Krona,Swedish Krona
2020-04-13,1.0937,1.0937,1.0937,1.0937,1.0937,...,0.7198,0.0092,0.0093,0.1001,0.0995
2020-04-14,1.0915,1.0915,1.0915,1.0915,1.0915,...,0.7202,0.0093,0.0093,0.0995,0.1002
2020-04-15,1.0984,1.0984,1.0984,1.0984,1.0984,...,0.7081,0.0093,0.0093,0.1002,0.0992
2020-04-16,1.0902,1.0902,1.0902,1.0902,1.0902,...,0.7120,0.0093,0.0093,0.0992,0.0994
2020-04-17,1.0860,1.0860,1.0860,1.0860,1.0860,...,0.7138,0.0093,0.0093,0.0994,0.0997
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-24,1.1801,1.1801,1.1801,1.1801,1.1801,...,0.7563,0.0094,0.0094,0.1133,0.1133
2020-08-25,1.1794,1.1794,1.1794,1.1794,1.1794,...,0.7587,0.0094,0.0094,0.1133,0.1138
2020-08-26,1.1833,1.1833,1.1833,1.1833,1.1833,...,0.7603,0.0094,0.0094,0.1137,0.1142
2020-08-27,1.1836,1.1836,1.1836,1.1836,1.1836,...,0.7616,0.0094,0.0094,0.1142,0.1141


In [315]:
df_tmp.xs('Euro', level='name', axis=1)

field,open,open,open,open,open,...,close,close,close,close,close
iso_code2,AX,AD,AT,BE,CY,...,PM,SM,SK,SI,ES
currency,EUR,EUR,EUR,EUR,EUR,...,EUR,EUR,EUR,EUR,EUR
2020-04-13,1.0937,1.0937,1.0937,1.0937,1.0937,...,1.0915,1.0915,1.0915,1.0915,1.0915
2020-04-14,1.0915,1.0915,1.0915,1.0915,1.0915,...,1.0983,1.0983,1.0983,1.0983,1.0983
2020-04-15,1.0984,1.0984,1.0984,1.0984,1.0984,...,1.0901,1.0901,1.0901,1.0901,1.0901
2020-04-16,1.0902,1.0902,1.0902,1.0902,1.0902,...,1.0860,1.0860,1.0860,1.0860,1.0860
2020-04-17,1.0860,1.0860,1.0860,1.0860,1.0860,...,1.0874,1.0874,1.0874,1.0874,1.0874
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-24,1.1801,1.1801,1.1801,1.1801,1.1801,...,1.1794,1.1794,1.1794,1.1794,1.1794
2020-08-25,1.1794,1.1794,1.1794,1.1794,1.1794,...,1.1833,1.1833,1.1833,1.1833,1.1833
2020-08-26,1.1833,1.1833,1.1833,1.1833,1.1833,...,1.1835,1.1835,1.1835,1.1835,1.1835
2020-08-27,1.1836,1.1836,1.1836,1.1836,1.1836,...,1.1817,1.1817,1.1817,1.1817,1.1817


In [320]:
df_tmp.loc[:, (slice(None), ['AX', 'ES'])] = -1

In [321]:
df_tmp

field,open,open,open,open,open,...,close,open,close,open,close
iso_code2,AX,AD,AT,BE,CY,...,CA,JP,JP,SE,SE
currency,EUR,EUR,EUR,EUR,EUR,...,CAD,JPY,JPY,SEK,SEK
name,Euro,Euro,Euro,Euro,Euro,...,Canadian Dollar,Yen,Yen,Swedish Krona,Swedish Krona
2020-04-13,-1,1.0937,1.0937,1.0937,1.0937,...,0.7198,0.0092,0.0093,0.1001,0.0995
2020-04-14,-1,1.0915,1.0915,1.0915,1.0915,...,0.7202,0.0093,0.0093,0.0995,0.1002
2020-04-15,-1,1.0984,1.0984,1.0984,1.0984,...,0.7081,0.0093,0.0093,0.1002,0.0992
2020-04-16,-1,1.0902,1.0902,1.0902,1.0902,...,0.7120,0.0093,0.0093,0.0992,0.0994
2020-04-17,-1,1.0860,1.0860,1.0860,1.0860,...,0.7138,0.0093,0.0093,0.0994,0.0997
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-24,-1,1.1801,1.1801,1.1801,1.1801,...,0.7563,0.0094,0.0094,0.1133,0.1133
2020-08-25,-1,1.1794,1.1794,1.1794,1.1794,...,0.7587,0.0094,0.0094,0.1133,0.1138
2020-08-26,-1,1.1833,1.1833,1.1833,1.1833,...,0.7603,0.0094,0.0094,0.1137,0.1142
2020-08-27,-1,1.1836,1.1836,1.1836,1.1836,...,0.7616,0.0094,0.0094,0.1142,0.1141
