In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# Left-hand frame: six rows with unique keys K0..K5 and a payload column A.
df1 = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3", "K4", "K5"],
        "A": ["A0", "A1", "A2", "A3", "A4", "A5"],
    }
)
df1

Unnamed: 0,key,A
0,K0,A0
1,K1,A1
2,K2,A2
3,K3,A3
4,K4,A4
5,K5,A5


In [6]:
# Right-hand frame: three rows covering only keys K0..K2, with payload column B.
df2 = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2"],
        "B": ["B0", "B1", "B2"],
    }
)
df2

Unnamed: 0,key,B
0,K0,B0
1,K1,B1
2,K2,B2


Join DataFrames using their indexes.

In [7]:
# Index-on-index join (how='left' is join's default, spelled out here).
# Both frames carry a 'key' column; the suffixes tell the two copies apart.
df1.join(other=df2, how="left", lsuffix="_1", rsuffix="_2")

Unnamed: 0,key_1,A,key_2,B
0,K0,A0,K0,B0
1,K1,A1,K1,B1
2,K2,A2,K2,B2
3,K3,A3,,
4,K4,A4,,
5,K5,A5,,


If we want to join using the key columns, we need to set `key` to be the index in both the calling DataFrame (`df1`) and `other` (`df2`). The joined DataFrame will have `key` as its index.

In [8]:
# Promote 'key' to the index on both sides so the index-based join matches on it.
df1.set_index(keys="key").join(other=df2.set_index(keys="key"), how="left")

Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


Another option for joining on the key columns is the `on` parameter. `DataFrame.join` always matches against the index of `other` (here `df2`), but `on` lets us match it with any column of the calling DataFrame (here `df1`). This method preserves the calling DataFrame’s original index in the result.

In [9]:
# Match df1's 'key' column against df2's 'key' index; df1's own index is kept.
df1.join(other=df2.set_index(keys="key"), on="key", how="left")

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,
4,K4,A4,
5,K5,A5,


Using non-unique key values shows how they are matched.

In [10]:
# Rebinds df1 to a frame whose 'key' values repeat (K0 twice, K1 three times).
df1 = pd.DataFrame(
    {
        "key": ["K0", "K1", "K1", "K3", "K0", "K1"],
        "A": ["A0", "A1", "A2", "A3", "A4", "A5"],
    }
)
df1

Unnamed: 0,key,A
0,K0,A0
1,K1,A1
2,K1,A2
3,K3,A3
4,K0,A4
5,K1,A5


In [11]:
# validate='m:1': keys may repeat on the left (df1) but must be unique on the right (df2).
df1.join(
    other=df2.set_index(keys="key"),
    on="key",
    how="left",
    validate="m:1",
)

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K1,A2,B1
3,K3,A3,
4,K0,A4,B0
5,K1,A5,B1


In [16]:
# Same result, presumably, when the validate check is left out.
df1.join(other=df2.set_index(keys="key"), on="key", how="left")

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K1,A2,B1
3,K3,A3,
4,K0,A4,B0
5,K1,A5,B1


In [22]:
# Joins on the row index only, which is the wrong pairing for this data
# (row 2 puts K1 next to K2), and yet validate='1:1' still passes.
df1.join(
    other=df2,
    how="left",
    lsuffix="_1",
    rsuffix="_2",
    validate="1:1",
)

Unnamed: 0,key_1,A,key_2,B
0,K0,A0,K0,B0
1,K1,A1,K1,B1
2,K1,A2,K2,B2
3,K3,A3,,
4,K0,A4,,
5,K1,A5,,


In [23]:
# True when no label is repeated in df1's row index.
df1.index.is_unique

True

In [24]:
# True when no label is repeated in df2's row index.
df2.index.is_unique

True

In [27]:
# Same row-by-row pairing on the index as the suffixed join above;
# the output shows the key column twice because no suffixes are applied.
pd.concat(objs=[df1, df2], axis="columns")

Unnamed: 0,key,A,key.1,B
0,K0,A0,K0,B0
1,K1,A1,K1,B1
2,K1,A2,K2,B2
3,K3,A3,,
4,K0,A4,,
5,K1,A5,,


In [29]:
# Stacks the frames one under the other; a column missing on one side is left empty.
pd.concat(objs=[df1, df2], axis="index")

Unnamed: 0,key,A,B
0,K0,A0,
1,K1,A1,
2,K1,A2,
3,K3,A3,
4,K0,A4,
5,K1,A5,
0,K0,,B0
1,K1,,B1
2,K2,,B2


In [35]:
# Correct reference result. Note that join defaults to how='left'
# (written out explicitly below).
df1.join(
    other=df2.set_index(keys="key"),
    on="key",
    how="left",
    validate="m:1",
)

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K1,A2,B1
3,K3,A3,
4,K0,A4,B0
5,K1,A5,B1


In [33]:
# merge defaults to an inner join (written out explicitly below), so the
# unmatched K3 row is dropped; otherwise the result agrees with the reference.
pd.merge(left=df1, right=df2, how="inner")

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K1,A2,B1
3,K0,A4,B0
4,K1,A5,B1


In [37]:
# Same as the reference join result.
pd.merge(left=df1, right=df2, how="left")

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K1,A2,B1
3,K3,A3,
4,K0,A4,B0
5,K1,A5,B1
