# Examples in pandas for merge and concat

In [13]:
# Build two small frames that share the same column layout ('id', 'color')
import pandas as pd

COLS = ['id', 'color']

df1 = pd.DataFrame(
    [[11, 'red'], [12, 'blue'], [13, 'green']],
    columns=COLS,
)
df12 = pd.DataFrame(
    [[11, 'red'], [12, 'green'], [14, 'yellow'], [15, 'yellow'], [16, 'yellow']],
    columns=COLS,
)

# Show both frames, with a divider line between them
print("df1:", df1, "---", sep="\n")

print("df12:", df12, sep="\n")

df1:
   id  color
0  11    red
1  12   blue
2  13  green
---
df12:
   id   color
0  11     red
1  12   green
2  14  yellow
3  15  yellow
4  16  yellow


## Merge using outer and inner join operations

In [14]:
# Inner join on 'id': only ids present in both frames are kept
print("Merged using inner join:")
df = df1.merge(df12, how='inner', on='id')
print(df, "---", sep="\n")

# Outer join on 'id': ids from either frame are kept
print("Merged using outer join:")
df = df1.merge(df12, how='outer', on='id')
print(df)

print()
print(">> Note the addition of NaN (Not a Number) values where no match up was found")

Merged using inner join:
   id color_x color_y
0  11     red     red
1  12    blue   green
---
Merged using outer join:
   id color_x color_y
0  11     red     red
1  12    blue   green
2  13   green     NaN
3  14     NaN  yellow
4  15     NaN  yellow
5  16     NaN  yellow

>> Note the addition of NaN (Not a Number) values where no match up was found


## Merge using a left outer join (similar to Excel's popular 'VLOOKUP' function)

In [15]:
# A second frame that shares id values with df1 but carries a different column ('part').
# Its key column is named 'pid' instead of 'id' -- a common naming mismatch that the
# merge has to reconcile explicitly.
df2 = pd.DataFrame(
    [[11, 'A'], [12, 'B'], [13, 'C'], [14, 'C'], [15, 'C']],
    columns=['pid', 'part'],
)

print("df1:", df1, sep="\n")

print("---", "df2:", df2, sep="\n")

print("---")
print("Merged df2 into df1 using left outer join:")
df = (
    df1
    .merge(df2, how='left', left_on='id', right_on='pid')  # key names differ, so name each side's key
    .drop(columns='pid')  # 'pid' just repeats 'id' in the merged result
)
print(df)

df1:
   id  color
0  11    red
1  12   blue
2  13  green
---
df2:
   pid part
0   11    A
1   12    B
2   13    C
3   14    C
4   15    C
---
Merged df2 into df1 using left outer join:
   id  color part
0  11    red    A
1  12   blue    B
2  13  green    C


## Merge as above but in the opposite direction

In [16]:
print("df2:", df2, "---", sep="\n")

print("df1:", df1, "---", sep="\n")

# Same lookup as before with the roles swapped: df2 is now the left (kept) side
print("Merged df1 into df2 using left outer join:")
df = (
    df2
    .merge(df1, how='left', left_on='pid', right_on='id')
    .drop(columns='id')  # keep only df2's key column
)
print(df)

print()
print(">> Note the addition of NaN (Not a Number) values where no match up was found")

df2:
   pid part
0   11    A
1   12    B
2   13    C
3   14    C
4   15    C
---
df1:
   id  color
0  11    red
1  12   blue
2  13  green
---
Merged df1 into df2 using left outer join:
   pid part  color
0   11    A    red
1   12    B   blue
2   13    C  green
3   14    C    NaN
4   15    C    NaN

>> Note the addition of NaN (Not a Number) values where no match up was found


## Concatenate

In [19]:
print("df1:", df1, sep="\n")

print("---", "df12:", df12, sep="\n")

print("---")

# Place the frames side by side; rows are lined up by index label,
# so the shorter frame is padded with NaN
print("Column-wise concat:")
df = pd.concat([df1, df12], axis='columns')
print(df)

df1:
   id  color
0  11    red
1  12   blue
2  13  green
---
df12:
   id   color
0  11     red
1  12   green
2  14  yellow
3  15  yellow
4  16  yellow
---
Column-wise concat:
     id  color  id   color
0  11.0    red  11     red
1  12.0   blue  12   green
2  13.0  green  14  yellow
3   NaN    NaN  15  yellow
4   NaN    NaN  16  yellow


In [18]:
# Stack the frames vertically; each frame keeps its own index labels
print("Row-wise concat:")
df = pd.concat([df1, df12], axis='index')
print(df)

# Replace the repeated index labels with a fresh 0..n-1 range
print("---", "Reindex the rows:", sep="\n")
df = df.reset_index(drop=True)
print(df)

print()
print(">> Note that the index column now runs in sequential order")

Row-wise concat:
   id   color
0  11     red
1  12    blue
2  13   green
0  11     red
1  12   green
2  14  yellow
3  15  yellow
4  16  yellow
---
Reindex the rows:
   id   color
0  11     red
1  12    blue
2  13   green
3  11     red
4  12   green
5  14  yellow
6  15  yellow
7  16  yellow

>> Note that the index column now runs in sequential order
