In [1]:
import pandas as pd
import numpy as np

# Seed a dedicated generator so the two arrays are identical on every
# Restart Kernel -> Run All, instead of depending on unseeded global state.
SEED = 42
rng = np.random.default_rng(SEED)

# Ten integers each, drawn from [1, 20) -- the upper bound is exclusive.
array_1 = rng.integers(1, 20, size=10)
array_2 = rng.integers(1, 20, size=10)

# Combine dataframe or series

Sometimes we need to combine different DataFrames and/or Series in non-trivial ways.

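We can make use of:

- `pd.concat([...])`: takes a list of Series and/or DataFrames and stacks them together
- `join()`: similar to `merge()`, but joins on the index by default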

In [2]:
# Wrap each random array in a Series.
s_1, s_2 = (pd.Series(values) for values in (array_1, array_2))

# concat takes a list of Series/DataFrames and stacks them one after another.
# The original index labels are kept, so the labels 0..9 appear twice.
pd.concat([s_1, s_2])

0    10
1     1
2    14
3    15
4    12
5    17
6     2
7    18
8     4
9    18
0     1
1    10
2     8
3    14
4    14
5    16
6     8
7    11
8    18
9    14
dtype: int64

In [3]:
# dataframe
a = pd.DataFrame(s_1)
b = pd.DataFrame(s_2)
pd.concat([a, b])

Unnamed: 0,0
0,10
1,1
2,14
3,15
4,12
5,17
6,2
7,18
8,4
9,18


# Join dataframe, use `merge()`

Syntax: `left_df.merge(right_df, left_on=['col'], right_on=['col'])`

The default is an **inner join**.

In [4]:
# Two small catalogues that share exactly one title ("Marvel"). The key column
# is named differently in each: 'title' in the first, 'booktitle' in the second.
book_store_1 = {
    'title': ["Harry Potter", "Lord of ring", "Marvel"],
    'price': [23, 12, 83],
    'author': ['John', 'Peter', 'Stanley'],
}
book_store_2 = {
    'booktitle': ["Sherlock Holmes", "How to train your dragon", "Marvel"],
    'publish_year': [2012, 2021, 2019],
    'edition': [1, 2, 1],
}

# Each dict key becomes a column; each list supplies that column's rows.
store_1 = pd.DataFrame(book_store_1)
store_2 = pd.DataFrame(book_store_2)
store_1

Unnamed: 0,title,price,author
0,Harry Potter,23,John
1,Lord of ring,12,Peter
2,Marvel,83,Stanley


In [5]:
# Display the second store's table (booktitle / publish_year / edition)
# so it can be compared with store_1 before merging.
store_2

Unnamed: 0,booktitle,publish_year,edition
0,Sherlock Holmes,2012,1
1,How to train your dragon,2021,2
2,Marvel,2019,1


In [6]:
# Inner join (the default `how`): keep only rows whose 'title' in store_1
# equals a 'booktitle' in store_2.
pd.merge(
    store_1,
    store_2,
    how="inner",
    left_on=['title'],
    right_on=['booktitle'],
)

Unnamed: 0,title,price,author,booktitle,publish_year,edition
0,Marvel,83,Stanley,Marvel,2019,1


## The `how` parameter of `merge()`: other join types

- left
- right
- outer
- inner
- cross

In [7]:
# Outer join: keep every row from both stores. indicator=True adds a `_merge`
# column recording whether each row came from the left, the right, or both.
joined = pd.merge(
    store_1,
    store_2,
    how="outer",
    left_on=['title'],
    right_on=['booktitle'],
    indicator=True,
)
joined

Unnamed: 0,title,price,author,booktitle,publish_year,edition,_merge
0,Harry Potter,23.0,John,,,,left_only
1,Lord of ring,12.0,Peter,,,,left_only
2,Marvel,83.0,Stanley,Marvel,2019.0,1.0,both
3,,,,Sherlock Holmes,2012.0,1.0,right_only
4,,,,How to train your dragon,2021.0,2.0,right_only


Passing `indicator=True` adds a `_merge` column that records where each row came from: `left_only`, `right_only`, or `both`. It is useful for debugging joins.

In [8]:
# Keep only the rows that found no partner on the other side of the outer join.
joined.loc[joined['_merge'] != 'both']

Unnamed: 0,title,price,author,booktitle,publish_year,edition,_merge
0,Harry Potter,23.0,John,,,,left_only
1,Lord of ring,12.0,Peter,,,,left_only
3,,,,Sherlock Holmes,2012.0,1.0,right_only
4,,,,How to train your dragon,2021.0,2.0,right_only
