In [1]:
import numpy as np
import pandas as pd

# Show floats with 2 decimal places. The fully qualified key is required:
# the bare 'precision' pattern is ambiguous on newer pandas, where it also
# matches 'styler.format.precision'.
pd.set_option('display.precision', 2)

In [2]:
# Opening prices, one row per ticker symbol (default 0..4 index).
df1 = pd.DataFrame(
    dict(
        ticker=['AAPL', 'MSFT', 'IBM', 'YHOO', 'GOOG'],
        open=[426.23, 42.30, 101.65, 35.53, 200.41],
    )
)
df1

Unnamed: 0,ticker,open
0,AAPL,426.23
1,MSFT,42.3
2,IBM,101.65
3,YHOO,35.53
4,GOOG,200.41


In [3]:
# Closing prices for a set of tickers that only partly overlaps df1's;
# the column order is pinned explicitly.
df2 = pd.DataFrame(
    data=dict(
        ticker=['AAPL', 'GOOG', 'NFLX'],
        close=[427.53, 210.96, 91.86],
    ),
    columns=['ticker', 'close'],
)
df2

Unnamed: 0,ticker,close
0,AAPL,427.53
1,GOOG,210.96
2,NFLX,91.86


## Concatenation/Binding

* Add rows\* (`append`)
* Add columns (`pd.concat`)
* Add rows and columns (`pd.concat` or `append`)

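\* `append` will also add columns! Note that `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; on newer versions use `pd.concat`, which is covered below.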

### Data for Concatenation

To make it explicit that this data is **daily**, we add a `date` column.
We keep only the first two records so the data fits on the slide.

In [4]:
# Stamp every row with the same trading day, then keep the first two rows.
# pd.Timestamp is used because the pd.datetime alias is deprecated and is
# no longer available in newer pandas releases.
df3 = (
    df1
    .assign(date=pd.Timestamp(2018, 1, 4))
    .iloc[:2]  # first 2 rows only
)
df3

Unnamed: 0,ticker,open,date
0,AAPL,426.23,2018-01-04
1,MSFT,42.3,2018-01-04


In [5]:
# Next day's data: same tickers, date moved forward one day, open raised by 10.
# pd.Timestamp is used because the pd.datetime alias is deprecated and is
# no longer available in newer pandas releases.
df4 = df3.assign(
    date=pd.Timestamp(2018, 1, 5),
    # Bracket access avoids relying on attribute lookup for the column name.
    open=lambda frame: frame['open'] + 10,
)
df4

Unnamed: 0,ticker,open,date
0,AAPL,436.23,2018-01-05
1,MSFT,52.3,2018-01-05


### Adding rows

In [6]:
df3.append(df4)

Unnamed: 0,ticker,open,date
0,AAPL,426.23,2018-01-04
1,MSFT,42.3,2018-01-04
0,AAPL,436.23,2018-01-05
1,MSFT,52.3,2018-01-05


Notice how the index labels `0` and `1` are duplicated: each frame keeps its own default `pd.RangeIndex`.

### No dups please

To check for duplicated index values:

In [7]:
try:
    # verify_integrity=True asks append to raise instead of returning a
    # frame whose index contains duplicate labels.
    df3.append(df4, verify_integrity=True)
except ValueError as e:
    # The error is the demonstration here, so show its message rather than
    # letting it propagate and stop the notebook.
    print(e)

Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


### Ignore the index

`ignore_index` discards the indexes from the bound data frames

In [8]:
df3.append(df4, ignore_index=True)

Unnamed: 0,ticker,open,date
0,AAPL,426.23,2018-01-04
1,MSFT,42.3,2018-01-04
2,AAPL,436.23,2018-01-05
3,MSFT,52.3,2018-01-05


We usually don't need to validate the index when we pass `ignore_index`
because we're creating a new index!

### Rows and Columns with `append`

* `append` does an outer join on both rows and columns
* We'll see how to avoid this with `concat`

In [9]:
df3a = df3.assign(close=lambda x: (x.open + 9))
df3a

Unnamed: 0,ticker,open,date,close
0,AAPL,426.23,2018-01-04,435.23
1,MSFT,42.3,2018-01-04,51.3


In [10]:
df4

Unnamed: 0,ticker,open,date
0,AAPL,436.23,2018-01-05
1,MSFT,52.3,2018-01-05


### We've been warned

In [11]:
df3a.append(df4, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,close,date,open,ticker
0,435.23,2018-01-04,426.23,AAPL
1,51.3,2018-01-04,42.3,MSFT
2,,2018-01-05,436.23,AAPL
3,,2018-01-05,52.3,MSFT


### Probably the most common `append`

In [12]:
df3a.append(df4, ignore_index=True, sort=False)

Unnamed: 0,ticker,open,date,close
0,AAPL,426.23,2018-01-04,435.23
1,MSFT,42.3,2018-01-04,51.3
2,AAPL,436.23,2018-01-05,
3,MSFT,52.3,2018-01-05,


Any time you find yourself repeating code, put it in a function and stay `DRY` (Don't Repeat Yourself).

In [13]:
def pwd_append(x: pd.DataFrame, y: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Stack the rows of ``y`` beneath ``x`` with this notebook's defaults.

    Implemented with ``pd.concat`` rather than ``DataFrame.append`` so the
    helper keeps working on pandas releases where ``append`` is no longer
    available. For two data frames the two calls take the same
    ``ignore_index``, ``sort`` and ``verify_integrity`` options.

    Parameters
    ----------
    x, y : pd.DataFrame
        Frames to bind by rows; ``x`` comes first in the result.
    **kwargs
        Overrides for the defaults below, forwarded to ``pd.concat``.

    Returns
    -------
    pd.DataFrame
        A new frame containing the rows of ``x`` followed by the rows of ``y``.

    Raises
    ------
    ValueError
        If ``verify_integrity=True`` is passed and the resulting index
        contains duplicate labels.
    """
    options = {
        # Build a fresh 0..n-1 index instead of keeping each frame's labels.
        'ignore_index': True,
        # Sort the columns when the two frames' columns differ.
        # NOTE(review): the "most common append" cell in this notebook uses
        # sort=False -- confirm which default is intended.
        'sort': True,
        # Skip the duplicate-label check.
        'verify_integrity': False,
    }
    # Caller-supplied options win over the defaults.
    options.update(kwargs)
    return pd.concat([x, y], **options)

## General-purpose `pd.concat`

* **Join** and bind across rows or columns
* Pass 1 or more `Series` or `DataFrame`s

### Replicate `append`

* IMPORTANT: `ignore_index` only applies to the axis of concatenation
  which can be rows or columns

In [14]:
pd.concat([df3, df4], ignore_index=True)

Unnamed: 0,ticker,open,date
0,AAPL,426.23,2018-01-04
1,MSFT,42.3,2018-01-04
2,AAPL,436.23,2018-01-05
3,MSFT,52.3,2018-01-05


### Replicate `append` (cont)

Outer join of both rows and columns like `append`

In [15]:
pd.concat([df3a, df4], ignore_index=True, sort=False)

Unnamed: 0,ticker,open,date,close
0,AAPL,426.23,2018-01-04,435.23
1,MSFT,42.3,2018-01-04,51.3
2,AAPL,436.23,2018-01-05,
3,MSFT,52.3,2018-01-05,


## Stuff you can't do with `append`

### Bind columns only

In [16]:
# Two single-column frames that share the default 0..1 index.
df5 = pd.DataFrame(dict(a=[1, 2]))
df6 = pd.DataFrame(dict(b=[3, 4]))

In [17]:
df5

Unnamed: 0,a
0,1
1,2


In [18]:
df6

Unnamed: 0,b
0,3
1,4


In [20]:
pd.concat([df5, df6, df6], axis=1)

Unnamed: 0,a,b,b.1
0,1,3,3
1,2,4,4


### `concat`  binds rows and columns

* Always keeps every entry along the concatenation axis; the other axis is outer-joined by default, so missing cells become `NaN`

In [21]:
df6a = df6.set_index(pd.Index([6, 7]))
pd.concat([df5, df6a], sort=False)

Unnamed: 0,a,b
0,1.0,
1,2.0,
6,,3.0
7,,4.0


In [22]:
df5

Unnamed: 0,a
0,1
1,2


In [23]:
df6a

Unnamed: 0,b
6,3
7,4


### Specify behavior of non-concatenation axis

* The `join` parameter only applies to the non-concatenation axis
* Set to `inner` to only get the common index elements or columns

In [26]:
pd.concat([df3a, df4], ignore_index=True, sort=False, join='inner')

Unnamed: 0,ticker,open,date
0,AAPL,426.23,2018-01-04
1,MSFT,42.3,2018-01-04
2,AAPL,436.23,2018-01-05
3,MSFT,52.3,2018-01-05


In [24]:
df3a

Unnamed: 0,ticker,open,date,close
0,AAPL,426.23,2018-01-04,435.23
1,MSFT,42.3,2018-01-04,51.3


In [25]:
df4

Unnamed: 0,ticker,open,date
0,AAPL,436.23,2018-01-05
1,MSFT,52.3,2018-01-05


Notice there is no `close` column because it's not present in both
data frames

### Identify the source Series/DataFrame with `keys`

In [27]:
pd.concat([df3, df4], keys=['df3', 'df4'])

Unnamed: 0,Unnamed: 1,ticker,open,date
df3,0,AAPL,426.23,2018-01-04
df3,1,MSFT,42.3,2018-01-04
df4,0,AAPL,436.23,2018-01-05
df4,1,MSFT,52.3,2018-01-05


### Use `keys` and `names`

In [29]:
pd.concat([df3, df4], keys=['df3', 'df4'], names=['source', 'row_num']).reset_index()

Unnamed: 0,source,row_num,ticker,open,date
0,df3,0,AAPL,426.23,2018-01-04
1,df3,1,MSFT,42.3,2018-01-04
2,df4,0,AAPL,436.23,2018-01-05
3,df4,1,MSFT,52.3,2018-01-05
