## + / .add

In [2]:
# Small frames used to show how arithmetic aligns on index and column labels.
# d1 is the left operand everywhere: four rows, columns 'x' and 'y'.
d1 = pd.DataFrame(dict(x=[1, 2, 3, 4], y=[4, 5, 6, 7]))
# One row shorter than d1, with a missing value in 'y'.
d2 = pd.DataFrame(dict(x=[10, 20, 30], y=[44, None, 66]))
# Same values as d2, but the second column is labelled 'z' instead of 'y'.
d2_b = pd.DataFrame(dict(x=[10, 20, 30], z=[44, None, 66]))
# Same values as d2, but with row labels 0, 2, 4.
d2_c = pd.DataFrame(dict(x=[10, 20, 30], y=[44, None, 66]), index=[0, 2, 4])

In [16]:
# Only the last expression of a cell is rendered; the earlier ones are
# evaluated and their results discarded.
d1 + d2    # labels missing on one side give NaN, and x + NaN == NaN
d1 + d2_b  # columns align by label as well: 'y' and 'z' have no counterpart
# .add with fill_value treats a value missing on one side as 0; here it gives
# the same values as d1 + d2.reindex(d1.index).fillna(0).
d1.add(d2, fill_value=0)
d1 + d2_c  # row labels 0, 2, 4 are aligned against d1's 0..3

Unnamed: 0,x,y
0,11.0,48.0
1,,
2,23.0,
3,,
4,,


In [37]:
# Right-hand operand of the additions: default row labels 0..2, NaN in 'y'.
d2

Unnamed: 0,x,y
0,10,44.0
1,20,
2,30,66.0


In [38]:
# Same values as d2 but labelled 0, 2, 4, so only labels 0 and 2 overlap with d1.
d2_c

Unnamed: 0,x,y
0,10,44.0
2,20,
4,30,66.0


In [1]:
# Beer data set fetched over HTTP: this cell needs network access.
BEERS_CSV_URL = 'https://fspot.org/beers.csv'
beers = pd.read_csv(BEERS_CSV_URL)

In [7]:
# Lookup table with one row per distinct beer style and a 1-based rank.
unique_styles = beers['style'].unique()
ranking = pd.DataFrame({
    'style': unique_styles,
    'rank': range(1, len(unique_styles) + 1),
})

In [2]:
# Keep the examples small: work on the first five rows only.
dfh = beers.head(5)

## MERGE

In [9]:
# Everything implicit: with no `on=` argument, merge joins on the columns the
# two frames have in common (here only 'style').
(
    dfh
    .merge(ranking)
    .head(1)
)

Unnamed: 0,alcohol,gram_for_one_euro,name,price,style,volume,rank
0,0.054,4.877377,Ninkasi Ploploplop,2.89,India Pale Ale,330.0,1


In [72]:
# Explicit keys: the key column is called 'kind' on the left and 'style' on
# the right, so each side is named separately.
(
    dfh
    .rename(columns={'style': 'kind'})
    .merge(ranking, left_on='kind', right_on='style')
    .head(1)
)

Unnamed: 0,alcohol,gram_for_one_euro,name,price,kind,volume,rank,style
0,0.054,4.877377,Ninkasi Ploploplop,2.89,India Pale Ale,330.0,1,India Pale Ale


In [65]:
# Left key is the index, right key is a regular column.
(
    dfh
    .set_index('style')
    .merge(ranking, left_index=True, right_on='style')
    .head(1)
)

Unnamed: 0,alcohol,gram_for_one_euro,name,price,volume,rank,style
0,0.054,4.877377,Ninkasi Ploploplop,2.89,330.0,1,India Pale Ale


In [68]:
# Both keys are indexes: each frame is indexed by 'style' before merging.
(
    dfh
    .set_index('style')
    .merge(ranking.set_index('style'), left_index=True, right_index=True)
    .head(1)
)

Unnamed: 0_level_0,alcohol,gram_for_one_euro,name,price,volume,rank
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
India Pale Ale,0.054,4.877377,Ninkasi Ploploplop,2.89,330.0,1


For this last case (index on index), `join` is the more convenient choice:

## JOIN

In [71]:
# join matches on the index by default, so no left_index/right_index flags.
# Alternative: dfh.join(ranking.set_index('style'), on='style') attaches the
# same 'rank' values while keeping 'style' as a regular column of dfh.
(
    dfh
    .set_index('style')
    .join(ranking.set_index('style'))
    .head(1)
)

Unnamed: 0_level_0,alcohol,gram_for_one_euro,name,price,volume,rank
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
India Pale Ale,0.054,4.877377,Ninkasi Ploploplop,2.89,330.0,1


#### "how"

In [12]:
# Two frames sharing the 'name' key: 'a' and 'c' appear in both,
# 'b' only on the left and 'e' only on the right.
d1 = pd.DataFrame({
    'name': list('abc'),
    'value': [1, 2, 3],
})
d2 = pd.DataFrame({
    'name': list('ace'),
    'label': ['nice', 'meh', 'nice'],
})

In [13]:
# 'inner' is also the default: only keys present on both sides survive → no NaN.
d1.merge(d2, how='inner')

Unnamed: 0,name,value,label
0,a,1,nice
1,c,3,meh


In [14]:
# Keep every row of d1; 'b' has no match in d2, so its label is missing.
d1.merge(right=d2, how='left')

Unnamed: 0,name,value,label
0,a,1,nice
1,b,2,
2,c,3,meh


## CONCAT

In [26]:
# Stack the rows of both frames; the original row labels are kept, so 0..2
# appears twice.
pd.concat([d1, d2], axis=0, sort=False)

Unnamed: 0,name,value,label
0,a,1.0,
1,b,2.0,
2,c,3.0,
0,a,,nice
1,c,,meh
2,e,,nice


In [27]:
# ignore_index=True renumbers the rows, like a trailing .reset_index(drop=True).
pd.concat([d1, d2], axis=0, ignore_index=True, sort=False)

Unnamed: 0,name,value,label
0,a,1.0,
1,b,2.0,
2,c,3.0,
3,a,,nice
4,c,,meh
5,e,,nice


In [91]:
# Side by side: rows are matched on their index label, and both 'name'
# columns are kept.
pd.concat([d1, d2], axis='columns', sort=False)

Unnamed: 0,name,value,name.1,label
0,a,1,a,nice
1,b,2,c,meh
2,c,3,e,nice


# Tidy data

Extract from https://vita.had.co.nz/papers/tidy-data.pdf:

![](http://i.imgur.com/IvGvXFI.png)

### Reshaping / Pivoting

In [94]:
# Swap rows and columns (long form of d1.T).
d1.transpose()

Unnamed: 0,0,1,2
name,a,b,c
value,1,2,3


In [97]:
# stack moves the column labels into an inner index level; unstacking the
# outer level (the original row labels) then turns those rows into columns.
d1.stack().unstack(level=0)

Unnamed: 0,0,1,2
name,a,b,c
value,1,2,3


In [106]:
# Long-format sample: one row per (date, name) pair, seven dates for each of
# the two names, with values running from 100 to 113.
d3 = pd.DataFrame({
    'date': list(range(7)) * 2,
    'val': list(range(100, 114)),
    'name': [who for who in ('alice', 'bob') for _ in range(7)],
})

Long to wide:

![](https://pandas.pydata.org/pandas-docs/stable/_images/reshaping_pivot.png)

In [141]:
# Long to wide: one row per date, one column per name, cells taken from 'val'.
# The arguments are passed by keyword because DataFrame.pivot no longer
# accepts them positionally in pandas 2.x.
d3.pivot(index='date', columns='name', values='val')

name,alice,bob
date,Unnamed: 1_level_1,Unnamed: 2_level_1
0,100,107
1,101,108
2,102,109
3,103,110
4,104,111
5,105,112
6,106,113


In [146]:
# Same table via pivot_table; for how it differs from pivot, see
# https://stackoverflow.com/a/42965471/8072009
d3.pivot_table(values='val', index='date', columns='name')

name,alice,bob
date,Unnamed: 1_level_1,Unnamed: 2_level_1
0,100,107
1,101,108
2,102,109
3,103,110
4,104,111
5,105,112
6,106,113


In [143]:
# Same reshaping by hand: index on (date, name), then move the 'name' level
# (the innermost one) into the columns.
(
    d3
    .set_index(['date', 'name'])
    .unstack(level='name')
)

Unnamed: 0_level_0,val,val
name,alice,bob
date,Unnamed: 1_level_2,Unnamed: 2_level_2
0,100,107
1,101,108
2,102,109
3,103,110
4,104,111
5,105,112
6,106,113


In [147]:
# Wide back to long: melt turns the 'alice' and 'bob' columns into
# (name, value) rows, keeping 'date' as the identifier.
(
    d3
    .pivot_table('val', 'date', 'name')
    .reset_index()
    .melt(id_vars=['date'], value_vars=['alice', 'bob'])
)

Unnamed: 0,date,name,value
0,0,alice,100
1,1,alice,101
2,2,alice,102
3,3,alice,103
4,4,alice,104
5,5,alice,105
6,6,alice,106
7,0,bob,107
8,1,bob,108
9,2,bob,109


# GroupBy

Split / Apply / Combine:

![](https://i.imgur.com/hg5DYmU.png)

In [1]:
# Nine rows cycling through the keys A, B, C, with one numeric column to aggregate.
df = pd.DataFrame({
    'key': list('ABC') * 3,
    'data': [0, 5, 10, 5, 10, 15, 10, 15, 20],
})

#### Aggregation

In [9]:
# The grouper is passed as a Series here (the 'key' column itself) rather
# than as a column name.
df.groupby(by=df['key']).sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,15
B,30
C,45


In [18]:
# Several aggregations at once: the result has one sub-column per function.
df.groupby(by='key').aggregate(['sum', 'min', 'max'])

Unnamed: 0_level_0,data,data,data
Unnamed: 0_level_1,sum,min,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,15,0,10
B,30,5,15
C,45,10,20


In [20]:
# transform returns one value per input row: each row gets its group's mean.
# The string alias selects the built-in groupby mean and avoids the
# FutureWarning that recent pandas raises for a NumPy callable such as np.mean.
df.groupby('key').transform('mean')

Unnamed: 0,data
0,5
1,10
2,15
3,5
4,10
5,15
6,5
7,10
8,15


- peak_to_peak
- rank/share(pct)/cumsum within group
- computing change over time: merge /