In [1]:
import pandas as pd
import numpy as np

## A few widely-used functions
1. `crosstab`, `pivot_table`
2. `groupby`, `stack`, `unstack`, `reset_index`, `set_index`
3. `melt`, `join`, `concat`

In [3]:
# Toy long-format data: one row per (sample, mutation) observation, with a
# numeric `yes` value attached. Some (sample, mutation) pairs occur more than
# once (sample1/mutation1), so reshaping them later requires an aggregation.
samples = np.array([
    'sample1', 'sample2', 'sample3', 'sample4',
    'sample5', 'sample6', 'sample3', 'sample1',
])
mutations = np.array([
    'mutation1', 'mutation1', 'mutation1', 'mutation2',
    'mutation2', 'mutation3', 'mutation3', 'mutation1',
])
yes = np.array([1, 1, 1, 4, 1, 1, 1, 5])

# Keyword order fixes the column order: samples, mutations, yes.
df = pd.DataFrame(dict(samples=samples, mutations=mutations, yes=yes))
df

Unnamed: 0,samples,mutations,yes
0,sample1,mutation1,1
1,sample2,mutation1,1
2,sample3,mutation1,1
3,sample4,mutation2,4
4,sample5,mutation2,1
5,sample6,mutation3,1
6,sample3,mutation3,1
7,sample1,mutation1,5


In [4]:
# crosstab and pivot_table (a more general form of pivot) both reshape the
# long-format frame into a samples x mutations table of summed `yes` values.
# aggfunc is passed as the string 'sum' instead of np.sum: recent pandas
# raises a FutureWarning for the NumPy callable and recommends the string alias.
#
# crosstab is called without a fill value, so (sample, mutation) pairs that
# never occur are printed as NaN and the sums are shown as floats.
print(pd.crosstab(index=df['samples'],columns=df['mutations'],values=df['yes'],aggfunc='sum'))
# pivot_table with fill_value=0 prints 0 for the missing pairs instead of NaN.
print(pd.pivot_table(data=df,index='samples',columns='mutations',values='yes',aggfunc='sum',fill_value=0))

mutations  mutation1  mutation2  mutation3
samples                                   
sample1          6.0        NaN        NaN
sample2          1.0        NaN        NaN
sample3          1.0        NaN        1.0
sample4          NaN        4.0        NaN
sample5          NaN        1.0        NaN
sample6          NaN        NaN        1.0
mutations  mutation1  mutation2  mutation3
samples                                   
sample1            6          0          0
sample2            1          0          0
sample3            1          0          1
sample4            0          4          0
sample5            0          1          0
sample6            0          0          1


In [5]:
# groupby + unstack: sum `yes` per (samples, mutations) pair, then pivot the
# inner index level (mutations) out into columns. Pairs that never occur are
# filled with 0.
(
    df
    .groupby(['samples', 'mutations'])['yes']
    .sum()
    .unstack(fill_value=0)
)

mutations,mutation1,mutation2,mutation3
samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sample1,6,0,0
sample2,1,0,0
sample3,1,0,1
sample4,0,4,0
sample5,0,1,0
sample6,0,0,1


In [16]:
# melt: go from the wide table back to long format.
# First rebuild the wide samples x mutations table; reset_index turns the
# `samples` index into an ordinary column so it can serve as the id variable.
wide = (
    df.groupby(['samples', 'mutations'])['yes']
    .sum()
    .unstack(fill_value=0)
    .reset_index()
)

# Each mutation column becomes rows of (samples, var, values).
long = pd.melt(
    wide,
    id_vars='samples',
    value_vars=['mutation1', 'mutation2', 'mutation3'],
    var_name='var',
    value_name='values',
)
long

Unnamed: 0,samples,var,values
0,sample1,mutation1,6
1,sample2,mutation1,1
2,sample3,mutation1,1
3,sample4,mutation1,0
4,sample5,mutation1,0
5,sample6,mutation1,0
6,sample1,mutation2,0
7,sample2,mutation2,0
8,sample3,mutation2,0
9,sample4,mutation2,4


In [17]:
# concat along axis=0 stacks frames row-wise; here it is used to splice a new
# row into the middle of an existing frame.
df = pd.DataFrame({
    'col1': [1, 2, 3],
    'col2': [5, 6, 7],
    'col3': [9, 10, 11],
})

# Goal: insert a row between the rows labelled 1 and 2.
row = pd.DataFrame({'col1': [4], 'col2': [0], 'col3': [1]})

# Slice the frame around the insertion point and concatenate the pieces.
# Each piece keeps its own index labels, so the inserted row shows up with
# label 0 a second time (pass ignore_index=True to concat to renumber 0..n-1).
pd.concat(
    [df.iloc[:2], row, df.iloc[2:]],
    axis=0,
)

Unnamed: 0,col1,col2,col3
0,1,5,9
1,2,6,10
0,4,0,1
2,3,7,11
