<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#data" data-toc-modified-id="data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>data</a></span></li><li><span><a href="#stack-&amp;-unstack" data-toc-modified-id="stack-&amp;-unstack-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>stack &amp; unstack</a></span></li><li><span><a href="#pivot-&amp;-melt" data-toc-modified-id="pivot-&amp;-melt-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>pivot &amp; melt</a></span></li><li><span><a href="#pivot-&amp;-pivot-table" data-toc-modified-id="pivot-&amp;-pivot-table-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>pivot &amp; pivot-table</a></span></li><li><span><a href="#pivot_table-vs-group_by" data-toc-modified-id="pivot_table-vs-group_by-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>pivot_table vs group_by</a></span></li></ul></div>

In [1]:
#http://www.datasciencemadesimple.com/create-pivot-table-pandas-python/
#http://www.datasciencemadesimple.com/hierarchical-indexing-multiple-indexing-python-pandas/    
#https://stackoverflow.com/questions/34702815/pandas-group-by-and-pivot-table-difference
#https://pythonhealthcare.org/2018/04/08/32-reshaping-pandas-data-with-stack-unstack-pivot-and-melt/

# data

In [4]:
import pandas as pd
import matplotlib.pyplot as plt

mtcars = pd.read_csv('~/Documents/Data/mtcars.csv',index_col='name')

In [5]:
columns = pd.MultiIndex.from_product(
    [['Semester1','Semester2'],   # Level 1
     ['Maths','Science']])        # Level 0

d = ([[12,45,67,56],
      [78,89,45,67],
      [45,67,89,90],
      [67,44,56,55]])
 
 
df = pd.DataFrame(d,
                  index=['Alisa','Bobby','Cathrine','Jack'],
                  columns=columns)
df

Unnamed: 0_level_0,Semester1,Semester1,Semester2,Semester2
Unnamed: 0_level_1,Maths,Science,Maths,Science
Alisa,12,45,67,56
Bobby,78,89,45,67
Cathrine,45,67,89,90
Jack,67,44,56,55


# stack & unstack

In [6]:
### STACK(): 
# Converts data into stacked format 
# => takes column-names and stacks them into rows
#    If there is more than one column header - specify level

### UNSTACK(): 
# Converts data into unstacked format
# => takes row-wise stacked data and turns it into columns 


In [7]:
## Turns lower - level 0 - colum-(names) and stacks the values
dfs_0 = df.stack(0); 
dfs_0

Unnamed: 0,Unnamed: 1,Maths,Science
Alisa,Semester1,12,45
Alisa,Semester2,67,56
Bobby,Semester1,78,89
Bobby,Semester2,45,67
Cathrine,Semester1,45,67
Cathrine,Semester2,89,90
Jack,Semester1,67,44
Jack,Semester2,56,55


In [8]:
## Turn upper = level 1 - colum-(names) and stacks the values
dfs_1 = df.stack(1); 
dfs_1

Unnamed: 0,Unnamed: 1,Semester1,Semester2
Alisa,Maths,12,67
Alisa,Science,45,56
Bobby,Maths,78,45
Bobby,Science,89,67
Cathrine,Maths,45,89
Cathrine,Science,67,90
Jack,Maths,67,56
Jack,Science,44,55


In [9]:
## Take first row-names and turns the values into columns
#   => back to start
df_unstacked = dfs_1.unstack(1) 
df_unstacked

Unnamed: 0_level_0,Semester1,Semester1,Semester2,Semester2
Unnamed: 0_level_1,Maths,Science,Maths,Science
Alisa,12,45,67,56
Bobby,78,89,45,67
Cathrine,45,67,89,90
Jack,67,44,56,55


In [10]:
dfs_1.unstack().unstack() # formatierung, weil nun series statt dataframe

Semester1  Maths    Alisa       12
                    Bobby       78
                    Cathrine    45
                    Jack        67
           Science  Alisa       45
                    Bobby       89
                    Cathrine    67
                    Jack        44
Semester2  Maths    Alisa       67
                    Bobby       45
                    Cathrine    89
                    Jack        56
           Science  Alisa       56
                    Bobby       67
                    Cathrine    90
                    Jack        55
dtype: int64

# pivot & melt

In [11]:
## Stack & Unstack => work on indexed table
## Melt  & Pivot   => work on  normal table
df_wide = df.stack().reset_index()
df_wide.columns = ['Name','Subject','Semester1','Semester2']
df_wide

Unnamed: 0,Name,Subject,Semester1,Semester2
0,Alisa,Maths,12,67
1,Alisa,Science,45,56
2,Bobby,Maths,78,45
3,Bobby,Science,89,67
4,Cathrine,Maths,45,89
5,Cathrine,Science,67,90
6,Jack,Maths,67,56
7,Jack,Science,44,55


In [12]:
## Id-vars + varialbe to melt
# = variables that combinedly uniquely identify observation associated with value
df_long = df_wide.melt(id_vars=['Name','Subject'], 
                       var_name='Semester', value_name='Points')
df_long

Unnamed: 0,Name,Subject,Semester,Points
0,Alisa,Maths,Semester1,12
1,Alisa,Science,Semester1,45
2,Bobby,Maths,Semester1,78
3,Bobby,Science,Semester1,89
4,Cathrine,Maths,Semester1,45
5,Cathrine,Science,Semester1,67
6,Jack,Maths,Semester1,67
7,Jack,Science,Semester1,44
8,Alisa,Maths,Semester2,67
9,Alisa,Science,Semester2,56


In [13]:
#df_long_pivoted = df_long.pivot(index=['Name','Subject'],
#                                columns='Semester',
#                                values='Points')
## => cannot pass array as index

df_long_pivoted = df_long.pivot_table(
    index=['Name','Subject'],
    columns='Semester',
    values='Points')

df_long_pivoted


Unnamed: 0_level_0,Semester,Semester1,Semester2
Name,Subject,Unnamed: 2_level_1,Unnamed: 3_level_1
Alisa,Maths,12,67
Alisa,Science,45,56
Bobby,Maths,78,45
Bobby,Science,89,67
Cathrine,Maths,45,89
Cathrine,Science,67,90
Jack,Maths,67,56
Jack,Science,44,55


In [14]:
# multi-index zugreifen
df_long_pivoted.loc[('Alisa','Maths')]

Semester
Semester1    12
Semester2    67
Name: (Alisa, Maths), dtype: int64

# pivot & pivot-table

In [25]:
# pivot() NEEDS TO have UNIQUE pivot pair
# => EACH PAIR must ONLY EXIST ONCE!!!!
# Otherwise aggregation is neeeded to handle that case 
# => pivot_table(...,aggfunc='mean')

In [26]:
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two','two','two'],
                    'bar': ['A', 'B', 'C', 'A', 'B', 'C','C'],
                    'baz': [1, 2, 3, 4, 5, 5, 6]})

# (two,c): is contained twice and 
# (two,c) => 5 and (two,c) => 6   (mapping to same value is also not possible)
df

Unnamed: 0,foo,bar,baz
0,one,A,1
1,one,B,2
2,one,C,3
3,two,A,4
4,two,B,5
5,two,C,5
6,two,C,6


In [27]:
#df.pivot(index='foo',columns='bar',values='baz')
# => DOES NOT WORK!!!!! 
df.iloc[:-1].pivot(index='foo',columns='bar',values='baz')
# => works because double pair is removed

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,5


In [18]:
df.pivot_table(index='foo',columns='bar',values='baz',aggfunc='mean')
# => pivot_table explictly handles multiple same pairs by aggregation

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1.0,2.0,3.0
two,4.0,5.0,5.5


In [19]:
#mtcars.pivot(index='cyl',columns='am',values='mpg')
# => geht nicht, weil einträge doppelt vorkommen 
# => dann muss aggregiert werden
mtcars.pivot_table(index='cyl',columns='am',values='hp',aggfunc='sum')

am,0,1
cyl,Unnamed: 1_level_1,Unnamed: 2_level_1
4,254,655
6,461,395
8,2330,599


# pivot_table vs group_by

In [20]:
df['cyl'] = df.index
df2 = pd.melt(df,id_vars='cyl',var_name='am', value_name='values')
df2

Unnamed: 0,cyl,am,values
0,0,foo,one
1,1,foo,one
2,2,foo,one
3,3,foo,two
4,4,foo,two
5,5,foo,two
6,6,foo,two
7,0,bar,A
8,1,bar,B
9,2,bar,C


Dasselbe mit group_by() + reshape

In [21]:
df = mtcars.groupby(['cyl','am'])['hp'].mean()
df

cyl  am
4    0      84.666667
     1      81.875000
6    0     115.250000
     1     131.666667
8    0     194.166667
     1     299.500000
Name: hp, dtype: float64

In [22]:
mtcars.groupby(['cyl','am'])['hp'].mean().unstack(level=1)

am,0,1
cyl,Unnamed: 1_level_1,Unnamed: 2_level_1
4,84.666667,81.875
6,115.25,131.666667
8,194.166667,299.5


In [23]:
mtcars.pivot_table(index='cyl',columns='am',values='hp',aggfunc='mean')

am,0,1
cyl,Unnamed: 1_level_1,Unnamed: 2_level_1
4,84.666667,81.875
6,115.25,131.666667
8,194.166667,299.5


In [24]:

mtcars.pivot_table(index=['cyl','car_type'], columns='am',values='hp',aggfunc='mean')

Unnamed: 0_level_0,am,0,1
cyl,car_type,Unnamed: 2_level_1,Unnamed: 3_level_1
4,suv,84.666667,81.875
6,smart,117.0,175.0
6,suv,110.0,110.0
8,smart,194.166667,299.5
