# Pivoting DataFrames

In [1]:
import pandas as pd

In [2]:
trials = pd.read_csv('trials_01.csv')

### Clinical trials data

In [3]:
trials

Unnamed: 0,id,treatment,gender,response
0,1,A,F,5
1,2,A,M,3
2,3,B,F,8
3,4,B,M,9


### Reshaping by pivoting

In [4]:
# use the 2nd df.pivot. Dont know why!!!
trials.pivot(index='treatment',
            columns= 'gender',
            values= 'response')

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,3
B,8,9


In [5]:
trials.pivot(index='treatment', columns='gender')

Unnamed: 0_level_0,id,id,response,response
gender,F,M,F,M
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,1,2,5,3
B,3,4,8,9


In [6]:
users = pd.read_csv('visitors.csv')
users

Unnamed: 0,weekday,city,visitors,signups
0,Sun,Austin,139,7
1,Sun,Dallas,237,12
2,Mon,Austin,326,3
3,Mon,Dallas,456,5


In [7]:
## Pivoting a single variable

#Pivot the users DataFrame with the rows indexed by 'weekday', 
#the columns indexed by 'city', and the values populated with 'visitors'.


# Pivot the users DataFrame: visitors_pivot
visitors_pivot = users.pivot(index='weekday',
                            columns='city',
                            values='visitors')

# Print the pivoted DataFrame
print(visitors_pivot)


city     Austin  Dallas
weekday                
Mon         326     456
Sun         139     237


In [8]:
users.pivot(index='weekday',columns='city')

Unnamed: 0_level_0,visitors,visitors,signups,signups
city,Austin,Dallas,Austin,Dallas
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Mon,326,456,3,5
Sun,139,237,7,12


In [9]:
users.pivot(index='weekday',columns='city', values='signups')

city,Austin,Dallas
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,3,5
Sun,7,12


In [10]:
# Pivot users with signups indexed by weekday and city: signups_pivot
signups_pivot = users.pivot(index='weekday',columns='city', values='signups')

# Print signups_pivot
print(signups_pivot)

# Pivot users pivoted by both signups and visitors: pivot
pivot = users.pivot(index='weekday', columns='city')

# Print the pivoted DataFrame
print(pivot)


city     Austin  Dallas
weekday                
Mon           3       5
Sun           7      12
        visitors        signups       
city      Austin Dallas  Austin Dallas
weekday                               
Mon          326    456       3      5
Sun          139    237       7     12


# Stacking  and unstacking Dataframes

### Creating a nulti-level index
- input : list of column labels to use as tuples in the multi=level index, ('treatment','gender')

Pivot wont work bc of the multi-level index

In [11]:
trials

Unnamed: 0,id,treatment,gender,response
0,1,A,F,5
1,2,A,M,3
2,3,B,F,8
3,4,B,M,9


In [12]:
trials = trials.set_index(['treatment','gender'])
trials

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
treatment,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
A,F,1,5
A,M,2,3
B,F,3,8
B,M,4,9


### Unstacking a multi-index (1)
Similar to pivot method

- move some of the index lebvels to columns, making our DF shorter and wider(more cols, few rows)

ex: move the second level of the index, (gender) to the columns using `.unstack`

- now we have hierarchical cols

In [13]:
trials

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
treatment,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
A,F,1,5
A,M,2,3
B,F,3,8
B,M,4,9


In [14]:
trials.unstack(level='gender')

Unnamed: 0_level_0,id,id,response,response
gender,F,M,F,M
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,1,2,5,3
B,3,4,8,9


### Unstacking a multi-index (2)

same result, diffrent way

In [15]:
trials

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
treatment,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
A,F,1,5
A,M,2,3
B,F,3,8
B,M,4,9


In [16]:
trials.unstack(level=1)

Unnamed: 0_level_0,id,id,response,response
gender,F,M,F,M
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,1,2,5,3
B,3,4,8,9


### Stacking DataFrames


In [17]:
trials_by_gender = trials.unstack(level='gender')
trials_by_gender

Unnamed: 0_level_0,id,id,response,response
gender,F,M,F,M
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,1,2,5,3
B,3,4,8,9


In [18]:
trials_by_gender.stack(level='gender')

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
treatment,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
A,F,1,5
A,M,2,3
B,F,3,8
B,M,4,9


In [19]:
### Stacking DF


In [20]:
stacked = trials_by_gender.stack(level='gender')
stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
treatment,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
A,F,1,5
A,M,2,3
B,F,3,8
B,M,4,9


### Swapping levels

In [21]:
swapped = stacked.swaplevel(0,1)
swapped

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
gender,treatment,Unnamed: 2_level_1,Unnamed: 3_level_1
F,A,1,5
M,A,2,3
F,B,3,8
M,B,4,9


### Sorting rows

In [22]:
sorted_trials = swapped.sort_index()
sorted_trials

Unnamed: 0_level_0,Unnamed: 1_level_0,id,response
gender,treatment,Unnamed: 2_level_1,Unnamed: 3_level_1
F,A,1,5
F,B,3,8
M,A,2,3
M,B,4,9


In [23]:
# Stacking & unstacking I
users = pd.read_csv('users.csv')


users = users.set_index(['city','weekday']).sort_index()  # replicate from code
users

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,visitors,signups
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austin,Mon,2,326,3
Austin,Sun,0,139,7
Dallas,Mon,3,456,5
Dallas,Sun,1,237,12


In [24]:
users.columns

Index(['Unnamed: 0', 'visitors', 'signups'], dtype='object')

In [25]:
# Unstack users by 'weekday': byweekday
byweekday = users.unstack(level='weekday')

# Print the byweekday DataFrame
print(byweekday)

# Stack byweekday by 'weekday' and print it
print(byweekday.stack(level='weekday'))

        Unnamed: 0     visitors      signups    
weekday        Mon Sun      Mon  Sun     Mon Sun
city                                            
Austin           2   0      326  139       3   7
Dallas           3   1      456  237       5  12
                Unnamed: 0  visitors  signups
city   weekday                               
Austin Mon               2       326        3
       Sun               0       139        7
Dallas Mon               3       456        5
       Sun               1       237       12


In [26]:
# Stacking & unstacking II

# Unstack users by 'city': bycity
bycity = users.unstack(level='city')

# Print the bycity DataFrame
print(bycity)

# Stack bycity by 'city' and print it
print(bycity.stack(level='city'))


        Unnamed: 0        visitors        signups       
city        Austin Dallas   Austin Dallas  Austin Dallas
weekday                                                 
Mon              2      3      326    456       3      5
Sun              0      1      139    237       7     12
                Unnamed: 0  visitors  signups
weekday city                                 
Mon     Austin           2       326        3
        Dallas           3       456        5
Sun     Austin           0       139        7
        Dallas           1       237       12


In [27]:
## Restoring the index order
# The goal here is to convert bycity back to something that looks like users.


# Stack 'city' back into the index of bycity: newusers
newusers = bycity.stack(level='city')

# Swap the levels of the index of newusers: newusers
newusers = newusers.swaplevel(0,1)

# Print newusers and verify that the index is not sorted
print(newusers)

# Sort the index of newusers: newusers
newusers = newusers.sort_index()

# Print newusers and verify that the index is now sorted
print(newusers)

# Verify that the new DataFrame is equal to the original
print(newusers.equals(users))

                Unnamed: 0  visitors  signups
city   weekday                               
Austin Mon               2       326        3
Dallas Mon               3       456        5
Austin Sun               0       139        7
Dallas Sun               1       237       12
                Unnamed: 0  visitors  signups
city   weekday                               
Austin Mon               2       326        3
       Sun               0       139        7
Dallas Mon               3       456        5
       Sun               1       237       12
True


# Melting DataFrames

### Clinical trials data


In [28]:
trials = pd.read_csv('trials_01.csv')
trials

Unnamed: 0,id,treatment,gender,response
0,1,A,F,5
1,2,A,M,3
2,3,B,F,8
3,4,B,M,9


### Clinical trials after pivoting

In [29]:
trials.pivot(index='treatment', columns='gender', values='response')

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,3
B,8,9


In [30]:
new_trials = pd.read_csv('trials_02.csv')
new_trials

Unnamed: 0,treatment,F,M
0,A,5,3
1,B,8,9


### Melting DataFrame

In [31]:
pd.melt(new_trials)

Unnamed: 0,variable,value
0,treatment,A
1,treatment,B
2,F,5
3,F,8
4,M,3
5,M,9


### Specifying id_vars

In [32]:
pd.melt(new_trials, id_vars=['treatment'])

Unnamed: 0,treatment,variable,value
0,A,F,5
1,B,F,8
2,A,M,3
3,B,M,9


## specifying `value_vars`

In [33]:
pd.melt(new_trials, id_vars=['treatment'], value_vars=['F','M'])

Unnamed: 0,treatment,variable,value
0,A,F,5
1,B,F,8
2,A,M,3
3,B,M,9


## specifying `value_name`

In [34]:
pd.melt(new_trials, id_vars=['treatment'],
       var_name='gender', value_name='response')

Unnamed: 0,treatment,gender,response
0,A,F,5
1,B,F,8
2,A,M,3
3,B,M,9


In [35]:
trials

Unnamed: 0,id,treatment,gender,response
0,1,A,F,5
1,2,A,M,3
2,3,B,F,8
3,4,B,M,9


---
## Lets practice

In [36]:
users = pd.read_csv('visitors.csv')
users = users[['weekday', 'city','visitors']]
visitors_by_city_weekday = users.pivot(index='weekday', columns='city')
visitors_by_city_weekday

Unnamed: 0_level_0,visitors,visitors
city,Austin,Dallas
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2
Mon,326,456
Sun,139,237


In [37]:
# Adding names for readability

# Reset the index: visitors_by_city_weekday
visitors_by_city_weekday = visitors_by_city_weekday.reset_index() 

# Print visitors_by_city_weekday
print(visitors_by_city_weekday)

# Melt visitors_by_city_weekday: visitors
visitors = pd.melt(visitors_by_city_weekday, id_vars=['weekday'], value_name='visitors')

# Print visitors
print(visitors)

     weekday visitors       
city           Austin Dallas
0        Mon      326    456
1        Sun      139    237
  weekday       NaN    city  visitors
0     Mon  visitors  Austin       326
1     Sun  visitors  Austin       139
2     Mon  visitors  Dallas       456
3     Sun  visitors  Dallas       237


In [38]:
# Going from wide to long
users = pd.read_csv('visitors.csv')
users

Unnamed: 0,weekday,city,visitors,signups
0,Sun,Austin,139,7
1,Sun,Dallas,237,12
2,Mon,Austin,326,3
3,Mon,Dallas,456,5


In [39]:
# Melt users: skinny
skinny = pd.melt(users,id_vars=['weekday','city'],value_vars=['visitors','signups'])
# Print skinny
print(skinny)

  weekday    city  variable  value
0     Sun  Austin  visitors    139
1     Sun  Dallas  visitors    237
2     Mon  Austin  visitors    326
3     Mon  Dallas  visitors    456
4     Sun  Austin   signups      7
5     Sun  Dallas   signups     12
6     Mon  Austin   signups      3
7     Mon  Dallas   signups      5


In [40]:
# Obtaining key-value pairs with melt()

# Set the new index: users_idx
users_idx = users.set_index(['city','weekday'])

# Print the users_idx DataFrame
print(users_idx)

# Obtain the key-value pairs: kv_pairs
kv_pairs = pd.melt(users_idx, col_level=0)

# Print the key-value pairs
print(kv_pairs)


                visitors  signups
city   weekday                   
Austin Sun           139        7
Dallas Sun           237       12
Austin Mon           326        3
Dallas Mon           456        5
   variable  value
0  visitors    139
1  visitors    237
2  visitors    326
3  visitors    456
4   signups      7
5   signups     12
6   signups      3
7   signups      5


# Pivot tables
## more clinical trials data

In [41]:
more_trials = pd.read_csv('trials_03.csv')
more_trials

Unnamed: 0,id,treatment,gender,response
0,1,A,F,5
1,2,A,M,3
2,3,A,M,8
3,4,A,F,9
4,5,B,F,1
5,6,B,M,8
6,7,B,F,4
7,8,B,F,6


### Rearranging by pivoting
- repeat pairs make it impossible
- requires unique index/column pairs to identifyvalues in teh new table

```python
more_trials.pivot(index='treatment',
                  columns='gender', 
                  values='response')
----------------------------------------------------------------------
ValueError: Index contains duplicate entries, cannot reshape
    
```

### Pivot tables
- reshapes a dataframe much like a pivot
- summarizing the datafram ewith a pair of summarizing variables and their values
- deal with multiple values for the same index/column pair using a reduction
- by default, the reductions is an `average`

In [42]:
more_trials.pivot_table(index='treatment',
                  columns='gender', 
                  values='response')

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,7.0,5.5
B,3.666667,8.0


### other aggregations

`aggfunc='count'`: count the values that are rpesent for each tratment/gender combination


In [43]:
# outcome is a frequency table!

more_trials.pivot_table(index='treatment',
                  columns='gender', 
                  values='response',
                       aggfunc='count')

gender,F,M
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2,2
B,3,1


---
# Let's practice!

In [44]:
# Setting up a pivot table

# Create the DataFrame with the appropriate pivot table: by_city_day
by_city_day = users.pivot_table(index='weekday',
columns='city')

# Print by_city_day
print(by_city_day)

        signups        visitors       
city     Austin Dallas   Austin Dallas
weekday                               
Mon           3      5      326    456
Sun           7     12      139    237


In [45]:
# Using other aggregations in pivot tables
# Use a pivot table to display the count of each column: count_by_weekday1
count_by_weekday1 = users.pivot_table(index='weekday',
                                    aggfunc='count')

# Print count_by_weekday
print(count_by_weekday1)

# Replace 'aggfunc='count'' with 'aggfunc=len': count_by_weekday2
count_by_weekday2 = users.pivot_table(index='weekday',aggfunc=len)

# Verify that the same result is obtained
print('==========================================')
print(count_by_weekday1.equals(count_by_weekday2))

         city  signups  visitors
weekday                         
Mon         2        2         2
Sun         2        2         2
True


In [46]:
# Using margins in pivot tables
# Create the DataFrame with the appropriate pivot table: signups_and_visitors
signups_and_visitors = users.pivot_table(index='weekday',aggfunc=sum)

# Print signups_and_visitors
print(signups_and_visitors)

# Add in the margins: signups_and_visitors_total 
signups_and_visitors_total = users.pivot_table(index='weekday',
                                                aggfunc=sum,
                                                margins=True)


# Print signups_and_visitors_total
print(signups_and_visitors_total)


         signups  visitors
weekday                   
Mon            8       782
Sun           19       376
         signups  visitors
weekday                   
Mon            8       782
Sun           19       376
All           27      1158
