# Tidyr a la Python

taken from: https://www.ibm.com/developerworks/community/blogs/jfp/entry/Tidy_Data_In_Python?lang=en

In [2]:
import pandas as pd
import numpy as np

# Wide ("messy") layout: one row per person, one column per treatment.
# np.nan marks the missing Treatment A result in the first row.
messy = pd.DataFrame(
    data={
        "First": ["John", "Jane", "Mary"],
        "Last": ["Smith", "Doe", "Johnson"],
        "Treatment A": [np.nan, 16, 3],
        "Treatment B": [2, 11, 1],
    }
)

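## Problem 1: Column headers are values, not variable names. `Treatment A` and `Treatment B` should be represented by a single variable that holds the treatment name, with a second variable holding the result. In R, we'd use `gather()`. Let's see what we can do in Python: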

In [4]:
# Show the wide frame: each treatment has its own column.
messy

Unnamed: 0,First,Last,Treatment A,Treatment B
0,John,Smith,,2
1,Jane,Doe,16.0,11
2,Mary,Johnson,3.0,1


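To do the equivalent of `gather()`, we use `pd.melt()` and tell it three things: which columns are real identifier variables (`id_vars`), what to call the new column that records which value column each row came from (`var_name`), and what to call the new column that holds the values themselves (`value_name`). Every column not listed in `id_vars` is treated as a value column. Thus: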

In [5]:
# First/Last identify a row; every other column is unpivoted so that its
# name lands in 'treatment' and its cell value in 'result'.
tidy = messy.melt(
    id_vars=['First', 'Last'],
    var_name='treatment',
    value_name='result',
)
tidy

Unnamed: 0,First,Last,treatment,result
0,John,Smith,Treatment A,
1,Jane,Doe,Treatment A,16.0
2,Mary,Johnson,Treatment A,3.0
3,John,Smith,Treatment B,2.0
4,Jane,Doe,Treatment B,11.0
5,Mary,Johnson,Treatment B,1.0


A further example:

In [7]:
messy = pd.DataFrame({'Agnostic' : [27, 34, 60, 81, 76, 137],
                      'Atheist' : [12, 27, 37, 52, 35, 70],
                      'Buddhist' : [27, 21, 30, 34, 33, 58],
                      'Catholic' : [418, 617, 732, 670, 638, 1116],
                      "Don't know/refused" : [15, 14, 15, 11, 10, 35],
                      'Evangelical Prot' : [575, 869, 1064, 982, 881, 1486],
                      'Hindu' : [1, 9, 7, 9, 11, 34],
                      'Historically Black Prot' : [228, 244, 236, 238, 197, 223],
                      "Jehovah's Witness" : [20, 27, 24, 24, 21, 30],
                      'Jewish' : [19, 19, 25, 25, 30, 95],
                     })
def transpose(df, columns):
    """Return a transposed copy of ``df`` with its old column labels as data.

    Each original column becomes a row, the original column labels are
    moved out of the index into the first column, and the resulting
    columns are then relabelled with ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transpose; it is not modified.
    columns : list-like
        New column labels: one for the column holding the former column
        labels, followed by one per original row.

    Returns
    -------
    pandas.DataFrame
        The transposed frame with a fresh integer index.
    """
    flipped = df.T.reset_index()
    flipped.columns = columns
    return flipped

# Flip the frame so each religion becomes a row, then label the leading
# column and the six income-bracket columns.
messy = transpose(
    messy,
    columns=['religion', '<$10k', '$10-20k', '$20-30k', '$30-40k', '$40-50k', '$50-75k'],
)

messy


Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k
0,Agnostic,27,34,60,81,76,137
1,Atheist,12,27,37,52,35,70
2,Buddhist,27,21,30,34,33,58
3,Catholic,418,617,732,670,638,1116
4,Don't know/refused,15,14,15,11,10,35
5,Evangelical Prot,575,869,1064,982,881,1486
6,Hindu,1,9,7,9,11,34
7,Historically Black Prot,228,244,236,238,197,223
8,Jehovah's Witness,20,27,24,24,21,30
9,Jewish,19,19,25,25,30,95


In [10]:
# Unpivot every income-bracket column into (Income_Bracket, Frequency)
# pairs, keeping 'religion' as the identifier.
# The equivalent of arrange in python is sort_values. It is chained here
# rather than called with inplace=True, so `tidy` is bound once to the
# finished result.
# Note: sort_values defaults to kind='quicksort', which is not stable, so
# the rows within one religion are not guaranteed to keep bracket order.
tidy = (
    pd.melt(messy, id_vars='religion', var_name='Income_Bracket', value_name='Frequency')
    .sort_values(by=['religion'])
)
tidy.head()

Unnamed: 0,religion,Income_Bracket,Frequency
0,Agnostic,<$10k,27
30,Agnostic,$30-40k,81
40,Agnostic,$40-50k,76
50,Agnostic,$50-75k,137
10,Agnostic,$10-20k,34
