# Module 9: Restructuring Data into Tidy Form


In [1]:
import pandas as pd
import numpy as np

## 9.1 Tidying when multiple variables are stored as column names

In [2]:
# Load the weightlifting table. It is in wide form: one row per weight category and
# one column per sex/age group (column names such as 'M35 35-39').
weightlifting = pd.read_csv('data/weightlifting_men.csv')
weightlifting

Unnamed: 0,Weight Category,M35 35-39,M40 40-44,M45 45-49,M50 50-54,M55 55-59,M60 60-64,M65 65-69,M70 70-74,M75 75-79,M80 80+
0,56,137,130,125,115,102,92,80,67,62,55
1,62,152,145,137,127,112,102,90,75,67,57
2,69,167,160,150,140,125,112,97,82,75,60
3,77,182,172,165,150,135,122,107,90,82,65
4,85,192,182,175,160,142,130,112,95,87,70
5,94,202,192,182,167,150,137,120,100,90,75
6,105,210,200,190,175,157,142,122,102,95,80
7,105+,217,207,197,182,165,150,127,107,100,85


In [3]:
# Reshape from wide to long: 'Weight Category' stays as the identifier, the former
# column names go into 'sex_age' and the cell values into 'Qual Total'.
wl_melt = pd.melt(
    weightlifting,
    id_vars='Weight Category',
    var_name='sex_age',
    value_name='Qual Total',
)
wl_melt.head()

Unnamed: 0,Weight Category,sex_age,Qual Total
0,56,M35 35-39,137
1,62,M35 35-39,152
2,69,M35 35-39,167
3,77,M35 35-39,182
4,85,M35 35-39,192


In [4]:
# Extract the sex and age information from the 'sex_age' column and put it in two
# separate columns, to be titled 'Sex' and 'Age Group'.
# .str.split with no separator splits on whitespace; expand=True returns the pieces
# as DataFrame columns.

sex_age = wl_melt['sex_age'].str.split(expand=True)
sex_age.head()

Unnamed: 0,0,1
0,M35,35-39
1,M35,35-39
2,M35,35-39
3,M35,35-39
4,M35,35-39


In [5]:
# Replace the default integer column labels (0 and 1) produced by the split with
# descriptive names.
sex_age.columns = ['Sex', 'Age Group']
sex_age.head()

Unnamed: 0,Sex,Age Group
0,M35,35-39
1,M35,35-39
2,M35,35-39
3,M35,35-39
4,M35,35-39


In [6]:
# Keep only the leading letter of the 'Sex' values (e.g. 'M35' -> 'M').
sex_first_char = sex_age['Sex'].str.get(0)
sex_age['Sex'] = sex_first_char
sex_age.head()

Unnamed: 0,Sex,Age Group
0,M,35-39
1,M,35-39
2,M,35-39
3,M,35-39
4,M,35-39


In [7]:
# Put the split-out 'Sex' / 'Age Group' columns next to the remaining melted columns.
# sex_age was derived from wl_melt, so both frames share the same row index and
# concat lines them up row for row.
wl_cat_total = wl_melt[['Weight Category', 'Qual Total']]
wl_tidy = pd.concat(
    [sex_age, wl_cat_total],
    axis=1,
)
wl_tidy.head()

Unnamed: 0,Sex,Age Group,Weight Category,Qual Total
0,M,35-39,56,137
1,M,35-39,62,152
2,M,35-39,69,167
3,M,35-39,77,182
4,M,35-39,85,192


## 9.2 Tidying when multiple variables are stored as column values

In [8]:
# Load the restaurant inspections, parsing 'Date' as datetimes. Each inspection
# (Name, Date) is spread over several rows: 'Info' holds a variable name
# (Borough, Cuisine, Description, Grade, Score) and 'Value' holds its value.
inspections = pd.read_csv('data/restaurant_inspections.csv', parse_dates=['Date'])
inspections.head(10)

Unnamed: 0,Name,Date,Info,Value
0,E & E Grill House,2017-08-08,Borough,MANHATTAN
1,E & E Grill House,2017-08-08,Cuisine,American
2,E & E Grill House,2017-08-08,Description,Non-food contact surface improperly constructe...
3,E & E Grill House,2017-08-08,Grade,A
4,E & E Grill House,2017-08-08,Score,9.0
5,PIZZA WAGON,2017-04-12,Borough,BROOKLYN
6,PIZZA WAGON,2017-04-12,Cuisine,Pizza
7,PIZZA WAGON,2017-04-12,Description,"Food contact surface not properly washed, rins..."
8,PIZZA WAGON,2017-04-12,Grade,A
9,PIZZA WAGON,2017-04-12,Score,10.0


In [9]:
# Move 'Name', 'Date' and 'Info' into a three-level row index, leaving 'Value' as
# the only column (preview only; the result is not assigned).

inspections.set_index(['Name','Date', 'Info']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Value
Name,Date,Info,Unnamed: 3_level_1
E & E Grill House,2017-08-08,Borough,MANHATTAN
E & E Grill House,2017-08-08,Cuisine,American
E & E Grill House,2017-08-08,Description,Non-food contact surface improperly constructe...
E & E Grill House,2017-08-08,Grade,A
E & E Grill House,2017-08-08,Score,9.0


In [10]:
# Unstack the 'Info' index level: every distinct Info label becomes its own column
# underneath the top-level 'Value' column label (preview only; not assigned).

inspections.set_index(['Name','Date', 'Info']).unstack('Info').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Info,Borough,Cuisine,Description,Grade,Score
Name,Date,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
3 STAR JUICE CENTER,2017-05-10,BROOKLYN,"Juice, Smoothies, Fruit Salads",Facility not vermin proof. Harborage or condit...,A,12.0
A & L PIZZA RESTAURANT,2017-08-22,BROOKLYN,Pizza,Facility not vermin proof. Harborage or condit...,A,9.0
AKSARAY TURKISH CAFE AND RESTAURANT,2017-07-25,BROOKLYN,Turkish,Plumbing not properly installed or maintained;...,A,13.0
ANTOJITOS DELI FOOD,2017-06-01,BROOKLYN,"Latin (Cuban, Dominican, Puerto Rican, South &...",Live roaches present in facility's food and/or...,A,10.0
BANGIA,2017-06-16,MANHATTAN,Korean,Covered garbage receptacle not provided or ina...,A,9.0


In [11]:
# Pivot the 'Info' labels into columns, then turn 'Name' and 'Date' back into
# ordinary columns. col_level=-1 places their labels on the bottom column level,
# i.e. on the same level as the Info labels.
insp_tidy = (
    inspections
    .set_index(['Name', 'Date', 'Info'])
    .unstack('Info')
    .reset_index(col_level=-1)
)
insp_tidy.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Value,Value,Value,Value,Value
Info,Name,Date,Borough,Cuisine,Description,Grade,Score
0,3 STAR JUICE CENTER,2017-05-10,BROOKLYN,"Juice, Smoothies, Fruit Salads",Facility not vermin proof. Harborage or condit...,A,12.0
1,A & L PIZZA RESTAURANT,2017-08-22,BROOKLYN,Pizza,Facility not vermin proof. Harborage or condit...,A,9.0
2,AKSARAY TURKISH CAFE AND RESTAURANT,2017-07-25,BROOKLYN,Turkish,Plumbing not properly installed or maintained;...,A,13.0
3,ANTOJITOS DELI FOOD,2017-06-01,BROOKLYN,"Latin (Cuban, Dominican, Puerto Rican, South &...",Live roaches present in facility's food and/or...,A,10.0
4,BANGIA,2017-06-16,MANHATTAN,Korean,Covered garbage receptacle not provided or ina...,A,9.0


In [12]:
# Flatten the two-level column index down to its bottom level (Name, Date, Borough, ...)
# and clear the leftover 'Info' axis name.
# get_level_values(-1) is used instead of droplevel(0) so that the cell can be re-run:
# droplevel raises once the columns are already single-level, whereas
# get_level_values(-1) simply returns the existing labels in that case.
insp_tidy.columns = insp_tidy.columns.get_level_values(-1).rename(None)
insp_tidy.head()

Unnamed: 0,Name,Date,Borough,Cuisine,Description,Grade,Score
0,3 STAR JUICE CENTER,2017-05-10,BROOKLYN,"Juice, Smoothies, Fruit Salads",Facility not vermin proof. Harborage or condit...,A,12.0
1,A & L PIZZA RESTAURANT,2017-08-22,BROOKLYN,Pizza,Facility not vermin proof. Harborage or condit...,A,9.0
2,AKSARAY TURKISH CAFE AND RESTAURANT,2017-07-25,BROOKLYN,Turkish,Plumbing not properly installed or maintained;...,A,13.0
3,ANTOJITOS DELI FOOD,2017-06-01,BROOKLYN,"Latin (Cuban, Dominican, Puerto Rican, South &...",Live roaches present in facility's food and/or...,A,10.0
4,BANGIA,2017-06-16,MANHATTAN,Korean,Covered garbage receptacle not provided or ina...,A,9.0


## 9.3 Tidying when two or more values are stored in the same cell

In [13]:
# Load the Texas cities table. 'Geolocation' packs latitude, longitude and their
# directions into one string per city (e.g. "29.7604° N, 95.3698° W").
cities = pd.read_csv('data/texas_cities.csv')
cities

Unnamed: 0,City,Geolocation
0,Houston,"29.7604° N, 95.3698° W"
1,Dallas,"32.7767° N, 96.7970° W"
2,Austin,"30.2672° N, 97.7431° W"


In [14]:
# Split each Geolocation string wherever "any single character followed by a space"
# occurs (the pattern '. ' is interpreted as a regular expression). That consumes the
# '° ' and ', ' separators and leaves four pieces per row.
geo_strings = cities['Geolocation']
geolocations = geo_strings.str.split(pat='. ', expand=True)
geolocations.columns = [
    'latitude',
    'latitude direction',
    'longitude',
    'longitude direction',
]
geolocations

Unnamed: 0,latitude,latitude direction,longitude,longitude direction
0,29.7604,N,95.3698,W
1,32.7767,N,96.797,W
2,30.2672,N,97.7431,W


In [15]:
# The split pieces are strings; convert the two coordinate columns to floats and
# leave the direction columns untouched.
numeric_dtypes = {'latitude': 'float', 'longitude': 'float'}
geolocations = geolocations.astype(numeric_dtypes)
geolocations.dtypes

latitude               float64
latitude direction      object
longitude              float64
longitude direction     object
dtype: object

In [16]:
# Attach the city names to the parsed coordinate columns, side by side.

city_names = cities['City']
cities_tidy = pd.concat([city_names, geolocations], axis=1)
cities_tidy

Unnamed: 0,City,latitude,latitude direction,longitude,longitude direction
0,Houston,29.7604,N,95.3698,W
1,Dallas,32.7767,N,96.797,W
2,Austin,30.2672,N,97.7431,W


In [17]:
# NOTE(review): this repeats the concatenation already stored in cities_tidy by the
# previous cell and does not assign its result — looks redundant; confirm whether
# the cell can be removed.
pd.concat([cities['City'], geolocations], axis='columns')

Unnamed: 0,City,latitude,latitude direction,longitude,longitude direction
0,Houston,29.7604,N,95.3698,W
1,Dallas,32.7767,N,96.797,W
2,Austin,30.2672,N,97.7431,W


In [18]:
# Alternative split with a more explicit regular expression: break on either '° '
# or ', '. The result is displayed only (not assigned) and its columns keep the
# default integer labels.
cities.Geolocation.str.split(pat='° |, ', expand=True)

Unnamed: 0,0,1,2,3
0,29.7604,N,95.3698,W
1,32.7767,N,96.797,W
2,30.2672,N,97.7431,W


## 9.4 Tidying when variables are stored in column names and values

In [19]:
# Load the sensor readings: one row per (Group, Property) pair and one column per
# year, so variables are stored both in the 'Property' values and in the year
# column names.
sensors = pd.read_csv('data/sensors.csv')
sensors

Unnamed: 0,Group,Property,2012,2013,2014,2015,2016
0,A,Pressure,928,873,814,973,870
1,A,Temperature,1026,1038,1009,1036,1042
2,A,Flow,819,806,861,882,856
3,B,Pressure,817,877,914,806,942
4,B,Temperature,1008,1041,1009,1002,1013
5,B,Flow,887,899,837,824,873


In [20]:
# Stack the year columns into rows: the former column names go into 'Year' and the
# readings into the default 'value' column.
sensors_melted = pd.melt(
    sensors,
    id_vars=['Group', 'Property'],
    var_name='Year',
    value_name='value',
)
sensors_melted.head(10)

Unnamed: 0,Group,Property,Year,value
0,A,Pressure,2012,928
1,A,Temperature,2012,1026
2,A,Flow,2012,819
3,B,Pressure,2012,817
4,B,Temperature,2012,1008
5,B,Flow,2012,887
6,A,Pressure,2013,873
7,A,Temperature,2013,1038
8,A,Flow,2013,806
9,B,Pressure,2013,877


In [21]:
# Spread the 'Property' labels into columns, indexed by (Group, Year).
# pivot_table aggregates with the mean (its default, spelled out here); each
# (Group, Year, Property) combination occurs once in the melted data, so the
# readings come through unchanged.
sensors_pivoted = sensors_melted.pivot_table(
    index=['Group', 'Year'],
    columns='Property',
    values='value',
    aggfunc='mean',
)
sensors_pivoted.head()

Unnamed: 0_level_0,Property,Flow,Pressure,Temperature
Group,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,2012,819,928,1026
A,2013,806,873,1038
A,2014,861,814,1009
A,2015,882,973,1036
A,2016,856,870,1042


In [22]:
# Remove the row MultiIndex: move the 'Group' and 'Year' index levels back into
# ordinary columns. The columns axis still carries the name 'Property' afterwards.

sensors_reset = sensors_pivoted.reset_index() 
sensors_reset.head(10)

Property,Group,Year,Flow,Pressure,Temperature
0,A,2012,819,928,1026
1,A,2013,806,873,1038
2,A,2014,861,814,1009
3,A,2015,882,973,1036
4,A,2016,856,870,1042
5,B,2012,887,817,1008
6,B,2013,899,877,1041
7,B,2014,837,914,1009
8,B,2015,824,806,1002
9,B,2016,873,942,1013


In [23]:
# Clear the leftover 'Property' name on the columns axis so the frame has plain
# column labels.
sensors_tidy = sensors_reset.rename_axis(None, axis=1)
sensors_tidy.head(10)

Unnamed: 0,Group,Year,Flow,Pressure,Temperature
0,A,2012,819,928,1026
1,A,2013,806,873,1038
2,A,2014,861,814,1009
3,A,2015,882,973,1036
4,A,2016,856,870,1042
5,B,2012,887,817,1008
6,B,2013,899,877,1041
7,B,2014,837,914,1009
8,B,2015,824,806,1002
9,B,2016,873,942,1013


In [24]:
# The whole melt -> pivot -> reset -> rename sequence as a single method chain
# (displayed only; the result is not assigned).
(
    sensors
    .melt(id_vars=['Group', 'Property'], var_name='Year')
    .pivot_table(index=['Group', 'Year'], columns='Property', values='value')
    .reset_index()
    .rename_axis(None, axis='columns')
)

Unnamed: 0,Group,Year,Flow,Pressure,Temperature
0,A,2012,819,928,1026
1,A,2013,806,873,1038
2,A,2014,861,814,1009
3,A,2015,882,973,1036
4,A,2016,856,870,1042
5,B,2012,887,817,1008
6,B,2013,899,877,1041
7,B,2014,837,914,1009
8,B,2015,824,806,1002
9,B,2016,873,942,1013


## 9.5 Tidying when multiple observational units are stored in the same table

In [42]:
# Load the movie table. Each row mixes movie-level fields (title, rating, year,
# duration) with director and actor fields held in numbered columns
# (director_1, actor_1..actor_3 and the matching *_fb_likes_N columns).
movie = pd.read_csv('data/movie_altered.csv')
movie.head()

Unnamed: 0,title,rating,year,duration,director_1,director_fb_likes_1,actor_1,actor_2,actor_3,actor_fb_likes_1,actor_fb_likes_2,actor_fb_likes_3
0,Avatar,PG-13,2009.0,178.0,James Cameron,0.0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,PG-13,2007.0,169.0,Gore Verbinski,563.0,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,PG-13,2015.0,148.0,Sam Mendes,0.0,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,PG-13,2012.0,164.0,Christopher Nolan,22000.0,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,,,,Doug Walker,131.0,Doug Walker,Rob Walker,,131.0,12.0,


In [43]:
# Pick out Christopher Nolan's movies to check that a director's Facebook likes do
# not vary from movie to movie, i.e. they describe the director, not the movie.

select_cols = ['title', 'director_1', 'director_fb_likes_1']
criteria = movie['director_1'] == 'Christopher Nolan'
movie.loc[criteria, select_cols]



Unnamed: 0,title,director_1,director_fb_likes_1
3,The Dark Knight Rises,Christopher Nolan,22000.0
66,The Dark Knight,Christopher Nolan,22000.0
96,Interstellar,Christopher Nolan,22000.0
97,Inception,Christopher Nolan,22000.0
120,Batman Begins,Christopher Nolan,22000.0
1057,Insomnia,Christopher Nolan,22000.0
1222,The Prestige,Christopher Nolan,22000.0
3646,Memento,Christopher Nolan,22000.0


In [44]:
# Give every movie a unique integer 'id' as the first column; it is used below as the
# row identifier (i='id') for pd.wide_to_long.
# DataFrame.insert mutates `movie` in place and raises ValueError if the column
# already exists, so the guard keeps this cell safe to re-run.
if 'id' not in movie.columns:
    movie.insert(0, 'id', np.arange(len(movie)))
movie.head()

Unnamed: 0,id,title,rating,year,duration,director_1,director_fb_likes_1,actor_1,actor_2,actor_3,actor_fb_likes_1,actor_fb_likes_2,actor_fb_likes_3
0,0,Avatar,PG-13,2009.0,178.0,James Cameron,0.0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,1,Pirates of the Caribbean: At World's End,PG-13,2007.0,169.0,Gore Verbinski,563.0,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,2,Spectre,PG-13,2015.0,148.0,Sam Mendes,0.0,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,3,The Dark Knight Rises,PG-13,2012.0,164.0,Christopher Nolan,22000.0,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,4,Star Wars: Episode VII - The Force Awakens,,,,Doug Walker,131.0,Doug Walker,Rob Walker,,131.0,12.0,


In [45]:
# Collapse the numbered columns (director_1, actor_1..3, *_fb_likes_N) into one column
# per stub name, with the trailing number moved into a new 'num' column. 'id'
# identifies the original row; sep='_' is the separator between stub and number.
stubnames = ['director', 'director_fb_likes', 'actor', 'actor_fb_likes']
movie_long = pd.wide_to_long(movie, stubnames=stubnames, i='id', j='num', sep='_')
movie_long = movie_long.reset_index()
# Make sure 'num' is stored as an integer.
movie_long['num'] = movie_long['num'].astype(int)
movie_long.head(9)

Unnamed: 0,id,num,rating,title,duration,year,director,director_fb_likes,actor,actor_fb_likes
0,0,1,PG-13,Avatar,178.0,2009.0,James Cameron,0.0,CCH Pounder,1000.0
1,0,2,PG-13,Avatar,178.0,2009.0,,,Joel David Moore,936.0
2,0,3,PG-13,Avatar,178.0,2009.0,,,Wes Studi,855.0
3,1,1,PG-13,Pirates of the Caribbean: At World's End,169.0,2007.0,Gore Verbinski,563.0,Johnny Depp,40000.0
4,1,2,PG-13,Pirates of the Caribbean: At World's End,169.0,2007.0,,,Orlando Bloom,5000.0
5,1,3,PG-13,Pirates of the Caribbean: At World's End,169.0,2007.0,,,Jack Davenport,1000.0
6,2,1,PG-13,Spectre,148.0,2015.0,Sam Mendes,0.0,Christoph Waltz,11000.0
7,2,2,PG-13,Spectre,148.0,2015.0,,,Rory Kinnear,393.0
8,2,3,PG-13,Spectre,148.0,2015.0,,,Stephanie Sigman,161.0


In [5]:
# Split the long frame into one table per observational unit. All three keep the
# movie 'id' so they can be related back to each other.
movie_table = movie_long[['id','title', 'year', 'duration', 'rating']]
director_table = movie_long[['id', 'director', 'num', 'director_fb_likes']]
actor_table = movie_long[['id', 'actor', 'num', 'actor_fb_likes']]

In [6]:
# At this point the movie-level columns are repeated once per 'num' value
# (three identical rows per movie id in the rows shown).
movie_table.head(9)

Unnamed: 0,id,title,year,duration,rating
0,0,Avatar,2009.0,178.0,PG-13
1,0,Avatar,2009.0,178.0,PG-13
2,0,Avatar,2009.0,178.0,PG-13
3,1,Pirates of the Caribbean: At World's End,2007.0,169.0,PG-13
4,1,Pirates of the Caribbean: At World's End,2007.0,169.0,PG-13
5,1,Pirates of the Caribbean: At World's End,2007.0,169.0,PG-13
6,2,Spectre,2015.0,148.0,PG-13
7,2,Spectre,2015.0,148.0,PG-13
8,2,Spectre,2015.0,148.0,PG-13


In [7]:
# In the rows shown, the director fields are filled only where num == 1; the rows
# for num 2 and 3 are missing.
director_table.head(9)

Unnamed: 0,id,director,num,director_fb_likes
0,0,James Cameron,1,0.0
1,0,,2,
2,0,,3,
3,1,Gore Verbinski,1,563.0
4,1,,2,
5,1,,3,
6,2,Sam Mendes,1,0.0
7,2,,2,
8,2,,3,


In [8]:
# One row per (movie id, num) pair, where 'num' is the actor slot number 1-3.
actor_table.head(9)

Unnamed: 0,id,actor,num,actor_fb_likes
0,0,CCH Pounder,1,1000.0
1,0,Joel David Moore,2,936.0
2,0,Wes Studi,3,855.0
3,1,Johnny Depp,1,40000.0
4,1,Orlando Bloom,2,5000.0
5,1,Jack Davenport,3,1000.0
6,2,Christoph Waltz,1,11000.0
7,2,Rory Kinnear,2,393.0
8,2,Stephanie Sigman,3,161.0


In [50]:
# Count the missing values in each movie_table column.
# NOTE(review): this cell's execution count is out of sequence with its neighbours,
# so the saved counts may not correspond to the frame as built above — re-run to confirm.
movie_table.isnull().sum()

id            0
title         0
year        106
duration     15
rating      300
dtype: int64

In [65]:
# Reduce each table to its distinct observations and renumber the rows from 0.
# - movie_table: drop the exact duplicate rows created by the long reshape.
# - director_table / actor_table: drop rows containing any missing value, which
#   removes the empty director rows for num 2 and 3.
# NOTE(review): dropna() with its defaults also removes rows where a name is present
# but the likes value is missing (and vice versa) — confirm that is intended.
movie_table = movie_table.drop_duplicates().reset_index(drop=True)
director_table = director_table.dropna().reset_index(drop=True)
actor_table = actor_table.dropna().reset_index(drop=True)

In [10]:
# After de-duplication the rows shown hold one entry per movie id.
movie_table.head()

Unnamed: 0,id,title,year,duration,rating
0,0,Avatar,2009.0,178.0,PG-13
1,1,Pirates of the Caribbean: At World's End,2007.0,169.0,PG-13
2,2,Spectre,2015.0,148.0,PG-13
3,3,The Dark Knight Rises,2012.0,164.0,PG-13
4,4,Star Wars: Episode VII - The Force Awakens,,,


In [66]:
# NOTE(review): the saved output of this cell includes a 'director_id' column that no
# visible cell creates — presumably added by a deleted or out-of-order cell. Restore
# that step (or clear the stale output) so the notebook reproduces from a fresh kernel.
director_table.head()

Unnamed: 0,id,director_id,director,num,director_fb_likes
0,0,922,James Cameron,1,0.0
1,1,794,Gore Verbinski,1,563.0
2,2,2020,Sam Mendes,1,0.0
3,3,373,Christopher Nolan,1,22000.0
4,4,600,Doug Walker,1,131.0
