# Pivot Tables and Crosstabs

In [3]:
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
import matplotlib.pyplot as plt
%matplotlib inline


# Load the Las Vegas review data and normalise the column names to snake_case:
# optional dots followed by whitespace become a single underscore, then lower-case.
Vegas = pd.read_csv('vegas.csv')
# Raw string: '\.' and '\s' are regex escapes, not Python string escapes.
# regex=True is explicit because pandas >= 2.0 defaults to a literal (non-regex)
# replacement, which would leave the column names untouched.
Vegas.columns = Vegas.columns.str.replace(r'\.*\s+', '_', regex=True).str.lower()
Vegas.columns

  Vegas.columns = Vegas.columns.str.replace('\.*\s+', '_').str.lower()


Index(['user_country', 'nr_reviews', 'nr_hotel_reviews', 'helpful_votes',
       'score', 'period_of_stay', 'traveler_type', 'pool', 'gym',
       'tennis_court', 'spa', 'casino', 'free_internet', 'hotel_name',
       'hotel_stars', 'nr_rooms', 'user_continent', 'member_years',
       'review_month', 'review_weekday'],
      dtype='object')

![image.png](attachment:fe3e7fe8-b3e9-43f5-a0b2-7807cbab1f4e.png)

In [2]:
# simple application of GROUPBY and AGG

In [4]:
# DataFrame.pivot_table()

In [6]:
# pd.pivot_table()


---

In [7]:
# Load the per-hotel summary table, keeping the default integer index.
H = pd.read_csv('gnhotels01.csv')

In [8]:
# Show the frame that was just loaded; a bare last expression renders as a rich table.
H

Unnamed: 0,hotel_name,mean_score,percent_non_usa,gym,tennis_court,spa,casino,free_internet,hotel_stars,nr_rooms
0,Bellagio Las Vegas,4.21,0.67,YES,NO,YES,YES,YES,5,3933
1,Caesars Palace,4.12,0.42,YES,NO,YES,YES,YES,5,3348
2,Circus Circus Hotel & Casino Las Vegas,3.21,0.58,YES,NO,NO,YES,YES,3,3773
3,Encore at wynn Las Vegas,4.54,0.42,YES,NO,YES,YES,YES,5,2034
4,Excalibur Hotel & Casino,3.71,0.75,YES,NO,YES,YES,YES,3,3981
5,Hilton Grand Vacations at the Flamingo,3.96,0.5,YES,NO,NO,NO,YES,3,315
6,Hilton Grand Vacations on the Boulevard,4.17,0.58,YES,NO,YES,YES,YES,35,1228
7,Marriott's Grand Chateau,4.54,0.67,YES,NO,NO,YES,YES,35,732
8,Monte Carlo Resort&Casino,3.29,0.75,YES,NO,YES,YES,NO,4,3003
9,Paris Las Vegas,4.04,0.38,YES,NO,YES,YES,YES,4,2916


In [9]:
# change index

In [10]:
# Reload the same file, this time using the hotel name as the row index.
H = pd.read_csv('gnhotels01.csv', index_col='hotel_name')

In [13]:
# Preview the first five rows (head() defaults to n=5).
H.head()

Unnamed: 0_level_0,mean_score,percent_non_usa,gym,tennis_court,spa,casino,free_internet,hotel_stars,nr_rooms
hotel_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bellagio Las Vegas,4.21,0.67,YES,NO,YES,YES,YES,5,3933
Caesars Palace,4.12,0.42,YES,NO,YES,YES,YES,5,3348
Circus Circus Hotel & Casino Las Vegas,3.21,0.58,YES,NO,NO,YES,YES,3,3773
Encore at wynn Las Vegas,4.54,0.42,YES,NO,YES,YES,YES,5,2034
Excalibur Hotel & Casino,3.71,0.75,YES,NO,YES,YES,YES,3,3981


---

# Create Pivot Table

### 3 aspects

In [17]:
# 1. what is aggregated, and how: here the mean (aggregation function)

# 2. what is the key for the y axis

# 3. key for the columns

In [15]:
# WRONG: pd.pivot_table() requires the DataFrame as its first argument (`data`), which is missing here

# H = pd.pivot_table(values = 'mean_score', index = 'tennis_court', columns = 'spa')

TypeError: pivot_table() missing 1 required positional argument: 'data'

In [16]:
# Function form of the pivot: the DataFrame is passed as the first argument.
# Rows are keyed by tennis_court, columns by spa; cells hold the mean of mean_score.
pd.pivot_table(H, values='mean_score', index='tennis_court', columns='spa')

spa,NO,YES
tennis_court,Unnamed: 1_level_1,Unnamed: 2_level_1
NO,3.95,4.13
YES,4.38,4.21


---

## pass in list of keys

In [20]:
# H.pivot_table(values = 'mean_score', index = ['tennis_court', 'pool'], columns = ['spa', 'hotel_stars'])

# error: 'pool' is not a column of H

In [21]:
# Passing lists of keys yields hierarchical (multi-level) rows and columns.
H.pivot_table(
    values='mean_score',
    index=['tennis_court', 'casino'],
    columns=['spa', 'hotel_stars'],
)

Unnamed: 0_level_0,spa,NO,NO,NO,YES,YES,YES,YES
Unnamed: 0_level_1,hotel_stars,3,"3,5","4,5",3,"3,5",4,5
tennis_court,casino,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
NO,NO,3.96,,,,,,
NO,YES,3.21,4.54,4.08,3.71,4.17,3.75,4.35
YES,NO,,4.38,,,,,
YES,YES,,,,4.21,,4.0,4.62


---

In [22]:
# margins=True appends an 'All' row and an 'All' column to the multi-key pivot.
H.pivot_table(
    values='mean_score',
    index=['tennis_court', 'casino'],
    columns=['spa', 'hotel_stars'],
    margins=True,
)

Unnamed: 0_level_0,spa,NO,NO,NO,YES,YES,YES,YES,All
Unnamed: 0_level_1,hotel_stars,3,"3,5","4,5",3,"3,5",4,5,Unnamed: 9_level_1
tennis_court,casino,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
NO,NO,3.96,,,,,,,3.96
NO,YES,3.21,4.54,4.08,3.71,4.17,3.75,4.35,4.09
YES,NO,,4.38,,,,,,4.38
YES,YES,,,,4.21,,4.0,4.62,4.21
All,,3.58,4.46,4.08,3.96,4.17,3.85,4.39,4.12


In [23]:
# added ALL for rows and columns

# pivot_table uses mean as the aggregation function by default

In [25]:
# Override the default aggregation (mean) with the minimum score per cell.
# The string alias 'min' is used instead of the Python builtin: passing the
# builtin callable is deprecated in recent pandas and emits a FutureWarning,
# and the alias matches how 'count' is passed elsewhere in this notebook.
H.pivot_table(values='mean_score', index='tennis_court', columns='spa', aggfunc='min')

spa,NO,YES
tennis_court,Unnamed: 1_level_1,Unnamed: 2_level_1
NO,3.21,3.29
YES,4.38,3.96


In [26]:
# WRONG on purpose: unlike `min`, `count` is not a Python builtin, so the bare
# name raises NameError. The error is caught and printed so the demonstration
# stays visible without aborting a Restart & Run All.
# NOTE: this also catches a NameError from an undefined H, so run the loading cell first.
try:
    H.pivot_table(values='mean_score', index='tennis_court', columns='spa', aggfunc=count)
except NameError as err:
    print(f"NameError: {err}")

NameError: name 'count' is not defined

In [27]:
# Pass the aggregation as the string alias 'count' to count the hotels in each cell.
H.pivot_table(
    values='mean_score',
    index='tennis_court',
    columns='spa',
    aggfunc='count',
)

spa,NO,YES
tennis_court,Unnamed: 1_level_1,Unnamed: 2_level_1
NO,4,12
YES,1,4


# <font color="red">A cross tab is a special type of pivot table</font>

---

# you can also get cross tabs directly from pandas

In [28]:
# Frequency table of tennis_court (rows) against spa (columns).
pd.crosstab(index=H['tennis_court'], columns=H['spa'])

spa,NO,YES
tennis_court,Unnamed: 1_level_1,Unnamed: 2_level_1
NO,4,12
YES,1,4
