# Examining the Data

In [83]:
import numpy as np
import pandas as pd
# Keep rendered frames compact: show at most 6 rows and 6 columns.
pd.set_option("display.max_rows", 6)
pd.set_option("display.max_columns", 6)

In [84]:
# Load the table stored under the key 'df' in the HDF5 file, then display it.
df = pd.read_hdf('data/beer.hdf', key='df')
df

Unnamed: 0,abv,beer_id,brewer_id,...,review_taste,text,time
0,7.0,2511,287,...,4.5,Batch 8144\tPitch black in color with a 1/2 f...,2009-10-05 21:31:48
1,5.7,19736,9790,...,4.0,Sampled from a 12oz bottle in a standard pint...,2009-10-05 21:32:09
2,4.8,11098,3182,...,3.5,Haystack yellow with an energetic group of bu...,2009-10-05 21:32:13
...,...,...,...,...,...,...,...
49997,8.1,21950,2372,...,4.5,Poured a light sucking crude oil beckoning bl...,2009-12-25 17:23:52
49998,4.6,5453,1306,...,3.5,"500ml brown bottle, 4.0% ABV. Pours a crystal...",2009-12-25 17:25:06
49999,9.4,47695,14879,...,4.5,"22 oz bottle poured into a flute glass, share...",2009-12-25 17:26:06


In [85]:
# Summarise the frame: per-column non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 13 columns):
abv                  48389 non-null float64
beer_id              50000 non-null int64
brewer_id            50000 non-null int64
beer_name            50000 non-null object
beer_style           50000 non-null object
review_appearance    50000 non-null float64
review_aroma         50000 non-null float64
review_overall       50000 non-null float64
review_palate        50000 non-null float64
profile_name         50000 non-null object
review_taste         50000 non-null float64
text                 49991 non-null object
time                 50000 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(6), int64(2), object(4)
memory usage: 5.3+ MB


# Boolean indexing

Boolean indexing works like a `WHERE` clause in SQL. The indexer (or boolean mask) should be 1-dimensional and the same length as the thing being indexed.

In [86]:
# Element-wise comparison: yields one boolean per row (the mask).
df['abv'] < 5

0        False
1        False
2         True
         ...  
49997    False
49998     True
49999    False
Name: abv, dtype: bool

In [87]:
# Pass the mask to `[]` to keep only the rows where it is True.
df[df['abv'] < 5].head(5)

Unnamed: 0,abv,beer_id,brewer_id,...,review_taste,text,time
2,4.8,11098,3182,...,3.5,Haystack yellow with an energetic group of bu...,2009-10-05 21:32:13
7,4.8,1669,256,...,4.5,"Ok, for starters great white I believe will b...",2009-10-05 21:34:29
21,4.6,401,118,...,4.0,"Poured into a snifter, revealing black opaque...",2009-10-05 21:47:36
22,4.9,5044,18968,...,4.0,A - a medium brown body with an off white hea...,2009-10-05 21:47:56
28,4.6,401,118,...,4.0,The color of this beer fits the name well. Op...,2009-10-05 21:53:38


Notice that we just used `[]` there. We can pass the boolean indexer into `.loc` as well.

In [88]:
# Rows come from the boolean mask, columns from the list of labels.
df.loc[df['abv'] < 5, ['beer_style', 'review_overall']]

Unnamed: 0,beer_style,review_overall
2,German Pilsener,3.0
7,Witbier,4.5
21,Scottish Ale,3.5
...,...,...
49976,Euro Pale Lager,3.0
49980,Herbed / Spiced Beer,4.0
49998,English Pale Ale,4.0


Again, you can get complicated: combine conditions with `&` (and) and `|` (or), wrapping each comparison in parentheses.

In [89]:
# Keep a review if it is highly rated (overall >= 4.5), OR if it is both
# low-ABV (< 5) and dated after the start of June 2009.
df[
    (df['review_overall'] >= 4.5)
    | ((df['abv'] < 5) & (df['time'] > pd.Timestamp('2009-06')))
]

Unnamed: 0,abv,beer_id,brewer_id,...,review_taste,text,time
0,7.0,2511,287,...,4.5,Batch 8144\tPitch black in color with a 1/2 f...,2009-10-05 21:31:48
1,5.7,19736,9790,...,4.0,Sampled from a 12oz bottle in a standard pint...,2009-10-05 21:32:09
2,4.8,11098,3182,...,3.5,Haystack yellow with an energetic group of bu...,2009-10-05 21:32:13
...,...,...,...,...,...,...,...
49997,8.1,21950,2372,...,4.5,Poured a light sucking crude oil beckoning bl...,2009-12-25 17:23:52
49998,4.6,5453,1306,...,3.5,"500ml brown bottle, 4.0% ABV. Pours a crystal...",2009-12-25 17:25:06
49999,9.4,47695,14879,...,4.5,"22 oz bottle poured into a flute glass, share...",2009-12-25 17:26:06


# Creating a boolean indexer with the contents of a column

Select just the rows where the `beer_style` contains IPA. 

In [90]:
# `.str.contains` returns a boolean Series, so it can be used directly as a mask.
df[df['beer_style'].str.contains('IPA')]

Unnamed: 0,abv,beer_id,brewer_id,...,review_taste,text,time
3,9.5,28577,3818,...,4.0,"The aroma has pine, wood, citrus, caramel, an...",2009-10-05 21:32:37
8,6.7,6549,140,...,4.0,I like all of Sierra Nevada's beers but felt ...,2009-10-05 21:34:31
16,8.0,36179,3818,...,3.0,"The aroma is papery with citrus, yeast, and s...",2009-10-05 21:43:23
...,...,...,...,...,...,...,...
49947,7.0,709,199,...,4.0,12oz into my pint glass.\t\tA: Golden honey a...,2009-12-25 08:48:11
49984,6.0,38388,3718,...,4.5,"I love this beer, seek it out when I am in Mo...",2009-12-25 16:25:45
49996,8.0,7971,863,...,4.0,This is a big hoppy monster of an IPA..If you...,2009-12-25 17:23:24


This is quite powerful. Any method that returns a boolean array is potentially an indexer.

# isin

Useful for seeing if a value is contained in a collection.

In [91]:
# Count reviews per beer: the index holds the beer ids, the values the counts.
beer_ids = df['beer_id'].value_counts()
beer_ids

1904     240
53863    208
52441    158
        ... 
41285      1
47430      1
53274      1
Name: beer_id, dtype: int64

In [92]:
# `value_counts` sorts by count, descending, so the first three index entries
# are the three most-reviewed beer ids. Slice the Index itself (always
# positional) rather than `beer_ids[0:3]`, which is easy to misread as a label
# slice because the Series is indexed by integer beer ids.
df[df.beer_id.isin(beer_ids.index[:3])]

Unnamed: 0,abv,beer_id,brewer_id,...,review_taste,text,time
142,8.6,52441,147,...,4.5,Poured in to a chimay goblet.\t\tPours black ...,2009-10-06 00:10:46
446,8.6,52441,147,...,3.5,Pours pitch black and completely opaque witho...,2009-10-06 07:59:01
714,8.6,52441,147,...,3.5,Near black pour with a mountainous 3 finger h...,2009-10-06 21:31:06
...,...,...,...,...,...,...,...
49715,6.8,1904,140,...,2.5,"I've tried this beer on tap about a year ago,...",2009-12-25 01:13:38
49844,8.6,52441,147,...,4.0,thanks to colonelforbin for this one!\t\tPour...,2009-12-25 04:10:49
49863,5.5,53863,28,...,4.0,12 oz bottle poured into a pint glass \t\tPou...,2009-12-25 04:38:07


# Positional Indexing

In [99]:
# First five rows, for reference when reading the positional selections below.
df.head()

Unnamed: 0,abv,beer_id,brewer_id,...,review_taste,text,time
0,7.0,2511,287,...,4.5,Batch 8144\tPitch black in color with a 1/2 f...,2009-10-05 21:31:48
1,5.7,19736,9790,...,4.0,Sampled from a 12oz bottle in a standard pint...,2009-10-05 21:32:09
2,4.8,11098,3182,...,3.5,Haystack yellow with an energetic group of bu...,2009-10-05 21:32:13
3,9.5,28577,3818,...,4.0,"The aroma has pine, wood, citrus, caramel, an...",2009-10-05 21:32:37
4,5.8,398,119,...,3.0,A: Pours a slightly hazy golden/orange color....,2009-10-05 21:33:14


In [101]:
# Pick rows by integer position; `:` keeps every column.
df.iloc[[2, 5, 10], :]

Unnamed: 0,abv,beer_id,brewer_id,...,review_taste,text,time
2,4.8,11098,3182,...,3.5,Haystack yellow with an energetic group of bu...,2009-10-05 21:32:13
5,7.0,966,365,...,4.5,"From notes. Pours black, thin mocha head fade...",2009-10-05 21:33:48
10,11.8,43670,423,...,4.5,"Burnt amber in color with a 1/4"" head. Aroma ...",2009-10-05 21:36:03


In [103]:
# Rows at positions 2, 5 and 10; the first three columns by position.
df.iloc[[2, 5, 10], :3]

Unnamed: 0,abv,beer_id,brewer_id
2,4.8,11098,3182
5,7.0,966,365
10,11.8,43670,423


In [105]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. To mix positional
# rows with labelled columns, translate the positions to labels through
# `df.index` and hand everything to `.loc`.
df.loc[df.index[[2, 5, 10]], ['beer_id', 'time']]

Unnamed: 0,beer_id,time
2,11098,2009-10-05 21:32:13
5,966,2009-10-05 21:33:48
10,43670,2009-10-05 21:36:03


# Hierarchical Indexing

One of the most powerful and most complicated features of pandas.
Lets you represent high-dimensional datasets in a table.

In [93]:
# Build a three-level row index (reviewer, beer, timestamp) and sort it.
reviews = (
    df
    .set_index(['profile_name', 'beer_id', 'time'])
    .sort_index()
)
reviews.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,abv,brewer_id,beer_name,...,review_palate,review_taste,text
profile_name,beer_id,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
01121987,29077,2009-11-30 03:44:42,9.0,11256,Corne De Brume,...,4,3.5,"Poured into a belgian beer glass, not great h..."
05Harley,1307,2009-10-06 00:10:06,8.5,428,Der Weisse Bock,...,4,4.0,Can't find the date on this one.\t\tPurchased...
05Harley,2732,2009-12-12 01:21:36,8.0,287,Bell's Consecrator Doppelbock,...,4,4.5,Bottle # 8881 (02/09)\t\tPurchased through We...
05Harley,2899,2009-10-20 22:27:01,7.1,911,Andechser Doppelbock Dunkel,...,4,5.0,Bottle # 300310\t\tPurchased through Kracked ...
05Harley,3054,2009-11-21 02:17:41,5.0,946,Piton Lager Beer,...,2,3.0,Bottled in 2007.\t\tPurchased in St. Lucia @ ...


### Why is this useful?

In [94]:
# Count reviews per reviewer from the 'profile_name' index level and keep the
# five most frequent (value_counts orders by count, descending).
top_reviewers = (
    reviews.index
    .get_level_values('profile_name')
    .value_counts()
    .head(n=5)
)
top_reviewers

drabmuh           242
corby112          230
BeerFMAndy        202
northyorksammy    201
mrmanning         187
Name: profile_name, dtype: int64

In [95]:
# Select every review written by the top reviewers. The row selector is one
# tuple with an entry per index level (`slice(None)` means "everything" at
# that level), and the trailing `:` names the column axis explicitly so the
# selectors cannot be mistaken for a row/column pair.
reviews.loc[(top_reviewers.index, slice(None), slice(None)), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,abv,brewer_id,beer_name,...,review_palate,review_taste,text
profile_name,beer_id,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BeerFMAndy,92,2009-12-24 21:51:46,7.20,147,Arrogant Bastard Ale,...,4.0,4.0,22 oz bottle poured into a Sierra Nevada Impe...
BeerFMAndy,100,2009-10-22 03:39:21,5.70,306,Blue Moon Harvest Moon Pumpkin Ale,...,2.5,2.0,12 oz bottle poured into an Imperial Pint. Ma...
BeerFMAndy,101,2009-12-06 00:44:10,5.50,35,Samuel Adams Winter Lager,...,4.0,3.5,"12 oz bottle poured into an Imperial pint. ""A..."
...,...,...,...,...,...,...,...,...,...
northyorksammy,54916,2009-12-21 14:46:56,3.25,2097,Kuhnhenn Wild Child,...,3.0,2.5,2009 WinterSolstice. Some significant butterc...
northyorksammy,54918,2009-12-21 14:56:29,12.00,2097,Kuhnhenn Bonicci Barley Wine,...,3.5,3.5,"Winner of the amateur brewing fest,Jerry Boni..."
northyorksammy,54919,2009-12-21 14:59:39,7.00,2097,Kuhnhenn Foreign Export Stout,...,4.0,3.5,"Thick stout, a study in chocolate. Very smoot..."


### Specifying multi-axis indexers

In [96]:
# Short alias: `idx[a, b, :]` builds a per-level row selector for `.loc`.
idx = pd.IndexSlice

In [97]:
# Rows: top reviewers x the two beer ids x any timestamp. Columns: name and style.
reviews.loc[
    idx[top_reviewers.index, [92, 54919], :],
    ['beer_name', 'beer_style'],
]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beer_name,beer_style
profile_name,beer_id,time,Unnamed: 3_level_1,Unnamed: 4_level_1
BeerFMAndy,92,2009-12-24 21:51:46,Arrogant Bastard Ale,American Strong Ale
northyorksammy,54919,2009-12-21 14:59:39,Kuhnhenn Foreign Export Stout,Foreign / Export Stout


In [98]:
# Same selection written as a query string: index level names can be used
# like columns, and `@` refers to a Python variable.
(
    reviews
    .query('profile_name in @top_reviewers.index & beer_id in [92,54919]')
    .loc[:, ['beer_name', 'beer_style']]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beer_name,beer_style
profile_name,beer_id,time,Unnamed: 3_level_1,Unnamed: 4_level_1
BeerFMAndy,92,2009-12-24 21:51:46,Arrogant Bastard Ale,American Strong Ale
northyorksammy,54919,2009-12-21 14:59:39,Kuhnhenn Foreign Export Stout,Foreign / Export Stout
