In [1]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# load the beer review CSV from the local data folder into the DataFrame `df`
df = pd.read_csv('data/beer_reviews_data.csv')

# preview 10 randomly drawn rows; the fixed random_state keeps the draw repeatable across runs
df.sample(10, random_state=42)

Unnamed: 0,beer_ABV,beer_beerId,beer_brewerId,beer_name,beer_style,review_appearance,review_palette,review_overall,review_taste,review_profileName,review_aroma,review_text,review_time
116087,5.8,234,76,Hacker-Pschorr Oktoberfest-M�rzen,M�rzen / Oktoberfest,4.0,3.5,4.0,4.5,divineaudio,4.5,pint glass. a - pours a clear amber liquid wit...,1317698982
218685,8.7,1446,158,Hibernation Ale,Old Ale,4.5,4.5,4.0,3.5,Cylinsier,3.0,From a 12 oz. Pours a maple syrup brown that g...,1262818901
63854,11.0,74491,16866,4 Calling Birds,Belgian Strong Dark Ale,4.0,3.0,3.0,3.5,womencantsail,3.0,On-tap at the Bruery Almost black with a dense...,1321823635
234370,5.0,43341,1628,Southampton Altbier,Altbier,2.5,4.0,4.0,3.5,DerekP,3.5,"According to the label, a ""Dusseldorf-Style Br...",1280711924
501494,9.0,40187,132,XS Imperial Red,American Amber / Red Ale,4.0,4.0,3.0,3.0,Gueuzedude,3.5,"Sampled on Tap at 1702, Tucson AZ; August 2008...",1219009004
246729,9.0,2512,215,Chimay Grande R�serve (Blue),Belgian Strong Dark Ale,3.5,5.0,4.5,4.0,mentor,4.0,Three finger dead tan head over a cloudy dark ...,1195591313
489422,5.3,353,132,Mocha Porter,American Porter,4.5,4.0,4.0,4.0,Stubbie1,4.0,"Appearance: Poured black, finished with a nice...",1041482821
243405,11.0,7975,48,Bi�re Du Boucanier Golden,Belgian Strong Pale Ale,3.5,4.5,3.0,3.0,Infamous7100,3.5,"Consumed this from a snifter, as you should wi...",1239405638
4124,8.4,52211,14879,Frog's Hollow Double Pumpkin Ale,Pumpkin Ale,4.0,3.5,3.0,3.5,Huhzubendah,3.5,Bottle shared by Chaney. Thanks Brandon! A: Th...,1286322675
479954,5.9,17300,9529,Pale Ale,American Pale Ale (APA),4.5,4.0,3.5,4.0,belgaridub,4.5,This came to the table a dark red-brown color....,1147897647


In [2]:
# column-level overview of the dataset: dtype and non-null count per column, plus total memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528870 entries, 0 to 528869
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   beer_ABV            508590 non-null  float64
 1   beer_beerId         528870 non-null  int64  
 2   beer_brewerId       528870 non-null  int64  
 3   beer_name           528870 non-null  object 
 4   beer_style          528870 non-null  object 
 5   review_appearance   528870 non-null  float64
 6   review_palette      528870 non-null  float64
 7   review_overall      528870 non-null  float64
 8   review_taste        528870 non-null  float64
 9   review_profileName  528755 non-null  object 
 10  review_aroma        528870 non-null  float64
 11  review_text         528751 non-null  object 
 12  review_time         528870 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 52.5+ MB


In [3]:
# descriptive statistics for all numerical columns, flipped so each column becomes one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
beer_ABV,508590.0,7.017442,2.20446,0.01,5.3,6.5,8.5,57.7
beer_beerId,528870.0,22098.47,22158.28,3.0,1745.0,14368.0,40528.0,77310.0
beer_brewerId,528870.0,2598.423,5281.805,1.0,132.0,394.0,1475.0,27980.0
review_appearance,528870.0,3.864522,0.6040104,0.0,3.5,4.0,4.0,5.0
review_palette,528870.0,3.758926,0.6853349,1.0,3.5,4.0,4.0,5.0
review_overall,528870.0,3.833197,0.7099618,0.0,3.5,4.0,4.5,5.0
review_taste,528870.0,3.765993,0.6690176,1.0,3.5,4.0,4.0,5.0
review_aroma,528870.0,3.81735,0.718903,1.0,3.5,4.0,4.5,5.0
review_time,528870.0,1224885000.0,76056000.0,884390400.0,1174613000.0,1240366000.0,1288560000.0,1326277000.0


In [4]:
# lastly, the dimensionality of the dataset as a (rows, columns) tuple
df.shape

(528870, 13)

### Dimensionality Results

+ This dataset contains 528,870 beer reviews
+ It has 13 columns

### Another question

+ How many unique users make up the 528,870 reviews?

### Question 1: Which breweries produce the strongest beer?

#### Approach

+ In order to measure the strongest beers, we need to calculate the average ABV (Alcohol by Volume)
+ Therefore, group by the brewer ID, calculate the mean beer ABV and sort it in descending order
+ Then, using the .rank() function, we can filter for the top 3 strongest breweries

In [13]:
# Average alcohol by volume (ABV) per brewery, ranked from strongest to weakest.
# Result: df_abv with the columns beer_brewerId, avg_ABV (rounded to 2 decimals) and rank.
# NOTE(review): every row of df is a single review, so this mean is weighted by the number of
# reviews each beer received; de-duplicate on beer_beerId first if every beer should count once.
df_abv = (
    # grouping the dataset by brewery id (beer_brewerId) and averaging the ABV of its rows
    df.groupby('beer_brewerId')['beer_ABV']
    .mean()
    # breweries without any recorded ABV produce a NaN mean; they are set to 0 so they rank last
    .fillna(0)
    # renaming the column for better readability
    .rename('avg_ABV')
    .reset_index()
    # ranking on the unrounded averages: ranking after rounding would tie breweries whose
    # averages only differ beyond the second decimal and could let extra rows into the top 3
    .assign(rank=lambda frame: frame['avg_ABV'].rank(method='dense', ascending=False))
    .sort_values(by='avg_ABV', ascending=False)
    # rounding to the second decimal point only at the end, purely for display
    .assign(avg_ABV=lambda frame: frame['avg_ABV'].round(2))
)

# displaying the 3 breweries with the strongest average ABV
df_abv[df_abv['rank'] <= 3]

Unnamed: 0,beer_brewerId,avg_ABV,rank
784,6513,19.23,1.0
175,736,13.75,2.0
1644,24215,12.47,3.0
