# Ratebeer Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the cleaned RateBeer review table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle('ratebeer_clean.pkl')

In [3]:
# Preview the first 20 rows to check the columns and value formats.
df.head(20)

Unnamed: 0,beerID,beer_name,abv,abv_listed,brewerID,beer_style,reviewer_username,review_appearance,review_aroma,review_palate,review_taste,review_overall,review_text,review_unix_time
0,63836,John Harvards Simcoe IPA,5.4,True,8481,India Pale Ale &#40;IPA&#41;,hopdog,4,6,3,6,13,"On tap at the Springfield, PA location. Poure...",1157587200
1,63836,John Harvards Simcoe IPA,5.4,True,8481,India Pale Ale &#40;IPA&#41;,TomDecapolis,4,6,4,7,13,On tap at the John Harvards in Springfield PA...,1157241600
2,71716,John Harvards Cristal Pilsner,5.0,True,8481,Bohemian Pilsener,PhillyBeer2112,4,5,3,6,14,"UPDATED: FEB 19, 2003 Springfield, PA. I've n...",958694400
3,64125,John Harvards Fancy Lawnmower Beer,5.4,True,8481,Kölsch,TomDecapolis,2,4,2,4,8,On tap the Springfield PA location billed as ...,1157587200
4,64125,John Harvards Fancy Lawnmower Beer,5.4,True,8481,Kölsch,hopdog,2,4,2,4,8,"On tap at the Springfield, PA location. Poure...",1157587200
5,31544,John Harvards Vanilla Black Velvet Stout,6.0,False,8481,Sweet Stout,egajdzis,5,8,4,7,16,"Springfield, PA location... Poured an opaque ...",1077753600
6,71714,John Harvards American Brown Ale,6.0,False,8481,Brown Ale,hopdog,4,5,3,6,12,"On tap at the Springfield, PA location. Liste...",1176076800
7,71719,John Harvards Grand Cru,7.0,True,8481,Belgian Ale,JFGrind,2,6,3,7,14,"Sampled @ the Springfield, PA location. Can...",1107302400
8,71719,John Harvards Grand Cru,7.0,True,8481,Belgian Ale,egajdzis,4,8,3,7,16,Springfield... Poured a hazy copper color wit...,1102896000
9,71719,John Harvards Grand Cru,7.0,True,8481,Belgian Ale,PhillyBeer2112,3,8,4,8,17,"UPDATED: FEB 19, 2003 Springfield, PA. Darkis...",996105600


We extract the day of the week, the month, and the day of year from each review timestamp. I'm not sure whether day of year will help, but it approximately measures the number of days after Christmas. I'm keeping it in for now on the chance it might be useful.

I used the US Central time zone to calculate these, as most of the reviews are from the United States.

In [5]:
from datetime import datetime
from pytz import timezone
import pytz

# All calendar features are expressed in US Central wall-clock time.
central = timezone('US/Central')

# Convert the epoch-second timestamps to timezone-aware datetimes ONCE and
# derive every feature from that single Series. This replaces three row-wise
# ``apply`` passes that each repeated the same UTC -> Central conversion, and
# avoids ``datetime.utcfromtimestamp``, which is deprecated since Python 3.12.
review_time_central = (
    pd.to_datetime(df['review_unix_time'], unit='s', utc=True)
      .dt.tz_convert(central)
)

# Full weekday name, e.g. "Tuesday".
df['weekday_central'] = review_time_central.dt.strftime("%A")

# 1-based day of the year; cast to int64 so the stored dtype matches what the
# previous row-wise implementation produced.
df['day_of_year'] = review_time_central.dt.dayofyear.astype('int64')

# Zero-padded month as a string ("01".."12"), same format as before;
# downstream code casts it to an integer.
df['month'] = review_time_central.dt.strftime("%m")







In [6]:
# Spot-check the conversion: render one review timestamp (epoch seconds) as
# US Central wall-clock time.
test = timezone('US/Central')
# fromtimestamp(..., tz=pytz.utc) builds the aware UTC datetime directly; it
# replaces the deprecated utcfromtimestamp() + localize() combination.
datetime.fromtimestamp(958694400, tz=pytz.utc).astimezone(test).strftime("%Y-%m-%d %H:%M:%S")



'2000-05-18 19:00:00'

In [7]:
# Season/temperature proxy: number of calendar months between the review's
# month and July (0 for July, up to 6 for January).
df['distance_from_july'] = (7 - df['month'].astype('int64')).abs()

In [8]:
# In a typical production situation, you would be making predictions based on past data, so we
# create train and test sets by splitting on a date. All test data is after that date.

# I have used 10% of the data as test data. As we'll see later, the training set needs to be split
# again, so I want it to be fairly large.

In [9]:
# Order the reviews chronologically (oldest first) so a positional split is a split in time.
# sort_values returns a new frame; `df` itself keeps its original row order.
sorted_df=df.sort_values(by='review_unix_time')

In [10]:
# Non-null count per column; also shows the total number of rows available for the split.
df.count()

beerID                2924163
beer_name             2924163
abv                   2924163
abv_listed            2924163
brewerID              2924163
beer_style            2924163
reviewer_username     2924163
review_appearance     2924163
review_aroma          2924163
review_palate         2924163
review_taste          2924163
review_overall        2924163
review_text           2924163
review_unix_time      2924163
weekday_central       2924163
day_of_year           2924163
month                 2924163
distance_from_july    2924163
dtype: int64

In [11]:
# 90% of the total row count; the integer part is the train/test split position.
2924163*0.9

2631746.7

In [12]:
# Chronological 90/10 split by row position in the time-sorted frame.
# The split index is derived from the data instead of being hard-coded, so the
# cell stays correct if the input changes. iloc slicing is used because
# np.split on a DataFrame goes through the deprecated DataFrame.swapaxes.
# NOTE(review): reviews sharing the boundary timestamp can fall on both sides
# of the split - confirm this is acceptable.
split_at = int(len(sorted_df) * 0.9)
train_df = sorted_df.iloc[:split_at]
test_df = sorted_df.iloc[split_at:]

In [13]:
# Sanity check: per-column non-null counts for the training split.
train_df.count()

beerID                2631746
beer_name             2631746
abv                   2631746
abv_listed            2631746
brewerID              2631746
beer_style            2631746
reviewer_username     2631746
review_appearance     2631746
review_aroma          2631746
review_palate         2631746
review_taste          2631746
review_overall        2631746
review_text           2631746
review_unix_time      2631746
weekday_central       2631746
day_of_year           2631746
month                 2631746
distance_from_july    2631746
dtype: int64

In [14]:
# Sanity check: per-column non-null counts for the test split.
test_df.count()

beerID                292417
beer_name             292417
abv                   292417
abv_listed            292417
brewerID              292417
beer_style            292417
reviewer_username     292417
review_appearance     292417
review_aroma          292417
review_palate         292417
review_taste          292417
review_overall        292417
review_text           292417
review_unix_time      292417
weekday_central       292417
day_of_year           292417
month                 292417
distance_from_july    292417
dtype: int64

In [15]:
# First rows of the training split, i.e. the earliest reviews in the sorted data.
train_df.head()

Unnamed: 0,beerID,beer_name,abv,abv_listed,brewerID,beer_style,reviewer_username,review_appearance,review_aroma,review_palate,review_taste,review_overall,review_text,review_unix_time,weekday_central,day_of_year,month,distance_from_july
2291588,121,Stone City Hefeweizen,6.0,False,23,German Hefeweizen,lazarus99,5,8,4,10,20,I love this kind of beer. It seems to be the ...,955497600,Tuesday,102,4,3
1651460,132,Newcastle Brown Ale,4.7,True,751,Brown Ale,billb,5,6,5,9,17,This has to be one of my favorite drinking be...,955497600,Tuesday,102,4,3
2146614,286,Worthington White Shield,5.6,True,12871,Premium Bitter/ESB,wade,3,10,4,9,18,Excellent ale with a strong aroma and initial...,955843200,Saturday,106,4,3
2182000,544,Saranac Pale Ale,5.5,True,92,English Pale Ale,billb,4,7,5,9,17,Pretty tasty for a pale ale. This is one of ...,956448000,Saturday,113,4,3
2171356,547,Saranac Black and Tan,5.1,True,92,Stout,billb,3,7,2,6,10,I a big fan of Black and Tans but this one is...,956448000,Saturday,113,4,3


In [16]:
# First rows of the test split, i.e. the rows immediately after the split position.
test_df.head()

Unnamed: 0,beerID,beer_name,abv,abv_listed,brewerID,beer_style,reviewer_username,review_appearance,review_aroma,review_palate,review_taste,review_overall,review_text,review_unix_time,weekday_central,day_of_year,month,distance_from_july
2081618,136550,Lost Abbey Sinners 10,8.0,True,7043,Sour Ale/Wild Ale,MrChopin,2,5,2,6,9,Bottle opens with the sound of repressed fart...,1306713600,Sunday,149,5,2
1729409,23373,Brewbakers Surf City Blonde Ale,6.0,False,390,Golden Ale/Blond Ale,bartalone,3,3,2,5,7,"Not a strong flavor mild, very very mild mu...",1306713600,Sunday,149,5,2
2281210,48725,Port Washington Hop-2-It IPA,6.0,True,1336,India Pale Ale &#40;IPA&#41;,nuplastikk,3,7,3,7,14,Sample at Hops Haven. Hazy orange tan. Thin...,1306713600,Sunday,149,5,2
1122846,101199,Founders Cerise,6.5,True,554,Fruit Beer,hoosiers1994,4,6,3,6,11,12 oz bottle poured into a tulip. Aroma of b...,1306713600,Sunday,149,5,2
1238760,145953,Muirhouse Magnum Mild,4.5,True,11110,Mild Ale,maeib,4,8,3,7,15,Cask conditioned\tgravity dispense\tNorthants...,1306713600,Sunday,149,5,2


In [17]:
# Persist the training split next to the notebook for the modelling steps.
train_df.to_pickle('./train_set.pkl')

In [18]:
# Persist the held-out test split next to the notebook.
test_df.to_pickle('./test_set.pkl')