In [71]:
from datetime import datetime
import dateparser
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split

In [4]:
beers = pd.read_csv('../data/beer_info.csv')
reviews = pd.read_csv('../data/beer_reviews.csv')

In [9]:
beers.set_index('beer_id',inplace=True)
reviews.set_index('review_id',inplace=True)

In [16]:
reviews.head()

Unnamed: 0_level_0,beer_id,posted,ratings,score,text,username
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,"Nov 07, 2014",[],4.4,Growler fill from the Ohio Taproom. Take advan...,Swettenham_Shire
2,1,"Nov 08, 2018","[3.75, 3.25, 4.5, 4.25, 4.25]",4.08,"L: Gold, transparent. Gorgeous head with nice ...",KurtisCarman
3,1,"Oct 24, 2018","[4.0, 4.75, 4.5, 4.5, 4.5]",4.53,This is a good Fresh Hop ale! Great dank taste...,ChilliHeights
4,1,"Oct 21, 2016","[4.5, 5.0, 4.75, 4.5, 4.75]",4.77,"Bottle dated 10/17/16, poured into tulip. Mild...",jsidonis7
5,1,"Dec 18, 2016","[4.0, 4.5, 4.5, 4.0, 4.0]",4.32,"12 ounce bottle into tulip glass, bottled on 1...",superspak


In [11]:
###Cleaning Functions###

#Clean Ratings from reviews
# if string characters appears, than user did not give a ratings breakdown. so clear the line.
def clean_ratings(col):
    r = re.compile('characters')
    ratings = col
    ratings = ['' if r.search(rating) else rating for rating in ratings]
    r = re.compile('look|smell|taste|feel|overall|: ')
    ratings = [r.sub('',rating).split(' | ') for rating in ratings]
    ratings = [ra if ra == [''] else list(map(float, ra)) for ra in ratings]
    return ratings

# Clean posted date from reviews
def clean_dates(col):
    posted = col
    def parse_dates(text, origin):
        #Parses absolute and relative dates    
        try:
            date = datetime.strptime(text, '%b %d, %Y')
        except ValueError:
            date = dateparser.parse(text, settings={'RELATIVE_BASE': origin})
        return date
    scraping_date = dateparser.parse('Jul 3, 2019')
    post_cleaned = [parse_dates(post, scraping_date) for post in posted]
    return post_cleaned

In [19]:
reviews.ratings = clean_ratings(reviews.ratings)
reviews.posted = clean_dates(reviews.posted)
reviews.head(2)

Unnamed: 0_level_0,beer_id,posted,ratings,score,text,username
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,2014-11-07,[],4.4,Growler fill from the Ohio Taproom. Take advan...,Swettenham_Shire
2,1,2018-11-08,"[3.75, 3.25, 4.5, 4.25, 4.25]",4.08,"L: Gold, transparent. Gorgeous head with nice ...",KurtisCarman


In [47]:
# Clean text data
rev_text = reviews.text
rev_text = rev_text.apply(lambda x: re.sub('[^\w\s]',' ',str(x).lower())).apply(lambda x: re.sub('\s+',' ',x))
rev_text = rev_text.apply(lambda x: x.strip())

In [50]:
reviews_df = pd.DataFrame({'id': pd.Series(reviews.beer_id),
                          'text':pd.Series(rev_text)})

reviews_df.head(4)

Unnamed: 0_level_0,id,text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,growler fill from the ohio taproom take advant...
2,1,l gold transparent gorgeous head with nice lac...
3,1,this is a good fresh hop ale great dank taste ...
4,1,bottle dated 10 17 16 poured into tulip mild m...


In [123]:
testing = pd.DataFrame({'id':pd.Series(reviews.beer_id),
                       'text':pd.Series(reviews.text),
                       'score':pd.Series(reviews.score)})

In [124]:
yakima_fresh_hop = testing[testing.id==1]

In [125]:
yakima_fresh_hop.head()

Unnamed: 0_level_0,id,text,score
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,Growler fill from the Ohio Taproom. Take advan...,4.4
2,1,"L: Gold, transparent. Gorgeous head with nice ...",4.08
3,1,This is a good Fresh Hop ale! Great dank taste...,4.53
4,1,"Bottle dated 10/17/16, poured into tulip. Mild...",4.77
5,1,"12 ounce bottle into tulip glass, bottled on 1...",4.32


In [174]:
#NLP functions
def stemmer(text):
    text = re.sub('[^a-zA-Z]',' ',text).lower().split()    
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    text = ' '.join(text)
    return text

In [None]:
cat_ipa = list(beers[beers.beer_style=='American IPA'].index)
ipa_revs = testing[testing.id.isin(cat_ipa)]

In [189]:
corpus = len(ipa_revs.text)*[None]
#181149
for i in range(len(ipa_revs.text)):
    if i%100==0:
        print(i,end='\t')
    corpus[i] = stemmer(rev)
    
#cv = CountVectorizer(max_features = 1500)
#X = cv.fit_transform(corpus).toarray()
#y = revs.values

0	100	200	300	400	500	600	700	800	900	1000	1100	1200	1300	1400	1500	1600	1700	1800	1900	2000	2100	2200	2300	2400	2500	2600	2700	2800	2900	3000	3100	3200	3300	3400	3500	3600	3700	3800	3900	4000	4100	4200	4300	4400	4500	4600	4700	4800	4900	5000	5100	5200	5300	5400	5500	5600	5700	5800	5900	6000	6100	6200	6300	6400	6500	6600	6700	6800	6900	7000	7100	7200	7300	7400	7500	7600	7700	7800	7900	8000	8100	8200	8300	8400	8500	8600	8700	8800	8900	9000	9100	9200	9300	9400	9500	9600	9700	9800	9900	10000	10100	10200	10300	10400	10500	10600	10700	10800	10900	11000	11100	11200	11300	11400	11500	

KeyboardInterrupt: 

In [182]:
len(ipa_revs.text)

181149

In [None]:
for rev in ipa_revs.text:

In [106]:
revs = yakima_fresh_hop.text
corpus = []
for rev in revs:
    corpus.append(stemmer(rev))


In [118]:
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = revs.values

In [159]:
ipa_revs

[           id                                               text  score
 review_id                                                              
 1           1  Growler fill from the Ohio Taproom. Take advan...   4.40
 2           1  L: Gold, transparent. Gorgeous head with nice ...   4.08
 3           1  This is a good Fresh Hop ale! Great dank taste...   4.53
 4           1  Bottle dated 10/17/16, poured into tulip. Mild...   4.77
 5           1  12 ounce bottle into tulip glass, bottled on 1...   4.32
 6           1  Appearance: Pours clear and amber, with a thre...   4.02
 7           1  Clear sparkling gold with a large fluffy white...   3.97
 8           1  Pours a dark orange color with decent clarity....   3.84
 9           1  12 FLUID OZ bottle\nServed in a tulip glass\n\...   4.35
 10          1  Half-growler filled at Rozi's Wine House in La...   4.25
 11          1  12oz bottle dated 10/12/18 poured into a Teku\...   4.35
 12          1  12oz Stubby\n\nBig ol' thanks to Da