In [None]:
import json
import os
import re

import bs4
import pymongo
from requests import Session

# Define our wines

In [None]:
# Vintages to look up: 2004 through 2015 (the upper bound of range() is exclusive).
years = range(2004, 2016)

# Wines grouped by area; every row is (name, lwin identifier).
_WINES_BY_AREA = {
    "Bordeaux": [
        ("Haut Brion", 1011247),
        ("Lafite Rothschild", 1011872),
        ("Latour", 1012316),
        ("Margaux", 1012781),
        ("Mouton Rothschild", 1013544),
    ],
    "Burgundy": [
        ("Rousseau, Chambertin", 1057005),
        ("Vogue, Musigny Vv", 1026872),
        ("Grivot, Clos Vougeot", 1035580),
        ("Lambrays, Clos Lambrays", 1040290),
        ("Ponsot, Clos Roche Vv", 1051508),
    ],
    "Southern Rhone": [
        ("Beaucastel, Chateauneuf Du Pape", 1108387),
        ("Clos Papes, Chateauneuf Du Pape", 1110487),
        ("Janasse, Chateauneuf Du Pape Vv", 1113970),
        ("Pegau, Chateauneuf Du Pape Reservee", 1115118),
        ("Vieux Telegraphe, Chateauneuf Du Pape", 1118076),
    ],
    "Northern Rhone": [
        ("Chapoutier, Ermitage Pavillon", 1109704),
        ("Domaine Jean Louis Chave, Hermitage", 1110012),
        ("Guigal, Cotes Du Rhone", 1113101),
        ("Jaboulet, Hermitage Chapelle", 1113563),
        ("Cote Rotie Ampuis", 1113172),
    ],
}

# Flatten the table into one dict per wine, keeping the area order above.
wines = [
    {"name": wine_name, "area": area_name, "lwin": lwin_code}
    for area_name, area_rows in _WINES_BY_AREA.items()
    for wine_name, lwin_code in area_rows
]

## Perform searches and get data

In [None]:
def clean_search_string(string):
    """Turn a wine name into a lower-case, ``+``-separated search term.

    Every character that is neither a word character nor a space is removed,
    then each space becomes ``+`` and the result is lower-cased.
    """
    # Inside a character class "(", "|" and ")" are literal characters, not
    # grouping/alternation, so they must not be listed or they would be kept.
    s = re.sub(r"[^\w ]", "", string)
    return s.replace(" ", "+").lower()

def clean_review_text(string):
    """Flatten a review note onto one line.

    Newline characters are deleted (not replaced by spaces) and each U+0085
    character is rewritten as three dots.
    """
    single_line = string.replace("\n", "")
    return single_line.replace("\x85", "...")

def get_html(url, session):
    """Fetch ``url`` through ``session`` and return the parsed page.

    A response whose status code is not 200 has its URL and body printed and
    then raises ``ValueError`` carrying the status code.
    """
    response = session.get(url)
    if response.status_code != 200:
        # show what came back before giving up
        print(response.url)
        print(response.content)
        raise ValueError(response.status_code)
    return bs4.BeautifulSoup(response.content, "html.parser")

def get_info(b):
    """Pull the average rating and the price out of a parsed wine page.

    Parameters
    ----------
    b : parsed page exposing ``find(tag, attrs)`` (e.g. a BeautifulSoup document)

    Returns
    -------
    tuple
        ``(rating, price)``. Each entry is a float, or ``None`` when the
        element is missing, empty, or does not hold a parseable number.
    """
    # get review data
    try:
        rating = float(b.find("span", {"itemprop": "ratingValue"}).contents[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # no span (AttributeError), empty span (IndexError), or content that
        # float() cannot convert (TypeError / ValueError)
        rating = None

    # get pricing data
    try:
        p = b.find("span", {"class": "dtlbl sidepanel-text"}).text.strip().replace(",", "")
        # the number is whatever follows the first pound sign
        price = float(p.split("£")[1])
    except (AttributeError, IndexError, ValueError):
        # no span, no pound sign in the text, or a non-numeric amount
        price = None

    return (rating, price)

def _parse_review(rev):
    """Build one review record from a ``list-row-inner`` element.

    Returns a dict with ``rating`` (min / max / score as floats), ``reviewer``
    (name / url) and ``text`` (cleaned note, or ``None`` when there is no
    note), or ``None`` when the rating or reviewer markup is missing.
    """
    try:
        review = {
            "rating": {
                "min": float(rev.find("meta", {"itemprop": "worstRating"})["content"]),
                "max": float(rev.find("meta", {"itemprop": "bestRating"})["content"]),
                "score": float(rev.find("meta", {"itemprop": "ratingValue"})["content"])
            },
            "reviewer": {
                "name": rev.find("span", {"itemprop": "name"}).text,
                "url": rev.find("a")["href"]
            }
        }
    except (TypeError, AttributeError):
        # find() returned None for a required element; skip this row instead
        # of aborting the whole scrape
        return None

    # the tasting note itself is optional
    try:
        review["text"] = clean_review_text(rev.find("div", {"class": "crt-note"}).text)
    except AttributeError:
        review["text"] = None
    return review


# One record per (wine, year) is accumulated here.
review_data = []
headers = {
    "User-Agent":
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36"
}
s = Session()
s.headers.update(headers)
base_url = "http://www.wine-searcher.com/find/{name}/{year}"
for wine in wines:
    for year in years:
        url = base_url.format(name=clean_search_string(wine["name"]), year=year)

        # a single fetch of the search page serves the rating, the price and
        # the link to the review page
        soup = get_html(url, s)
        avg_rating, price = get_info(soup)

        # get url for review page: the first link inside the second "s-tab" div
        link_div = soup.find_all("div", {"class": "s-tab"})[1]
        review_url = link_div.find("a")["href"]

        # pull out reviews
        soup = get_html(review_url, s)
        review_html = soup.find_all("div", {"class": "list-row-inner"})

        # basic wine information plus every review row that could be parsed
        wine_info = {
            "lwin": wine["lwin"],
            "name": wine["name"],
            "region": wine["area"],
            "year": year,
            "price": price,
            "avg_rating": avg_rating,
            "reviews": [
                parsed for parsed in map(_parse_review, review_html)
                if parsed is not None
            ]
        }

        review_data.append(wine_info)

In [None]:
# write out text file
# Save the scraped records as JSON so later cells can reload them from disk.
with open("review_data.json", "w") as json_file:
    json.dump(review_data, json_file)

# Load review data into MongoDB

In [None]:
# The connection string is taken from the MONGODB_URI environment variable
# when it is set.
# NOTE(review): the fallback below still embeds a username and password in the
# notebook; rotate that account and delete the fallback once MONGODB_URI is
# configured wherever this notebook runs.
mongo_uri = os.environ.get(
    "MONGODB_URI",
    "mongodb://group:group@ds029635.mlab.com:29635/fods-seven",
)
client = pymongo.MongoClient(mongo_uri)
db = client["fods-seven"]

# Start from an empty collection so re-running the cell does not duplicate data.
db.drop_collection("reviews")

# insert_many() adds an "_id" key to every dict it is handed, so pass shallow
# copies: review_data then stays unchanged and JSON-serialisable. It also
# rejects an empty list, hence the guard.
if review_data:
    db.reviews.insert_many([dict(record) for record in review_data])

# Try out some text mining

In [None]:
# Reload the scraped records from the JSON file written earlier.
with open("review_data.json", "r") as json_file:
    review_data = json.load(json_file)

def clean_text(text, stops):
    """Split a review note into lower-case word tokens.

    Parameters
    ----------
    text : str
        Review text.
    stops : container of str
        Words to leave out of the result.

    Returns
    -------
    list of str
        Tokens in their original order, without empty strings, stop words,
        digits or punctuation.
    """
    # Drop the "full tasting notes" boilerplate outright. Rewriting it to a
    # "(truncated)" marker and filtering on that marker afterwards cannot work:
    # the punctuation stripping below removes the parentheses first, which
    # would leave a stray "truncated" token in the output.
    text = text.replace("(full tasting notes on the WCI website).", " ")
    # split words-like-this into separate words
    text = text.replace("-", " ")
    # keep only word characters and spaces, then drop digit runs as well
    stripped = re.sub(r"[^\w ]|\d+", "", text).lower()
    return [word for word in stripped.split(" ")
            if word and word not in stops]


# Stop words, one per line; trailing whitespace (including the newline) is stripped.
with open("stop_words.txt", "r") as stop_file:
    stoplist = {line.rstrip() for line in stop_file}

# prepare data: one flat token list per wine record, skipping records that
# end up with no tokens at all
reviews = []
for wine_record in review_data:
    wine_tokens = [
        token
        for review in wine_record["reviews"]
        if review["text"] is not None
        for token in clean_text(review["text"], stoplist)
    ]
    if wine_tokens:
        reviews.append(wine_tokens)

In [None]:
import gensim

# construct tfidf model
# Build the gensim dictionary from the per-wine token lists and save it to disk.
dictionary = gensim.corpora.Dictionary(reviews)
dictionary.save("review_text_data_gensim.dict")

# One doc2bow vector per token list, then a TF-IDF model built from that corpus.
corpus = list(map(dictionary.doc2bow, reviews))
model = gensim.models.TfidfModel(corpus)

def get_top_words(bow, model):
    """Return ``(word, weight)`` pairs for ``bow``, highest weight first.

    ``model[bow]`` supplies ``(id, weight)`` pairs; each id is translated to
    its word through the module-level ``dictionary``.
    """
    weighted = list(model[bow])
    weighted.sort(key=lambda pair: pair[1], reverse=True)
    return [(dictionary[pair[0]], pair[1]) for pair in weighted]

# add top tfidf words to data
NUM_TOP_WORDS = 12
for r_d in review_data:
    # gather every cleaned token from this wine's reviews that have text
    wine_words = []
    for r in r_d["reviews"]:
        if r["text"] is not None:
            wine_words.extend(clean_text(r["text"], stoplist))
    bow = dictionary.doc2bow(wine_words)

    # get_top_words() already returns the pairs ordered by descending weight,
    # so no second sort is needed
    sorted_words = get_top_words(bow, model)

    # a wine with fewer than NUM_TOP_WORDS weighted words gets an empty list
    if len(sorted_words) >= NUM_TOP_WORDS:
        r_d["top_words"] = [word for word, _weight in sorted_words[:NUM_TOP_WORDS]]
    else:
        r_d["top_words"] = []