In [None]:
import io

import bs4 as bs
import numpy as np
import pandas as pd
import requests

# get the ratings while preserving the link to the reviews
url = 'http://www.theramenrater.com/resources-2/the-list/'
# Bound the request and fail loudly on an HTTP error instead of parsing an
# error page as if it were the ratings list.
r = requests.get(url, timeout=30)
r.raise_for_status()
sp = bs.BeautifulSoup(r.content, 'lxml')
tb = sp.find_all('table')[0]
# read_html is handed a file-like object: passing literal HTML text is
# deprecated in recent pandas releases.
df = pd.read_html(io.StringIO(str(tb)), encoding='utf-8', attrs={'id': 'myTable'}, header=0)[0]
# NOTE(review): assumes exactly one <a> per table row, in row order; if the
# counts differ pandas raises ValueError on this assignment - confirm against
# the live page.
df['href'] = [tag.get('href') for tag in tb.find_all('a')]
df = df.set_index('Review #')

In [None]:
# helper function to find visible text in webpage
def tag_visible(element):
    """Tell whether a parsed text node counts as visible page text.

    A node is rejected when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment.

    Args:
        element: a text node exposing ``.parent.name``.

    Returns:
        bool: True to keep the node, False to drop it.
    """
    non_rendered_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    parent_is_rendered = element.parent.name not in non_rendered_parents
    # The parent check runs first, so the comment test is only evaluated
    # for nodes that passed it.
    return parent_is_rendered and not isinstance(element, bs.Comment)

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag.perceptron import PerceptronTagger
from nltk import word_tokenize
import nltk
import string
import re
from numba import jit


def remove_features(word, tagger, lmtzr):
    """Run one token through the cleaning pipeline and return what survives.

    The token is lower-cased and then passed, in order, through stop-word
    removal, punctuation removal, digit removal, part-of-speech filtering,
    lemmatization and short-word removal. The pipeline stops as soon as a
    step reduces the token to an empty or whitespace-only string.

    Args:
        word: the raw token.
        tagger: object with a ``tag(list_of_tokens)`` method, forwarded to
            ``filter_tag_pos``.
        lmtzr: object with a ``lemmatize(word, pos=...)`` method, forwarded
            to ``lemmatize_word``.

    Returns:
        str: the cleaned token, or an empty/whitespace string when the
        token was thrown out.
    """
    # Register the shared NLTK data directory only once. Appending it on
    # every call grew nltk.data.path by one duplicate entry per token.
    nltk_data_dir = "/opt/gmi/bd_userapps/shared/nltk_data"
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.append(nltk_data_dir)

    function_list = [remove_stop_words, remove_puncuation, remove_numbers, filter_tag_pos, lemmatize_word,
                     remove_short_words]
    # lowercase
    word = word.lower()
    # Filled in by the tagging step and consumed by the lemmatizing step,
    # which always runs after it in function_list.
    tagged_text = None
    # iterate through functions and stop if the word gets thrown out
    for func in function_list:
        if func is filter_tag_pos:
            word, tagged_text = func(word, tagger)
        elif func is lemmatize_word:
            word = func(tagged_text, lmtzr)
        else:
            word = func(word)
        if word.isspace() or word == '':
            break
    return word


# Part-of-speech tags that survive filtering: nouns, adjectives and verbs.
_KEPT_POS_TAGS = frozenset([
    'NN', 'NNP', 'NNPS', 'NNS',                 # nouns
    'JJ', 'JJR', 'JJS',                         # adjectives
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',    # verbs
])


# Plain Python on purpose: this only handles str and tagger objects, which
# numba cannot compile in nopython mode (the @jit default since numba 0.59).
def filter_tag_pos(word, tagger):
    """Tag the part of speech and keep only verbs, nouns and adjectives.

    Args:
        word: the token to tag.
        tagger: object whose ``tag([word])`` returns a list of
            ``(token, tag)`` pairs.

    Returns:
        tuple: ``(word, tagged_text)`` where ``word`` is replaced by ``''``
        when its tag is not a kept one, and ``tagged_text`` is the tagger's
        unmodified output.
    """
    tagged_text = tagger.tag([word])
    # word & tag tuple
    if tagged_text[0][1] not in _KEPT_POS_TAGS:
        word = ''
    return word, tagged_text


# Plain Python on purpose: this only handles str and lemmatizer objects, which
# numba cannot compile in nopython mode (the @jit default since numba 0.59).
def lemmatize_word(tagged_text, lmtzr):
    """Lemmatize a tagged token when it is a verb or a noun.

    Args:
        tagged_text: list whose first item is a ``(token, tag)`` pair.
        lmtzr: object with a ``lemmatize(word, pos=...)`` method.

    Returns:
        str: the lemma for tags starting with ``v`` or ``n`` (any case),
        otherwise the token unchanged.
    """
    token = tagged_text[0][0]
    tag = tagged_text[0][1]
    # Slicing instead of indexing keeps an empty tag from raising IndexError;
    # such a token is simply returned as-is.
    pos_family = tag[:1].lower()
    if pos_family in ('v', 'n'):
        return lmtzr.lemmatize(token, pos=pos_family)
    return token


# Plain Python on purpose: numba's @jit brings nothing to a one-line string
# length check and cannot be relied on to compile the surrounding pipeline.
def remove_short_words(word):
    """Drop tokens shorter than three characters.

    Args:
        word: the token to check.

    Returns:
        str: ``word`` unchanged when it has at least three characters,
        otherwise ``''``.
    """
    return word if len(word) >= 3 else ''


# Lazily filled cache of the NLTK English stop words, so the corpus is read
# and hashed once instead of once per token.
_ENGLISH_STOP_WORDS = None


# Plain Python on purpose: this works on str and NLTK corpus objects, which
# numba cannot compile in nopython mode (the @jit default since numba 0.59).
def remove_stop_words(word, stops=None):
    """Blank out a token that appears in a stop-word collection.

    Args:
        word: the token to check.
        stops: optional container of stop words to test against. When
            omitted, the NLTK English stop-word list is loaded on first use
            and reused on later calls.

    Returns:
        str: ``''`` when ``word`` is a stop word, otherwise ``word``.
    """
    global _ENGLISH_STOP_WORDS
    if stops is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
        stops = _ENGLISH_STOP_WORDS
    return '' if word in stops else word


# Character class matching any single ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


# Plain Python on purpose: regex objects cannot be compiled by numba in
# nopython mode (the @jit default since numba 0.59).
def remove_puncuation(word):
    """Strip every ``string.punctuation`` character from a token.

    Args:
        word: the token to clean.

    Returns:
        str: ``word`` with all punctuation characters removed (possibly
        ``''``).
    """
    return _PUNCTUATION_RE.sub('', word)


# Matches a run of one or more digits; built once instead of on every call.
_DIGITS_RE = re.compile(r'\d+')


# Plain Python on purpose: regex objects cannot be compiled by numba in
# nopython mode (the @jit default since numba 0.59).
def remove_numbers(word):
    """Strip every digit from a token.

    Args:
        word: the token to clean.

    Returns:
        str: ``word`` with all digit runs removed (possibly ``''``).
    """
    return _DIGITS_RE.sub('', word)


def text_cleaner(raw_str, tagger, lmtzr):
    """Return ``raw_str`` reduced to its cleaned tokens, space separated.

    The text is tokenized with ``word_tokenize`` and every token is passed
    through ``remove_features``. Tokens that come back empty or
    whitespace-only are dropped, so the result has no leading, trailing or
    doubled spaces.

    Args:
        raw_str: the text to clean.
        tagger: part-of-speech tagger forwarded to ``remove_features``.
        lmtzr: lemmatizer forwarded to ``remove_features``.

    Returns:
        str: the surviving tokens joined by single spaces; ``''`` when
        nothing survives.
    """
    clean_words = []
    for word in word_tokenize(raw_str):
        cleaned = remove_features(word, tagger, lmtzr)
        # A rejected token comes back as '' (or whitespace); skipping it
        # here avoids padding the joined string with empty slots.
        if cleaned.strip():
            clean_words.append(cleaned)
    clean_str = " ".join(clean_words)
    # remove redundant spaces (raw string: '\s' is not a valid str escape)
    return re.sub(r'\s\s+', ' ', clean_str).strip()

In [None]:
import time

# init the nltk objects once
lmtzr = WordNetLemmatizer()
tagger = PerceptronTagger()

# site specific boilerplate that is stripped from every review; defined once
# rather than on every iteration
stop_phrases = ['Like this: Like',  'Loading...', 'See more related reviews', 'Spread the love', '( click to enlarge )']

for row in df[['href']].itertuples():
    # These defaults are also the failure values: a review that cannot be
    # fetched or parsed gets empty strings in all three columns.
    text_str = clean_str = unique_str = ''
    try:
        # row[1] is the href; the timeout keeps one dead link from hanging the run
        page = requests.get(row[1], timeout=30)
        page.raise_for_status()
        # get entry text
        soup = bs.BeautifulSoup(page.content, 'html.parser')
        entry = soup.find_all("div", {"class": "entry-content"})[0].find_all(string=True)
        visible_texts = filter(tag_visible, entry)
        text_str = " ".join(t.strip() for t in visible_texts)
        # remove extra site specific repeated text
        for stop in stop_phrases:
            text_str = text_str.replace(stop, '')
        # begin NLP preprocessing
        clean_str = text_cleaner(text_str, tagger, lmtzr)
        # unique the string for per company/country type analysis:
        # keep the first occurrence of each cleaned word, in original order
        unique_str = " ".join(dict.fromkeys(clean_str.split()))
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the scrape. Report the failure, then fill with empty strings.
        print('review %s skipped: %r' % (row.Index, err))
        text_str = clean_str = unique_str = ''
    finally:
        # don't hammer the site
        time.sleep(2)
    # Same column names on success and failure, so no stray columns appear
    # and failed rows hold '' rather than NaN.
    df.loc[row.Index, 'raw_review'] = text_str
    df.loc[row.Index, 'clean_review'] = clean_str
    df.loc[row.Index, 'clean_unique_review'] = unique_str

In [None]:
# Write the ratings plus scraped review text out as a gzip-compressed,
# tab-separated file.
df.to_csv(
    '../datasets/ramen_ratings_reviews.txt.gz',
    compression='gzip',
    sep='\t',
)