<h1>COMP 47670 Assignment 2 </h1>

<h2>Brian Delaney - 09513574 </h2>

<h3>Required Imports</h3>

In [42]:
import requests
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score


<h2>Q1 - Select Two Review Categories and Scrape</h2>

<h3>Website Base Address</h3>

In [31]:
base_address = "http://mlg.ucd.ie/modules/yalp/"

<h3>Generic Cleaning Functions</h3>

<h4>Set up the required downloads for cleaning the data - we can call this straight away</h4>

In [16]:
def setup_clean():
    nltk.download('punkt')
    nltk.download('stopwords')
    
setup_clean()

[nltk_data] Downloading package punkt to C:\Users\Brian
[nltk_data]     Delaney\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Brian
[nltk_data]     Delaney\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h4>Take a raw text string and tokenize it using the nlkt word tokenizer</h4>

In [17]:
def tokenise(rawtext):
    return nltk.word_tokenize(rawtext)

<h4>Normalise are tokens by making them all lowercase</h4>

In [18]:
def normalise(tokens):
    normalised = []
    for token in tokens:
        normalised.append(token.lower())
    return normalised

<h4>Remove the standard english stop words using the built in nlkt stop words list</h4>

In [19]:
def removeStopWords(tokens):
    stopwds = stopwords.words('english')
    removed = []
    [removed.append(token) for token in tokens if token not in stopwds]
    return removed

<h4>Stem our words using nltk porter stemmer to further be able to extract meaning from the words</h4>

In [20]:
def stem(tokens):
    stemmed = []
    stemmer = PorterStemmer()
    [stemmed.append(stemmer.stem(word)) for word in tokens]
    return stemmed

<h4>Define a cleaning pipeline using our cleaning functions that takes some raw text and returns a list of cleaned tokens</h4>

In [21]:
def clean_text(raw_text):
    tokens = tokenise(raw_text)
    tokens = normalise(tokens)
    tokens = removeStopWords(tokens)
    tokens = stem(tokens)
    return tokens

<h3>Generic Scraping Functions</h3>

<h4>Return Content of Page using requests lib</h4>

In [22]:
def get_page(url):
    try:
        r = requests.get(url)
        if(r.status_code == 200):
            return r.content
        else:
            return None
    except Exception as e:
        print("Error Scraping {0} : {1}".format(url, e))

<h4>Scrape the location links for a given location type given the base page soup </h4>

In [23]:
def scrape_location_links(soup):
    location_links = soup.find_all('a')

    full_location_links = []
    for location_link in location_links:
        full_location_link = base_address + location_link['href']
        full_location_links.append(full_location_link)

    return full_location_links

<h4>Scrape the reviews for a given location link </h4>

In [24]:
def scrape_reviews_for_location_link(location_link):
    content = get_page(location_link)
    soup = BeautifulSoup(content, features='html.parser')

    all_reviews = []
    reviews = soup.find_all('div', 'review')
    for review in reviews:
        review_data = extract_review_data(review)
        all_reviews.append(review_data)

    return all_reviews

<h4>Extract the review score from the stars image using regular expressiosn</h4>

In [25]:
def extract_review_score_from_image(review_score_img):
    return re.findall(r'stars-([0-9]).png', review_score_img['src'])[0]

<h4>Convert the numeric star rating into a positive (1) or negative (0) review</h4>

In [26]:
def num_stars_to_postive_or_negative(num_stars):
    num_stars = int(num_stars)
    if(num_stars > 0 and num_stars <= 3):
        return 0
    elif(num_stars > 3 and num_stars <= 5):
        return 1
    else:
        raise Exception("Invalid Review Rating")

<h4> Extract the review data from the review page soup </h4>

In [27]:
def extract_review_data(review_soup):
    review_text = review_soup.find('p', 'text').getText()
    review_score_img = review_soup.find('img')

    review_score = extract_review_score_from_image(review_score_img)
    review_class = num_stars_to_postive_or_negative(review_score)

    return {"review_class" : review_class, "review_text" : review_text}


<h4>Function that when given a base type page will scrape the review data, clean the review and return a list of dicts.
Each dict has a "review_class" representing whether the review was positive or negative and a "review_text" which
represents a cleaned list of tokens from the review.</h4>

In [28]:
def get_and_clean_review_data(start_page):
    content = get_page(start_page)
    soup = BeautifulSoup(content, features='html.parser')
    location_links = scrape_location_links(soup)

    all_reviews_for_type = []
    for location_link in location_links:
        all_review_data_for_location = scrape_reviews_for_location_link(location_link)
        for review_data in all_review_data_for_location:
            cleaned_review_text = clean_text(review_data["review_text"])
            all_reviews_for_type.append({"review_class" : review_data["review_class"], "review_text": cleaned_review_text})

    return all_reviews_for_type

<h3>Start Scrape and Clean </h3>

In [32]:
bar_start_page = "http://mlg.ucd.ie/modules/yalp/bars_list.html"
restaurant_start_page = "http://mlg.ucd.ie/modules/yalp/restaurants_list.html"

In [33]:
cleaned_bar_data = get_and_clean_review_data(bar_start_page)
cleaned_restaurant_data = get_and_clean_review_data(restaurant_start_page)

<h2>Q2 - Create numeric represntitons, train classifier and evaluate</h2>

<h3>Category 1 - Bars</h3>

<h4>Sperate out bar data into category and review content lists</h4>

In [35]:
review_classes = []
review_text = []

for cleaned_data in cleaned_bar_data:
    review_text.append(cleaned_data["review_text"])
    review_classes.append(cleaned_data["review_class"])

<h4>Create tfidf_vectorizor and arrange our data in the correct numeric format </h4>

In [36]:
tfidf = TfidfVectorizer(preprocessor=' '.join, stop_words='english')

y = np.array(review_classes)
X = tfidf.fit_transform(review_text).toarray()

In [39]:
print(y.shape)
print(X.shape)

(1460,)
(1460, 7005)


<h4>Split our data into train and test sets and train a RandomForest Ensemble Classifier on the training set</h4>

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

<h4>Evaluate using test set</h4>

In [41]:
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[ 62  42]
 [  9 179]]
              precision    recall  f1-score   support

           0       0.87      0.60      0.71       104
           1       0.81      0.95      0.88       188

   micro avg       0.83      0.83      0.83       292
   macro avg       0.84      0.77      0.79       292
weighted avg       0.83      0.83      0.82       292

0.8253424657534246


<h4> Evaluate using cross validation</h4>

In [43]:
scores = cross_val_score(classifier, X, y, cv=10)
print(scores)
scores = pd.Series(scores)
print(scores.mean(), " =/- ", scores.std())

[0.9047619  0.84353741 0.82993197 0.7414966  0.82312925 0.86206897
 0.84137931 0.84137931 0.8        0.84137931]
0.8329064039408868  =/-  0.04201200978068417


<h3>Category 2 - Restaurants</h3>

<h4>Sperate out bar data into category and review content lists</h4>

In [44]:
review_classes = []
review_text = []

for cleaned_data in cleaned_restaurant_data:
    review_text.append(cleaned_data["review_text"])
    review_classes.append(cleaned_data["review_class"])

<h4>Create tfidf_vectorizor and arrange our data in the correct numeric format </h4>

In [45]:
tfidf = TfidfVectorizer(preprocessor=' '.join, stop_words='english')

y = np.array(review_classes)
X = tfidf.fit_transform(review_text).toarray()

In [46]:
print(y.shape)
print(X.shape)

(1440,)
(1440, 6485)


<h4>Split our data into train and test sets and train a RandomForest Ensemble Classifier on the training set</h4>

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

<h4>Evaluate using test set</h4>

In [48]:
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[ 56  36]
 [  9 187]]
              precision    recall  f1-score   support

           0       0.86      0.61      0.71        92
           1       0.84      0.95      0.89       196

   micro avg       0.84      0.84      0.84       288
   macro avg       0.85      0.78      0.80       288
weighted avg       0.85      0.84      0.84       288

0.84375


<h4> Evaluate using cross validation</h4>

In [49]:
scores = cross_val_score(classifier, X, y, cv=10)
print(scores)
scores = pd.Series(scores)
print(scores.mean(), " =/- ", scores.std())

[0.83448276 0.77241379 0.8137931  0.84027778 0.84722222 0.84722222
 0.81944444 0.7972028  0.76223776 0.85314685]
0.8187443734426493  =/-  0.03222396466328304


<h2>Q3 - Evaluate how well the classifiers transfer between categories</h2>

In [50]:
print(len(cleaned_bar_data))
print(len(cleaned_restaurant_data))

1460
1440


<h4>Combine data into a single data set</h4>

In [51]:
cleaned_data = []

for cleaned_bar in cleaned_bar_data:
    cleaned_data.append(cleaned_bar)
    
for cleaned_restaurant in cleaned_restaurant_data:
    cleaned_data.append(cleaned_restaurant)
    
print(len(cleaned_data))

2900


<h4>Split the data into target values and review content as before</h4>

In [52]:
review_classes = []
review_text = []

for data in cleaned_data:
    review_text.append(data["review_text"])
    review_classes.append(data["review_class"])

<h4>Create numeric tfidf representation of our data set</h4>

In [53]:
tfidf = TfidfVectorizer(preprocessor=' '.join, stop_words='english')

y = np.array(review_classes)
X = tfidf.fit_transform(review_text).toarray()

<h4>Train model A using only the first 1460 reviews i.e. only the bar data</h4>

In [55]:
X_bar = X[:1460]
y_bar = y[:1460]

bar_classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
bar_classifier.fit(X_bar, y_bar)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

<h4>Train model A using only the last 1440 reviews i.e. only the restaurant data</h4>

In [56]:
X_restaurant = X[1440:]
y_restaurant = y[1440:]

restaurant_classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
restaurant_classifier.fit(X_restaurant, y_restaurant)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

<h4>Test the performance of the bar reviews on the classifier trained on the restaurant data only </h4>

In [57]:
y_bar_pred = restaurant_classifier.predict(X_bar)

print(confusion_matrix(y_bar, y_bar_pred))
print(classification_report(y_bar, y_bar_pred))
print(accuracy_score(y_bar, y_bar_pred))

[[326 239]
 [ 28 867]]
              precision    recall  f1-score   support

           0       0.92      0.58      0.71       565
           1       0.78      0.97      0.87       895

   micro avg       0.82      0.82      0.82      1460
   macro avg       0.85      0.77      0.79      1460
weighted avg       0.84      0.82      0.81      1460

0.8171232876712329


<h4>Test the performance of the restaurant reviews on the classifier trained on the bar data only </h4>

In [58]:
y_rest_pred = bar_classifier.predict(X_restaurant)

print(confusion_matrix(y_restaurant, y_rest_pred))
print(classification_report(y_restaurant, y_rest_pred))
print(accuracy_score(y_restaurant, y_rest_pred))

[[324 188]
 [ 47 901]]
              precision    recall  f1-score   support

           0       0.87      0.63      0.73       512
           1       0.83      0.95      0.88       948

   micro avg       0.84      0.84      0.84      1460
   macro avg       0.85      0.79      0.81      1460
weighted avg       0.84      0.84      0.83      1460

0.839041095890411
