# Import required packages

In [344]:
from urllib import request, parse

import bs4

import pandas as pd

# import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score

from gensim.parsing.preprocessing import preprocess_documents
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

import matplotlib.pyplot as plt
%matplotlib inline


# Define needed functions for scraping

In [8]:
# Fetch a page and parse it into a BeautifulSoup tree
def getHtmlParser(link):
    """Download ``link`` and return its content parsed by BeautifulSoup.

    Parameters
    ----------
    link : str
        URL to fetch; handed unchanged to ``urllib.request.urlopen``.

    Returns
    -------
    bs4.BeautifulSoup
        The document parsed with the ``"html.parser"`` backend.

    Raises
    ------
    Whatever ``urlopen`` raises when the request fails, and
    ``UnicodeDecodeError`` if the body is not valid for the default
    ``bytes.decode()`` codec.
    """
    # The context manager closes the response (and its underlying
    # connection) even if reading or decoding raises.
    with request.urlopen(link) as response:
        html = response.read().decode()
    parser = bs4.BeautifulSoup(html, "html.parser")
    return parser

In [9]:
def getSelector(parser, selector):
    """Return whatever ``parser.select(selector)`` yields.

    ``parser`` is any object exposing a ``select`` method (in this notebook,
    the tree returned by ``getHtmlParser``); ``selector`` is passed through
    untouched.
    """
    return parser.select(selector)

In [29]:
def getLinks(link, selector):
    """Collect the ``href`` of every element matching ``selector`` on ``link``.

    The page is fetched with ``getHtmlParser`` and queried with
    ``getSelector``. Each ``href`` is returned exactly as it appears in the
    page (it is not resolved against ``link``), in the order the matches are
    returned.
    """
    page = getHtmlParser(link)
    return [anchor['href'] for anchor in getSelector(page, selector)]

# Scrape the main page to get the category links

In [346]:
# Entry page of the review site; every other URL is resolved against it
mainPage = "http://mlg.ucd.ie/modules/yalp/"

# Hrefs come back as written in the page (relative file names in the
# recorded output), so they must be joined with mainPage before fetching.
categories = getLinks(link=mainPage, selector='div.category a[href]')
print("Categories url:", categories)

Categories url: ['automotive_list.html', 'cafes_list.html', 'fashion_list.html', 'gym_list.html', 'hair_salons_list.html', 'hotels_list.html', 'restaurants_list.html']


# Get each category's reviews and store them in JSON files

In [75]:
# Scrape the first three categories and persist one JSON file per category.
for category in categories[:3]:
    reviewTexts = []
    reviewRates = []
    # Output file name, e.g. 'automotive_list.html' -> 'automotive_list.json'
    category_json = category.replace('.html', '.json')
    reviewUrl = parse.urljoin(mainPage, category)
    reviewPages = getLinks(reviewUrl, 'h5 a[href]')

    # Visit all category sub-links to get reviews
    for page in reviewPages:
        # Links are relative to the page they were found on, so resolve them
        # against the category page rather than the site root.
        parser = getHtmlParser(parse.urljoin(reviewUrl, page))
        reviews = parser.select("div.review")

        for review in reviews:
            reviewText = review.find('p', {'class': 'review-text'}).get_text()
            reviewTexts.append(reviewText)

            # The rating is only available as the star image's URL
            img = review.select('p.rating img[src]')[0]
            reviewRates.append(img['src'])

    # Show the number of reviews
    print(category, len(reviewTexts))

    # Put category reviews in dataframe
    df = pd.DataFrame(data={'text': reviewTexts, 'rating': reviewRates})

    # Convert rating url to integer: take the first digit in the image URL.
    # Raw string avoids the invalid-escape warning; expand=False yields a
    # Series instead of a one-column DataFrame.
    df['rating'] = df['rating'].str.extract(r'(\d)', expand=False).astype('int8')

    # Create positive/negative classes from rating (4-5 stars => positive)
    df['positive_review'] = df['rating'] > 3

    # Save dataframe under the .json name that the loading cell reads back
    # (previously this wrote to the '.html' name and left category_json unused)
    df.to_json(category_json)

automotive_list.html
2000
cafes_list.html
2000
fashion_list.html
2000


# Retrieve the stored category reviews

In [78]:
# JSON file names mirror the category page names ('x.html' -> 'x.json')
categories_json = [name.replace('.html', '.json') for name in categories]

# Load the first three categories, in order, as df_A / df_B / df_C
df_A, df_B, df_C = (pd.read_json(categories_json[i]) for i in range(3))

In [79]:
# One (rows, columns) tuple per category, each on its own line
print(df_A.shape, df_B.shape, df_C.shape, sep="\n")

(2000, 3)
(2000, 3)
(2000, 3)


In [122]:
# Preview the first ten rows of the first category to sanity-check the
# text / rating / positive_review columns produced by the scraping step
df_A.head(10)

Unnamed: 0,text,rating,positive_review
0,The man that was working tonight (8-12-17) was...,1,False
1,Chris is a very rude person. Gave me an attitu...,1,False
2,One of my favorite gas station to stop at. The...,5,True
3,Oh thank Heaven for Seven Eleven! I don't know...,3,False
4,Five stars because of the guy who works weekda...,5,True
5,I had the best experience here!!!!!! All the e...,5,True
6,My objective with this shop was to find a good...,1,False
7,Absolute worst service ever. Went in for oil c...,1,False
8,Love love love their customer service. I come ...,5,True
9,I took my friend to this shop to get her oil c...,1,False


# Preprocessing

In [417]:
def prepross_data(df_text):
    """Fit a gensim TF-IDF pipeline on ``df_text`` and return it with its output.

    Parameters
    ----------
    df_text : iterable of str
        Raw documents (here, a pandas Series of review texts).

    Returns
    -------
    tuple
        ``(documents_tfidf, dct, model, dataset)`` where

        * ``documents_tfidf`` is a DataFrame with one row per document and one
          column per token, holding the TF-IDF weight (0.0 when absent);
        * ``dct`` is the fitted gensim ``Dictionary``;
        * ``model`` is the fitted gensim ``TfidfModel``;
        * ``dataset`` is the output of ``preprocess_documents`` for ``df_text``.
    """
    tokenised_docs = preprocess_documents(df_text)

    # Fit the vocabulary, then express every document as bag-of-words pairs
    vocabulary = Dictionary(tokenised_docs)
    bow_corpus = list(map(vocabulary.doc2bow, tokenised_docs))
    tfidf_model = TfidfModel(bow_corpus)

    # One {token: weight} mapping per document; pandas turns the list of
    # mappings into a table and leaves NaN where a token is missing.
    rows = []
    for weighted_doc in tfidf_model[bow_corpus]:
        rows.append({vocabulary[term_id]: weight for term_id, weight in weighted_doc})
    feature_table = pd.DataFrame(rows).fillna(0.0)

    return (feature_table, vocabulary, tfidf_model, tokenised_docs)

In [422]:
# Fit an independent dictionary + TF-IDF model per category
tfidf_A, dct_A, tfidf_model_A, dataset_A = prepross_data(df_text=df_A['text'])
tfidf_B, dct_B, tfidf_model_B, dataset_B = prepross_data(df_text=df_B['text'])
tfidf_C, dct_C, tfidf_model_C, dataset_C = prepross_data(df_text=df_C['text'])

# Classification
- split the data into train (80%) and test (20%) sets
- training 
- predict 
- evaluate 

In [419]:
def classify(tfidf, y, test_size=0.2, random_state=42):
    """Train a Complement Naive Bayes classifier and score it on a hold-out set.

    Parameters
    ----------
    tfidf : array-like or DataFrame
        Feature matrix, one row per document.
    y : array-like
        Target labels aligned with the rows of ``tfidf``.
    test_size : float, default 0.2
        Share of the rows held out for evaluation (passed to
        ``train_test_split``). The default keeps the original 80% / 20% split.
    random_state : int, default 42
        Seed for the shuffle, so that the split is reproducible.

    Returns
    -------
    tuple
        ``(clf, accuracy)``: the fitted ``ComplementNB`` model and its
        accuracy on the held-out rows.
    """
    # divide into train and test data sets
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf, y, test_size=test_size, random_state=random_state
    )

    # training
    clf = ComplementNB().fit(X_train, y_train)

    # predict
    predicted = clf.predict(X_test)

    # evaluate
    return (clf, accuracy_score(y_test, predicted))

In [420]:
# One in-category classifier per category, each scored on its own hold-out split
model_A, acc_A = classify(tfidf=tfidf_A, y=df_A['positive_review'])
model_B, acc_B = classify(tfidf=tfidf_B, y=df_B['positive_review'])
model_C, acc_C = classify(tfidf=tfidf_C, y=df_C['positive_review'])

In [421]:
# Report the in-category hold-out accuracy of each classifier
print("Accuracies:")
for label, accuracy in (
    ("Category A:", acc_A),
    ("Category B:", acc_B),
    ("Category C:", acc_C),
):
    print(label, accuracy)

Accuracies:
Category A: 0.92
Category B: 0.825
Category C: 0.8525


In [442]:
def transform_data(dataset, dct, model):
    """Express tokenised documents as TF-IDF rows using a fitted dictionary/model.

    Parameters
    ----------
    dataset : iterable of list of str
        Documents that are already tokenised.
    dct : object with ``doc2bow`` and integer ``__getitem__``
        Fitted dictionary mapping tokens to ids and ids back to tokens.
    model : object supporting ``model[corpus]``
        Fitted TF-IDF model applied to the bag-of-words corpus.

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per token that received a weight,
        with 0.0 wherever a token is absent from a document.
    """
    bow_corpus = list(map(dct.doc2bow, dataset))

    # One {token: weight} mapping per document; missing tokens become NaN in
    # the DataFrame constructor and are zero-filled afterwards.
    rows = [
        {dct[term_id]: weight for term_id, weight in weighted_doc}
        for weighted_doc in model[bow_corpus]
    ]
    return pd.DataFrame(rows).fillna(0.0)

In [443]:

# to transform other datasets
# ---------------------------
# X_new_counts = count_vect.transform(X_test)
# X_new_tfidf = tfidf_transformer.transform(X_new_counts)



In [472]:
def predict(tfidf_1, tfidf_23, model_1, reviews_2, reviews_3):
    """Score a classifier trained on one category against two other categories.

    Parameters
    ----------
    tfidf_1 : pandas.DataFrame
        Feature table the model was trained on; only its columns (the
        feature layout the model expects) are used.
    tfidf_23 : pandas.DataFrame
        Feature table of the documents to classify: the second category's
        rows followed by the third category's rows.
    model_1 : fitted classifier
        Object exposing ``predict``.
    reviews_2, reviews_3 : pandas.Series
        True labels of the second and third category, in the same order as
        the rows of ``tfidf_23``.

    Returns
    -------
    float
        Accuracy of ``model_1`` on the combined documents.
    """
    # Match the training layout exactly: same columns, same order. Features
    # missing from the new documents become 0.0, and columns the model never
    # saw are dropped. (Concatenating with an empty frame kept such extra
    # columns and relies on deprecated empty-frame concat behaviour.)
    tfidf_23 = tfidf_23.reindex(columns=tfidf_1.columns, fill_value=0.0)
    tfidf_23 = tfidf_23.fillna(0.0)

    # predict
    predicted = model_1.predict(tfidf_23)

    # Series.append was removed in pandas 2.0; pd.concat is the replacement
    y = pd.concat([reviews_2, reviews_3], ignore_index=True)

    return accuracy_score(y, predicted)

In [473]:
# Cross-category check: classifier trained on A, scored on B followed by C
tfidf_BC = transform_data(dataset=dataset_B + dataset_C, dct=dct_A, model=tfidf_model_A)
print(predict(tfidf_1=tfidf_A, tfidf_23=tfidf_BC, model_1=model_A,
              reviews_2=df_B['positive_review'], reviews_3=df_C['positive_review']))

0.8425


In [475]:
# Cross-category check: classifier trained on B, scored on A followed by C
tfidf_AC = transform_data(dataset=dataset_A + dataset_C, dct=dct_B, model=tfidf_model_B)
print(predict(tfidf_1=tfidf_B, tfidf_23=tfidf_AC, model_1=model_B,
              reviews_2=df_A['positive_review'], reviews_3=df_C['positive_review']))

0.822


In [476]:
# Cross-category check: classifier trained on C, scored on B followed by A
tfidf_BA = transform_data(dataset=dataset_B + dataset_A, dct=dct_C, model=tfidf_model_C)
print(predict(tfidf_1=tfidf_C, tfidf_23=tfidf_BA, model_1=model_C,
              reviews_2=df_B['positive_review'], reviews_3=df_A['positive_review']))

0.857


In [491]:
# Most characteristic features of the category-A classifier.
#
# Each row of feature_log_prob_ is argsorted in ascending order and the ten
# first (smallest-valued) column indices are mapped back to token names.
#
# NOTE(review): rows 0 / 1 presumably correspond to the classes False / True
# (negative / positive) — confirm with model_A.classes_. For ComplementNB the
# stored values appear to be weights computed on the *complement* of each
# class, which would explain why the smallest values of row 0 are labelled
# "positive" (and the recorded output agrees) — confirm against the
# scikit-learn documentation before reusing this with another estimator.
pos_class_prob_sorted = model_A.feature_log_prob_[0,:].argsort()
neg_class_prob_sorted = model_A.feature_log_prob_[1,:].argsort()


# tfidf_A.columns holds the token names in the column order the model was fit on
print("Top negative features:", ', '.join(tfidf_A.columns[neg_class_prob_sorted[:10]]))
print()
print("Top positive features:", ', '.join(tfidf_A.columns[pos_class_prob_sorted[:10]]))
#.sort(key=lambda tup: tup[1])

Top negative features: car, tire, oil, told, said, chang, ask, time, dai, servic

Top positive features: great, car, servic, work, help, friendli, recommend, need, gui, price


In [488]:
import numpy as np

from sklearn.model_selection import LeavePGroupsOut

# Toy data: six two-feature samples, alternating labels, three groups of two
X = np.arange(1, 13).reshape(6, 2)
y = np.tile([1, 2], 3)
groups = np.repeat([1, 2, 3], 2)

# Leave out all groups but one, i.e. train on a single group per split
logo = LeavePGroupsOut(n_groups=np.unique(groups).size - 1)
logo.get_n_splits(X, y, groups)

logo.get_n_splits(groups=groups)  # 'groups' is always required

print(logo)
for train_index, test_index in logo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

LeavePGroupsOut(n_groups=2)
TRAIN: [4 5] TEST: [0 1 2 3]
[[ 9 10]
 [11 12]] [[1 2]
 [3 4]
 [5 6]
 [7 8]] [1 2] [1 2 1 2]
TRAIN: [2 3] TEST: [0 1 4 5]
[[5 6]
 [7 8]] [[ 1  2]
 [ 3  4]
 [ 9 10]
 [11 12]] [1 2] [1 2 1 2]
TRAIN: [0 1] TEST: [2 3 4 5]
[[1 2]
 [3 4]] [[ 5  6]
 [ 7  8]
 [ 9 10]
 [11 12]] [1 2] [1 2 1 2]
