In [297]:
# Import the libraries needed
from bs4 import BeautifulSoup, SoupStrainer
from collections import Counter
import httplib2
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
import urllib.request
from urllib.request import urlopen, Request
import random
import re
import requests

In [298]:
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [299]:
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

## 3.2 Dataset Analysis

### Tokenisation and Stemming

In [300]:
# get reviews for a random business 
random_business = reviews.sample()
random_business_id = random_business.iloc[0]['business_id']
small_business_dataset = reviews.loc[reviews['business_id'] == random_business_id]
small_business_dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
1800,dgNAX385Zmzcx_IBQDw6yQ,VHbwSPBmxaDC6SfJ5wCjIA,CQxhyLM833WF45yrS9yeAA,5,0,0,0,I couldn't have asked for a better experience ...,2016-03-07 00:59:06
1839,TE2VTDyXwb0wRuxhmq1PQw,xSDbXCgHXDVruZ1HfCCDNQ,CQxhyLM833WF45yrS9yeAA,5,1,0,0,I could not have asked for a more peaceful pas...,2018-08-14 15:15:36
1844,bJpeLYJwKfXaxbogQ6m7pg,3cZEJcR2HTPQD_IVsRG2Lg,CQxhyLM833WF45yrS9yeAA,5,0,0,0,After 14 years of an amazing relationship my 1...,2017-11-07 16:28:44
1845,I8Ww_eX6Zuc_nGC7zi-rwg,Bfsq2cmTknhnD5wFhqq0iA,CQxhyLM833WF45yrS9yeAA,5,1,0,0,This was such a heart wrenching decision to ma...,2017-09-09 16:25:16
1858,W4ho5REhO0S5MLrVLcx8Rw,9R66qSElDudAvga9x91D9Q,CQxhyLM833WF45yrS9yeAA,5,2,0,0,I will never use any other method again but in...,2016-11-22 21:27:03


In [301]:
small_business_dataset_reviews = list(small_business_dataset['text'])

In [302]:
# convert the reviews into a concatenated string 
b1_review = ''.join(small_business_dataset_reviews)
clean_review = re.sub(r"[^A-Za-z0-9\s]+", "", b1_review)
b1_review = nlp(clean_review)

In [303]:
# removed punctuation and get the top 10 most common words (including stopwords)
b1_review_words = [token.text for token in b1_review if token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('and', 631), ('to', 544), ('the', 484), ('was', 305), ('I', 297), ('a', 263), ('of', 201), ('for', 187), ('our', 162), ('Dr', 161)]


In [304]:
# removed punctuation and get the top 10 most common words (excluding stopwords)
b1_review_words = [token.text for token in b1_review if token.is_stop != True and token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('Dr', 161), ('Fixler', 150), ('time', 104), ('Happy', 75), ('Endings', 70), ('nt', 64), ('home', 60), ('decision', 50), ('dog', 47), ('pet', 43)]


In [305]:
#TODO: plot log graph


In [306]:
# now we do some stemming after removing the stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

porter_st = PorterStemmer()
lancaster_st = LancasterStemmer()
snow_st = SnowballStemmer("english")

In [307]:
# Using Porter Stemmer
porter_stemmed_words = [porter_st.stem(word) for word in b1_review_words]
porter_freq = Counter(porter_stemmed_words)
porter_common = porter_freq.most_common(10)
print(porter_common)

[('dr', 163), ('fixler', 162), ('time', 110), ('end', 101), ('happi', 87), ('pet', 68), ('nt', 64), ('dog', 63), ('thank', 62), ('home', 60)]


In [308]:
# Using Lancaster Stemmer
lancaster_stemmed_words = [lancaster_st.stem(word) for word in b1_review_words]
lancaster_freq = Counter(lancaster_stemmed_words)
lancaster_common = lancaster_freq.most_common(10)
print(lancaster_common)

[('dr', 162), ('fixl', 162), ('tim', 110), ('end', 102), ('happy', 87), ('pet', 68), ('nt', 64), ('dog', 63), ('thank', 62), ('hom', 60)]


In [309]:
# Using Snowball Stemmer
snow_stemmed_words = [snow_st.stem(word) for word in b1_review_words]
snow_freq = Counter(snow_stemmed_words)
snow_common = snow_freq.most_common(10)
print(snow_common)

[('dr', 162), ('fixler', 162), ('time', 110), ('end', 101), ('happi', 87), ('pet', 68), ('nt', 64), ('dog', 63), ('thank', 62), ('home', 60)]


### POS Tagging

In [310]:
random_sentences = reviews.sample(5, random_state=42)
random_sentences = list(random_sentences['text'])

In [311]:
random_sentences[0]

"Que ce soit pour leurs délicieux bubbles tea/smooties, leurs ''Bánh mì'' , leurs petits snacks (viennoiseries, tapioca, ...), on adore Vua et aussi leurs prix très abordables. On y retourne lorsqu'on est dans le Quartier Latin !"

In [312]:
nltk_tagged = []
for sentence in random_sentences:
    nltk_tagged.append((nltk.pos_tag(word_tokenize(sentence))))
nltk_tagged

[[('Que', 'NNP'),
  ('ce', 'NN'),
  ('soit', 'VBD'),
  ('pour', 'JJ'),
  ('leurs', 'NNS'),
  ('délicieux', 'VBP'),
  ('bubbles', 'NNS'),
  ('tea/smooties', 'NNS'),
  (',', ','),
  ('leurs', 'VBZ'),
  ('``', '``'),
  ('Bánh', 'NNP'),
  ('mì', 'NN'),
  ("''", "''"),
  (',', ','),
  ('leurs', 'VBZ'),
  ('petits', 'NNS'),
  ('snacks', 'NNS'),
  ('(', '('),
  ('viennoiseries', 'NNS'),
  (',', ','),
  ('tapioca', 'NN'),
  (',', ','),
  ('...', ':'),
  (')', ')'),
  (',', ','),
  ('on', 'IN'),
  ('adore', 'IN'),
  ('Vua', 'NNP'),
  ('et', 'CC'),
  ('aussi', 'JJ'),
  ('leurs', 'NNS'),
  ('prix', 'VBP'),
  ('très', 'JJ'),
  ('abordables', 'NNS'),
  ('.', '.'),
  ('On', 'IN'),
  ('y', 'JJ'),
  ('retourne', 'JJ'),
  ("lorsqu'on", 'NN'),
  ('est', 'JJS'),
  ('dans', 'NNS'),
  ('le', 'VBP'),
  ('Quartier', 'NNP'),
  ('Latin', 'NNP'),
  ('!', '.')],
 [('As', 'IN'),
  ('I', 'PRP'),
  ("'ve", 'VBP'),
  ('said', 'VBD'),
  ('previously', 'RB'),
  ('...', ':'),
  ('we', 'PRP'),
  ("'ve", 'VBP'),
  ('been

In [313]:
nlp = spacy.load("en_core_web_sm")
spacy_tagged = []
for sentence in random_sentences:
    spacy_tagged.append(nlp(sentence))
for tagged in spacy_tagged:
    for token in tagged:
        print(f'{token.text:{8}} {token.pos_:{6}}')

Que      PROPN 
ce       PROPN 
soit     ADJ   
pour     NOUN  
leurs    VERB  
délicieux NOUN  
bubbles  NOUN  
tea      NOUN  
/        SYM   
smooties NOUN  
,        PUNCT 
leurs    VERB  
''       PUNCT 
Bánh     PROPN 
mì       INTJ  
''       PUNCT 
,        PUNCT 
leurs    NOUN  
petits   VERB  
snacks   NOUN  
(        PUNCT 
viennoiseries NOUN  
,        PUNCT 
tapioca  INTJ  
,        PUNCT 
...      PUNCT 
)        PUNCT 
,        PUNCT 
on       ADP   
adore    PROPN 
Vua      PROPN 
et       PROPN 
aussi    PROPN 
leurs    VERB  
prix     NOUN  
très     ADJ   
abordables NOUN  
.        PUNCT 
On       ADP   
y        PROPN 
retourne VERB  
lorsqu'on PROPN 
est      PROPN 
dans     PROPN 
le       X     
Quartier PROPN 
Latin    PROPN 
!        PUNCT 
As       ADP   
I        PRON  
've      AUX   
said     VERB  
previously ADV   
...      PUNCT 
we've    PROPN 
been     AUX   
coming   VERB  
to       ADP   
LMAH     PROPN 
for      ADP   
over     ADP   
10       NUM 

# WORK COMPLETED UP TILL HERE.

### Writing Style

In [314]:
def getSOFurl():
    while(True):
        ip0 = 'stackoverflow'
        ip1 = 'com'
        ip2 = 'questions'
        ip3 = str(random.randint(0, 100000000))
        url = 'https://' + ip0 + '.' + ip1 + '/'+ ip2 + '/'+ ip3
        try:
            urlContent = urlopen(url).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
            pass

    print("Found URL: " + url)
    page1 = requests.get(url)
    soup1 = BeautifulSoup(page1.content, "html.parser")
    text = list(soup1.find_all("p"))
    sof_text = [txt.get_text() for txt in text]
   
    return sof_text


In [315]:
def getHWZurl():
    url = 'https://www.hardwarezone.com.sg/home'
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a'):
        urls.append(link.get('href'))
    random.shuffle(urls)
    while(True):
        for url in urls:
            print(url)
            try:
                url = 'https://www.hardwarezone.com.sg' + url
                urlContent = urlopen(url).read()
                page1 = requests.get(url)
                soup1 = BeautifulSoup(page1.content, "html.parser")
                text = list(soup1.find_all("p"))
                hwz_text = [txt.get_text() for txt in text]
                if urlContent and len(hwz_text)>10:
                    break
            except Exception as e: 
                print(e)
        break
    print("Found URL: " + url)
    return hwz_text

#### getting random CNA urls
## TBD

https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup

In [316]:
url = 'https://www.channelnewsasia.com/'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
 
urls = []
for link in soup.find_all('a'):
    urls.append(link.get('href'))
urls

[]

In [317]:
while(True):
    ip0 = 'https://www.channelnewsasia.com/'
    for url in urls:
        url2 = ip0 + url
        try:
            urlContent = urlopen(url2).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
            pass
    break

print("Found URL: " + url)

Found URL: https://www.channelnewsasia.com/


#### Cleaning text data

In [318]:
sof_text1 = getSOFurl()
sof_text2 = getSOFurl()

Found URL: https://stackoverflow.com/questions/506599
Found URL: https://stackoverflow.com/questions/16295122


In [319]:
hwz_text1 = getHWZurl()
hwz_text2 = getHWZurl()

https://www.hardwarezone.com.sg/tech-news-alan-wake-remastered-brings-4k-visuals-bright-falls-october
<urlopen error [Errno 11001] getaddrinfo failed>
None
can only concatenate str (not "NoneType") to str
/features
/review-samsung-galaxy-z-fold3-review-cheaper-better-foldable-phone-features-specs-price-singapore
Found URL: https://www.hardwarezone.com.sg/review-samsung-galaxy-z-fold3-review-cheaper-better-foldable-phone-features-specs-price-singapore
/product-guide/212-input-devices/home
/product-guide/49275-smart-living-lifestyle/home
/product-guide/47830-movies-shows/home
/feeds
/product-guide/47829-toys-collectibles/home
/product-guide/197-storage/home
/product-guide/47825-smartphones-tablets/home
https://www.hardwarezone.com.sg/tech-news-final-trailer-no-time-die-farewell-daniel-craig-james-bond
<urlopen error [Errno 11001] getaddrinfo failed>
/hwz-tv
/product-guide/6582-automotive-tech/home
https://www.hardwarezone.com.sg/product-guide/217/home
<urlopen error [Errno 11001] getaddr

##### Cleaning SOF text data

In [320]:
sof_text1 = sof_text1[4:]
sof_text1 = sof_text1[:-10]
temp_list = []
for line in sof_text1:
    temp_list += sent_tokenize(line)
sof_text1 = temp_list

In [321]:
sof_text1

['I am a web developer (J2EE application developer) and just want to expand what tools I use.',
 'I want to use Open Solaris for my personal projects.',
 'I have nothing against Linux and It looks like a lot of the same tools are on both systems.',
 'Have you jumped to Solaris, was it a good experience?',
 "DTrace, zones, switch between 32 bit and 64 bit mode with a single GRUB switch, ZFS, stable libraries (I can't really emphasize that one enough).",
 'Solaris 7 software generally runs on OpenSolaris, otherwise known as Solaris 11.  glibc changes between minor kernel releases.',
 'Xen is integrated pretty tightly, and setting up lx zones or virtualization to keep your Linux environment is dead simple.',
 'OpenSolaris now has /usr/bin/gnu, where all you favorite utilities can be found.',
 'Expect, though, to end up fighting the ./configure && make && make install cycle a little bit.',
 "A lot of developers assume you're running Linux, and don't prepend -m64 for Solaris, among other th

In [322]:
sof_text2 = sof_text2[4:]
sof_text2 = sof_text2[:-10]
temp_list = []
for line in sof_text2:
    temp_list += sent_tokenize(line)
sof_text2 = temp_list

##### Cleaning CNA text data

##### Cleaning HWZ text data

In [323]:
hwz_text1 = hwz_text1[:-5]
temp_list = []
for line in hwz_text1:
    temp_list += sent_tokenize(line)
hwz_text1 = temp_list

In [324]:
hwz_text2 = hwz_text2[:-5]
temp_list = []
for line in hwz_text2:
    temp_list += sent_tokenize(line)
hwz_text2 = temp_list

##### First word in sentence capitalized?

In [325]:
count=0
uppercount=0
for sent in sof_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for sof_text1: ", uppercount/count)

Fraction of first letter being capitalised for sof_text1:  0.9743589743589743


In [326]:
count=0
uppercount=0
for sent in sof_text2:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for sof_text2: ", uppercount/count)

Fraction of first letter being capitalised for sof_text2:  1.0


In [327]:
count=0
uppercount=0
for sent in hwz_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for hwz_text1: ", uppercount/count)

Fraction of first letter being capitalised for hwz_text1:  0.9864864864864865


In [328]:
count=0
uppercount=0
for sent in hwz_text2:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for hwz_text2: ", uppercount/count)

Fraction of first letter being capitalised for hwz_text2:  1.0


##### Length of articles?

In [329]:
print("No of sentences in sof_text1: ", len(sof_text1))
print("No of sentences in sof_text2: ", len(sof_text2))
print("No of sentences in hwz_text1: ", len(hwz_text1))
print("No of sentences in hwz_text2: ", len(hwz_text2))

No of sentences in sof_text1:  39
No of sentences in sof_text2:  18
No of sentences in hwz_text1:  74
No of sentences in hwz_text2:  15


##### Proper nouns capitalised?

In [338]:
tagged = []
uppercount = 0
count = 0
for sentence in sof_text1:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in sof_text1: ', uppercount/count)

Fraction of proper nouns capitalised in sof_text1:  0.8363636363636363


In [339]:
tagged = []
uppercount = 0
count = 0
for sentence in sof_text2:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in sof_text2: ', uppercount/count)

Fraction of proper nouns capitalised in sof_text2:  1.0


In [340]:
tagged = []
uppercount = 0
count = 0
for sentence in hwz_text1:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in hwz_text1: ', uppercount/count)

Fraction of proper nouns capitalised in hwz_text1:  0.9369747899159664


In [341]:
tagged = []
uppercount = 0
count = 0
for sentence in hwz_text2:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in hwz_text2: ', uppercount/count)

Fraction of proper nouns capitalised in hwz_text2:  1.0


##### Good grammar?
1. Tense matching
2. Subject-verb agreement