In [309]:
# Import the libraries needed
from bs4 import BeautifulSoup, SoupStrainer
from collections import Counter
import httplib2
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
import urllib.request
from urllib.request import urlopen, Request
import random
import re
import requests

In [310]:
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [311]:
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

## 3.2 Dataset Analysis

### Tokenisation and Stemming

In [312]:
# get reviews for a random business 
random_business = reviews.sample()
random_business_id = random_business.iloc[0]['business_id']
small_business_dataset = reviews.loc[reviews['business_id'] == random_business_id]
small_business_dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
11200,wysFaMq5S88mF6HLxdh3Vw,4nQ7MOkbc5u-DYUOgjiDsg,yEZn1XpLsEC9uBa-X4xAZw,4,1,0,0,Cute little vietnamese sandwich place on St De...,2016-03-17 14:29:44
11204,z81YK3CXQJb48BU7ZAcf7A,Ht8iGitRu8kynEubcIhsTQ,yEZn1XpLsEC9uBa-X4xAZw,4,7,2,8,"Hey Hey, it's the SAMMIES!\n\nSuper Yummy here...",2011-11-30 03:26:01
11209,tqajcnZA97HlXkIYrdNVlg,-a0XgJCXoJln2Ue_JMnfIQ,yEZn1XpLsEC9uBa-X4xAZw,3,0,0,0,Sandwich was good but service was terrible. We...,2016-07-14 18:52:34
11212,wXz1c5kz0QB2nvZVTnnkIg,F9ivL7-mzKbfnLO1rKUFFQ,yEZn1XpLsEC9uBa-X4xAZw,5,0,0,0,"Super fresh, delicious and cheap Banh Mi in th...",2017-07-28 17:48:01
11258,89J69mHgTTF25Ln97ONTrQ,La1ggB37gvq6UWpEWaMtMA,yEZn1XpLsEC9uBa-X4xAZw,4,2,0,2,Que ce soit pour leurs délicieux bubbles tea/s...,2012-12-04 21:25:13


In [313]:
small_business_dataset_reviews = list(small_business_dataset['text'])

In [314]:
# convert the reviews into a concatenated string 
b1_review = ''.join(small_business_dataset_reviews)
clean_review = re.sub(r"[^A-Za-z0-9\s]+", "", b1_review)
b1_review = nlp(clean_review)

In [315]:
# removed punctuation and get the top 10 most common words (including stopwords)
b1_review_words = [token.text for token in b1_review if token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('the', 340), ('and', 303), ('I', 231), ('a', 223), ('to', 154), ('of', 133), ('for', 122), ('in', 114), ('is', 111), ('was', 111)]


In [316]:
# removed punctuation and get the top 10 most common words (excluding stopwords)
b1_review_words = [token.text for token in b1_review if token.is_stop != True and token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('sandwich', 70), ('mi', 70), ('de', 60), ('nt', 58), ('banh', 58), ('place', 57), ('sandwiches', 51), ('good', 48), ('Vietnamese', 48), ('food', 39)]


In [317]:
#TODO: plot log graph


In [318]:
# now we do some stemming after removing the stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

porter_st = PorterStemmer()
lancaster_st = LancasterStemmer()
snow_st = SnowballStemmer("english")

In [319]:
# Using Porter Stemmer
porter_stemmed_words = [porter_st.stem(word) for word in b1_review_words]
porter_freq = Counter(porter_stemmed_words)
porter_common = porter_freq.most_common(10)
print(porter_common)

[('sandwich', 141), ('mi', 92), ('de', 80), ('banh', 73), ('place', 69), ('nt', 58), ('vietnames', 56), ('good', 53), ('sushi', 43), ('le', 43)]


In [320]:
# Using Lancaster Stemmer
lancaster_stemmed_words = [lancaster_st.stem(word) for word in b1_review_words]
lancaster_freq = Counter(lancaster_stemmed_words)
lancaster_common = lancaster_freq.most_common(10)
print(lancaster_common)

[('sandwich', 142), ('mi', 90), ('banh', 73), ('plac', 71), ('de', 61), ('nt', 58), ('vietnames', 56), ('good', 53), ('sush', 43), ('food', 42)]


In [321]:
# Using Snowball Stemmer
snow_stemmed_words = [snow_st.stem(word) for word in b1_review_words]
snow_freq = Counter(snow_stemmed_words)
snow_common = snow_freq.most_common(10)
print(snow_common)

[('sandwich', 141), ('mi', 90), ('banh', 73), ('place', 69), ('de', 61), ('nt', 58), ('vietnames', 56), ('good', 53), ('sushi', 43), ('food', 42)]


### POS Tagging

In [322]:
random_sentences = reviews.sample(5, random_state=42)
random_sentences = list(random_sentences['text'])

In [323]:
random_sentences[0]

"Que ce soit pour leurs délicieux bubbles tea/smooties, leurs ''Bánh mì'' , leurs petits snacks (viennoiseries, tapioca, ...), on adore Vua et aussi leurs prix très abordables. On y retourne lorsqu'on est dans le Quartier Latin !"

In [324]:
nltk_tagged = []
for sentence in random_sentences:
    nltk_tagged.append((nltk.pos_tag(word_tokenize(sentence))))
nltk_tagged

[[('Que', 'NNP'),
  ('ce', 'NN'),
  ('soit', 'VBD'),
  ('pour', 'JJ'),
  ('leurs', 'NNS'),
  ('délicieux', 'VBP'),
  ('bubbles', 'NNS'),
  ('tea/smooties', 'NNS'),
  (',', ','),
  ('leurs', 'VBZ'),
  ('``', '``'),
  ('Bánh', 'NNP'),
  ('mì', 'NN'),
  ("''", "''"),
  (',', ','),
  ('leurs', 'VBZ'),
  ('petits', 'NNS'),
  ('snacks', 'NNS'),
  ('(', '('),
  ('viennoiseries', 'NNS'),
  (',', ','),
  ('tapioca', 'NN'),
  (',', ','),
  ('...', ':'),
  (')', ')'),
  (',', ','),
  ('on', 'IN'),
  ('adore', 'IN'),
  ('Vua', 'NNP'),
  ('et', 'CC'),
  ('aussi', 'JJ'),
  ('leurs', 'NNS'),
  ('prix', 'VBP'),
  ('très', 'JJ'),
  ('abordables', 'NNS'),
  ('.', '.'),
  ('On', 'IN'),
  ('y', 'JJ'),
  ('retourne', 'JJ'),
  ("lorsqu'on", 'NN'),
  ('est', 'JJS'),
  ('dans', 'NNS'),
  ('le', 'VBP'),
  ('Quartier', 'NNP'),
  ('Latin', 'NNP'),
  ('!', '.')],
 [('As', 'IN'),
  ('I', 'PRP'),
  ("'ve", 'VBP'),
  ('said', 'VBD'),
  ('previously', 'RB'),
  ('...', ':'),
  ('we', 'PRP'),
  ("'ve", 'VBP'),
  ('been

In [325]:
nlp = spacy.load("en_core_web_sm")
spacy_tagged = []
for sentence in random_sentences:
    spacy_tagged.append(nlp(sentence))
for tagged in spacy_tagged:
    for token in tagged:
        print(f'{token.text:{8}} {token.pos_:{6}}')

Que      PROPN 
ce       PROPN 
soit     ADJ   
pour     NOUN  
leurs    VERB  
délicieux NOUN  
bubbles  NOUN  
tea      NOUN  
/        SYM   
smooties NOUN  
,        PUNCT 
leurs    VERB  
''       PUNCT 
Bánh     PROPN 
mì       INTJ  
''       PUNCT 
,        PUNCT 
leurs    NOUN  
petits   VERB  
snacks   NOUN  
(        PUNCT 
viennoiseries NOUN  
,        PUNCT 
tapioca  INTJ  
,        PUNCT 
...      PUNCT 
)        PUNCT 
,        PUNCT 
on       ADP   
adore    PROPN 
Vua      PROPN 
et       PROPN 
aussi    PROPN 
leurs    VERB  
prix     NOUN  
très     ADJ   
abordables NOUN  
.        PUNCT 
On       ADP   
y        PROPN 
retourne VERB  
lorsqu'on PROPN 
est      PROPN 
dans     PROPN 
le       X     
Quartier PROPN 
Latin    PROPN 
!        PUNCT 
As       ADP   
I        PRON  
've      AUX   
said     VERB  
previously ADV   
...      PUNCT 
we've    PROPN 
been     AUX   
coming   VERB  
to       ADP   
LMAH     PROPN 
for      ADP   
over     ADP   
10       NUM 

# WORK COMPLETED UP TILL HERE.

### Writing Style

#### Getting random stackOverflow urls

https://code.activestate.com/recipes/577389-random-url/

In [326]:
while(True):
    ip0 = 'stackoverflow'
    ip1 = 'com'
    ip2 = 'questions'
    ip3 = str(random.randint(0, 100000000))
    url = 'https://' + ip0 + '.' + ip1 + '/'+ ip2 + '/'+ ip3
    try:
        urlContent = urlopen(url).read()
        if urlContent:
            break
    except Exception as e: 
        print(e)
        pass

print("Found URL: " + url)

Found URL: https://stackoverflow.com/questions/2439375


In [327]:
# I think that the results of find_all might be more relevant but we can discuss this
page1 = requests.get(url)
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
sof_text1 = [txt.get_text() for txt in text]
sof_text1

['Find centralized, trusted content and collaborate around the technologies you use most.',
 'Teams',
 'Q&A for work',
 'Connect and share knowledge within a single location that is structured and easy to search.',
 "I'm trying to free g_strdup but I'm not sure what I'm doing wrong. ",
 'Using valgrind --tool=memcheck --leak-check=yes ./a.out I keep getting:',
 "I've tried different ways to freed but no success so far. I'll appreciate any help. Thanks ",
 'BTW: It compiles and runs fine.',
 'g_strdup(key) allocates memory, but nobody frees that memory.',
 'You should probably provide your own key_destroy_func to g_hash_table_new_full instead of using g_hash_table_new.',
 "Why to you g_strdup() every key you put into the hashtable? Are you required to do so? If GTK requires you to dup every char* key in the hashtable  I'd bet it frees theam when doing g_hash_table_destroy().",
 'Check GTK dokcumentation.',
 'Thanks for contributing an answer to Stack Overflow!',
 'But avoid …',
 'To lea

In [328]:
while(True):
    ip0 = 'stackoverflow'
    ip1 = 'com'
    ip2 = 'questions'
    ip3 = str(random.randint(0, 100000000))
    url2 = 'https://' + ip0 + '.' + ip1 + '/'+ ip2 + '/'+ ip3
    try:
        urlContent = urlopen(url2).read()
        if urlContent:
            break
    except Exception as e: 
        print(e)
        pass

print("Found URL: " + url)

HTTP Error 404: Not Found
HTTP Error 404: Not Found
Found URL: https://stackoverflow.com/questions/2439375


In [329]:
# I think that the results of find_all might be more relevant but we can discuss this
page1 = requests.get(url2)
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
sof_text2 = [txt.get_text() for txt in text]
sof_text2

['Find centralized, trusted content and collaborate around the technologies you use most.',
 'Teams',
 'Q&A for work',
 'Connect and share knowledge within a single location that is structured and easy to search.',
 'I have a page action.php on which I run an SQL query through the code, so that whenever the page is viewed the query runs like its like counting page views',
 'The problem is when the page is refreshed, the query is run & PAGE REFRESH is counted as a PAGE VIEW which I want to avoid.',
 '\xa0\xa0\xa0Question: How to avoid it ?',
 'What I am looking for is a simple solution so that I can check ',
 'I found this snippet here, and it worked perfectly for me:',
 'Best way to Detect Page Refresh. or Not ?(Ctrl+F5,F5,Ctrl+R, Enter) ',
 "You can't directly detect a page refresh, but you can use a cookie to simulate what you want:",
 'Depending on your requirements, you also need to decide when to remove the cookie and/or perform the action again.',
 'If you just want to run it onc

#### getting random CNA urls

https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup

In [330]:
url = 'https://www.channelnewsasia.com/'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
 
urls = []
for link in soup.find_all('a'):
    urls.append(link.get('href'))
urls

['#main-content',
 '/',
 '/profile/login',
 '/profile',
 '/profile/myfeed',
 '#',
 '/international',
 '/singapore',
 '/asia',
 '/world',
 '/commentary',
 '/sustainability',
 '/business',
 '/sport',
 '/coronavirus-covid-19',
 '/latest-news',
 '/discover',
 '/discover',
 '/cna-insider',
 '/cna-insider',
 '/watch',
 '/watch',
 '/watch/all',
 '/watch/programmes',
 '/watch/tv-schedule',
 '/listen',
 '/listen',
 '/listen/all',
 '/listen/cna938/schedule',
 '/special-reports',
 '/parliament',
 '/interactives',
 'https://cnalifestyle.channelnewsasia.com/',
 'https://cnalifestyle.channelnewsasia.com/entertainment',
 'https://cnalifestyle.channelnewsasia.com/women',
 'https://cnalifestyle.channelnewsasia.com/wellness',
 'https://cnalifestyle.channelnewsasia.com/living',
 'https://cnalifestyle.channelnewsasia.com/style',
 'https://cnalifestyle.channelnewsasia.com/dining',
 'https://cnalifestyle.channelnewsasia.com/travel',
 'https://cnaluxury.channelnewsasia.com/',
 'https://cnaluxury.channelnewsa

In [331]:
while(True):
    ip0 = 'https://www.channelnewsasia.com/'
    for url in urls:
        url2 = ip0 + url
        try:
            urlContent = urlopen(url2).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
            pass
    break

print("Found URL: " + url)

Found URL: #main-content


#### getting random hardwareZone urls

In [332]:
url = 'https://www.hardwarezone.com.sg/home'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
 
urls = []
for link in soup.find_all('a'):
    urls.append(link.get('href'))
random.shuffle(urls)
urls

['/product-guide/49271-biz-sme-tech/home',
 '/product-guide/197-storage/home',
 '/product-guide/217-mobile-phones/home',
 None,
 '/marketplace/index',
 '#',
 '/product-guide/154-notebooks/home',
 '/sitemap',
 '/home/page:2',
 None,
 '/product-guide/195-motherboard/home',
 '/product-guide/6582-automotive-tech/home',
 'https://www.hardwarezone.com.sg/tech-news-star-wars-knights-old-republic-remade-playstation-5-and-pc',
 '/product-guide/211-coolers/home',
 '/product-guide/209-media-streamers-and-hubs/home',
 '/reviews',
 '/product-guide/209-media-streamers-and-hubs/home',
 '/product-guide/215-televisions/home',
 '/product-guide/215-televisions/home',
 '/product-guide',
 'https://coupons.hardwarezone.com.sg/shopee-promo-code',
 None,
 '/tech-news-possible-announcement-date-google-pixel-6-revealed-new-ads',
 '/product-guide/263-printers/home',
 'https://www.hardwarezone.com.sg/tech-news-grand-theft-auto-5-gta-online-next-gen-console-upgrades-delayed-2022',
 'https://forums.hardwarezone.com

In [333]:
while(True):
    for url in urls:
        print(url)
        try:
            url = 'https://www.hardwarezone.com.sg' + url
            urlContent = urlopen(url).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
    break
        

print("Found URL: " + url)

/product-guide/49271-biz-sme-tech/home
Found URL: https://www.hardwarezone.com.sg/product-guide/49271-biz-sme-tech/home


In [334]:
page1 = requests.get(url)
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
hwz_text1 = [txt.get_text() for txt in text]
hwz_text1

['Trending Topics',
 'Our Sections',
 '\xa0',
 'AWARDS & ACCOLADES',
 'We thank all our supporters who have helped us garner these prestigious media awards in the industry!']

In [335]:
random.shuffle(urls)
while(True):
    for url in urls:
        print(url)
        try:
            url = 'https://www.hardwarezone.com.sg' + url
            urlContent = urlopen(url).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
    break
        

print("Found URL: " + url)

/product-guide/261-casings/home
Found URL: https://www.hardwarezone.com.sg/product-guide/261-casings/home


In [336]:
page1 = requests.get(url)
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
hwz_text2 = [txt.get_text() for txt in text]
hwz_text2

['Trending Topics',
 'Our Sections',
 '\xa0',
 'AWARDS & ACCOLADES',
 'We thank all our supporters who have helped us garner these prestigious media awards in the industry!']

#### First word in sentence capitalized?

##### Cleaning SOF text data

In [337]:
sof_text1 = sof_text1[4:]
sof_text1 = sof_text1[:-10]
sof_text1

["I'm trying to free g_strdup but I'm not sure what I'm doing wrong. ",
 'Using valgrind --tool=memcheck --leak-check=yes ./a.out I keep getting:',
 "I've tried different ways to freed but no success so far. I'll appreciate any help. Thanks ",
 'BTW: It compiles and runs fine.',
 'g_strdup(key) allocates memory, but nobody frees that memory.',
 'You should probably provide your own key_destroy_func to g_hash_table_new_full instead of using g_hash_table_new.',
 "Why to you g_strdup() every key you put into the hashtable? Are you required to do so? If GTK requires you to dup every char* key in the hashtable  I'd bet it frees theam when doing g_hash_table_destroy().",
 'Check GTK dokcumentation.']

In [338]:
sof_text2 = sof_text2[4:]
sof_text2 = sof_text2[:-10]
sof_text2

['I have a page action.php on which I run an SQL query through the code, so that whenever the page is viewed the query runs like its like counting page views',
 'The problem is when the page is refreshed, the query is run & PAGE REFRESH is counted as a PAGE VIEW which I want to avoid.',
 '\xa0\xa0\xa0Question: How to avoid it ?',
 'What I am looking for is a simple solution so that I can check ',
 'I found this snippet here, and it worked perfectly for me:',
 'Best way to Detect Page Refresh. or Not ?(Ctrl+F5,F5,Ctrl+R, Enter) ',
 "You can't directly detect a page refresh, but you can use a cookie to simulate what you want:",
 'Depending on your requirements, you also need to decide when to remove the cookie and/or perform the action again.',
 'If you just want to run it once for a user, I would set a session cookie and then use an if() statement.',
 'i have solved the problem ... HURRAHHH with no session & no cookies',
 'the solution is a combination of PHP : AJAX : JavaScript',
 'the