In [1]:
# Import the libraries needed
from bs4 import BeautifulSoup, SoupStrainer
from collections import Counter
import httplib2
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
import urllib.request
from urllib.request import urlopen, Request
import random
import re
import requests

In [2]:
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [3]:
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

## 3.2 Dataset Analysis

### Tokenisation and Stemming

In [4]:
# get reviews for a random business 
random_business = reviews.sample()
random_business_id = random_business.iloc[0]['business_id']
small_business_dataset = reviews.loc[reviews['business_id'] == random_business_id]
small_business_dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
8,dunRtl-WLvrM9ZMZUJnEHw,1BUt6Cv2oTDqwCNOvCSCZw,k99YNCx5KcQPR7oeTgAOzg,5,0,0,0,The food was amazing. The filet and lobster ta...,2014-10-11 05:26:02
19,7q5u8n0mA9QKbIFDs6rzKw,btnGLjGvnTuN23L8FS0JLg,k99YNCx5KcQPR7oeTgAOzg,5,0,0,0,This outback location really knows how to keep...,2016-06-23 00:17:39
34,emiaea9I23YW81xEsRrAvg,ugSIAOFv9JmtTXqaE9YYzg,k99YNCx5KcQPR7oeTgAOzg,1,0,1,0,The WORST wings I've ever eaten!! I cancelled...,2011-06-29 09:48:39
55,Fik8awi-NpTA0iTrRNSLVA,l2scGRLx8VDznaykohVveQ,k99YNCx5KcQPR7oeTgAOzg,4,0,0,0,Two separate experiences: Food Quality good (f...,2016-10-27 03:10:20
66,5BGS0eTMmOCOrrSWxkKRfA,4TY8dDlgoso_xdfVvLkW1w,k99YNCx5KcQPR7oeTgAOzg,3,2,0,2,Last night our experience at this Outback loca...,2013-05-29 20:24:51


In [5]:
small_business_dataset_reviews = list(small_business_dataset['text'])

In [6]:
# convert the reviews into a concatenated string 
b1_review = ''.join(small_business_dataset_reviews)
clean_review = re.sub(r"[^A-Za-z0-9\s]+", "", b1_review)
b1_review = nlp(clean_review)

In [7]:
# removed punctuation and get the top 10 most common words (including stopwords)
b1_review_words = [token.text for token in b1_review if token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('the', 464), ('and', 459), ('was', 330), ('to', 324), ('I', 313), ('a', 287), ('it', 167), ('for', 146), ('of', 122), ('that', 121)]


In [8]:
# removed punctuation and get the top 10 most common words (excluding stopwords)
b1_review_words = [token.text for token in b1_review if token.is_stop != True and token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('nt', 108), ('steak', 76), ('Outback', 73), ('food', 62), ('good', 55), ('ordered', 47), ('came', 46), ('service', 43), ('time', 32), ('order', 31)]


In [9]:
#TODO: plot log graph


In [10]:
# now we do some stemming after removing the stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

porter_st = PorterStemmer()
lancaster_st = LancasterStemmer()
snow_st = SnowballStemmer("english")

In [11]:
# Using Porter Stemmer
porter_stemmed_words = [porter_st.stem(word) for word in b1_review_words]
porter_freq = Counter(porter_stemmed_words)
porter_common = porter_freq.most_common(10)
print(porter_common)

[('nt', 108), ('steak', 97), ('outback', 90), ('order', 88), ('food', 65), ('good', 63), ('servic', 52), ('time', 48), ('came', 46), ('like', 40)]


In [12]:
# Using Lancaster Stemmer
lancaster_stemmed_words = [lancaster_st.stem(word) for word in b1_review_words]
lancaster_freq = Counter(lancaster_stemmed_words)
lancaster_common = lancaster_freq.most_common(10)
print(lancaster_common)

[('nt', 108), ('serv', 101), ('steak', 97), ('outback', 90), ('ord', 88), ('food', 65), ('good', 64), ('wait', 50), ('tim', 48), ('cam', 46)]


In [13]:
# Using Snowball Stemmer
snow_stemmed_words = [snow_st.stem(word) for word in b1_review_words]
snow_freq = Counter(snow_stemmed_words)
snow_common = snow_freq.most_common(10)
print(snow_common)

[('nt', 108), ('steak', 97), ('outback', 90), ('order', 88), ('food', 65), ('good', 63), ('servic', 52), ('time', 48), ('came', 46), ('like', 40)]


### POS Tagging

In [14]:
random_sentences = reviews.sample(5, random_state=42)
random_sentences = list(random_sentences['text'])

In [15]:
random_sentences[0]

"Que ce soit pour leurs délicieux bubbles tea/smooties, leurs ''Bánh mì'' , leurs petits snacks (viennoiseries, tapioca, ...), on adore Vua et aussi leurs prix très abordables. On y retourne lorsqu'on est dans le Quartier Latin !"

In [16]:
nltk_tagged = []
for sentence in random_sentences:
    nltk_tagged.append((nltk.pos_tag(word_tokenize(sentence))))
nltk_tagged

[[('Que', 'NNP'),
  ('ce', 'NN'),
  ('soit', 'VBD'),
  ('pour', 'JJ'),
  ('leurs', 'NNS'),
  ('délicieux', 'VBP'),
  ('bubbles', 'NNS'),
  ('tea/smooties', 'NNS'),
  (',', ','),
  ('leurs', 'VBZ'),
  ('``', '``'),
  ('Bánh', 'NNP'),
  ('mì', 'NN'),
  ("''", "''"),
  (',', ','),
  ('leurs', 'VBZ'),
  ('petits', 'NNS'),
  ('snacks', 'NNS'),
  ('(', '('),
  ('viennoiseries', 'NNS'),
  (',', ','),
  ('tapioca', 'NN'),
  (',', ','),
  ('...', ':'),
  (')', ')'),
  (',', ','),
  ('on', 'IN'),
  ('adore', 'IN'),
  ('Vua', 'NNP'),
  ('et', 'CC'),
  ('aussi', 'JJ'),
  ('leurs', 'NNS'),
  ('prix', 'VBP'),
  ('très', 'JJ'),
  ('abordables', 'NNS'),
  ('.', '.'),
  ('On', 'IN'),
  ('y', 'JJ'),
  ('retourne', 'JJ'),
  ("lorsqu'on", 'NN'),
  ('est', 'JJS'),
  ('dans', 'NNS'),
  ('le', 'VBP'),
  ('Quartier', 'NNP'),
  ('Latin', 'NNP'),
  ('!', '.')],
 [('As', 'IN'),
  ('I', 'PRP'),
  ("'ve", 'VBP'),
  ('said', 'VBD'),
  ('previously', 'RB'),
  ('...', ':'),
  ('we', 'PRP'),
  ("'ve", 'VBP'),
  ('been

In [17]:
nlp = spacy.load("en_core_web_sm")
spacy_tagged = []
for sentence in random_sentences:
    spacy_tagged.append(nlp(sentence))
for tagged in spacy_tagged:
    for token in tagged:
        print(f'{token.text:{8}} {token.pos_:{6}}')

Que      PROPN 
ce       PROPN 
soit     ADJ   
pour     NOUN  
leurs    VERB  
délicieux NOUN  
bubbles  NOUN  
tea      NOUN  
/        SYM   
smooties NOUN  
,        PUNCT 
leurs    VERB  
''       PUNCT 
Bánh     PROPN 
mì       INTJ  
''       PUNCT 
,        PUNCT 
leurs    NOUN  
petits   VERB  
snacks   NOUN  
(        PUNCT 
viennoiseries NOUN  
,        PUNCT 
tapioca  INTJ  
,        PUNCT 
...      PUNCT 
)        PUNCT 
,        PUNCT 
on       ADP   
adore    PROPN 
Vua      PROPN 
et       PROPN 
aussi    PROPN 
leurs    VERB  
prix     NOUN  
très     ADJ   
abordables NOUN  
.        PUNCT 
On       ADP   
y        PROPN 
retourne VERB  
lorsqu'on PROPN 
est      PROPN 
dans     PROPN 
le       X     
Quartier PROPN 
Latin    PROPN 
!        PUNCT 
As       ADP   
I        PRON  
've      AUX   
said     VERB  
previously ADV   
...      PUNCT 
we've    PROPN 
been     AUX   
coming   VERB  
to       ADP   
LMAH     PROPN 
for      ADP   
over     ADP   
10       NUM 

# WORK COMPLETED UP TILL HERE.

### Writing Style

#### Getting random stackOverflow urls

https://code.activestate.com/recipes/577389-random-url/

In [18]:
while(True):
    ip0 = 'stackoverflow'
    ip1 = 'com'
    ip2 = 'questions'
    ip3 = str(random.randint(0, 100000000))
    url = 'https://' + ip0 + '.' + ip1 + '/'+ ip2 + '/'+ ip3
    try:
        urlContent = urlopen(url).read()
        if urlContent:
            break
    except Exception as e: 
        print(e)
        pass

print("Found URL: " + url)

Found URL: https://stackoverflow.com/questions/36512415


In [19]:
# I think that the results of find_all might be more relevant but we can discuss this
page1 = requests.get(url)
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
sof_text1 = [txt.get_text() for txt in text]
sof_text1

['Find centralized, trusted content and collaborate around the technologies you use most.',
 'Teams',
 'Q&A for work',
 'Connect and share knowledge within a single location that is structured and easy to search.',
 'I am using Swift and I want to be able to load a UIViewController when I rotate to landscape, can anyone point me in the right direction?',
 "I Can't find anything online and a little bit confused by the documentation.",
 "Here's how I got it working:",
 'In AppDelegate.swift inside the didFinishLaunchingWithOptions  function I put:',
 'and then inside the AppDelegate class I put the following function: ',
 'Hope this helps anyone else!',
 'Thanks!',
 'According to the Apple docs:',
 "This method is called when the view controller's view's size is changed by its parent (i.e. for the root view controller when its window rotates or is resized).",
 'I need to detect rotation while using the camera with AVFoundation, and found that the didRotate (now deprecated) & willTransiti

In [20]:
while(True):
    ip0 = 'stackoverflow'
    ip1 = 'com'
    ip2 = 'questions'
    ip3 = str(random.randint(0, 100000000))
    url2 = 'https://' + ip0 + '.' + ip1 + '/'+ ip2 + '/'+ ip3
    try:
        urlContent = urlopen(url2).read()
        if urlContent:
            break
    except Exception as e: 
        print(e)
        pass

print("Found URL: " + url)

Found URL: https://stackoverflow.com/questions/36512415


In [21]:
# I think that the results of find_all might be more relevant but we can discuss this
page1 = requests.get(url2)
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
sof_text2 = [txt.get_text() for txt in text]
sof_text1

['Find centralized, trusted content and collaborate around the technologies you use most.',
 'Teams',
 'Q&A for work',
 'Connect and share knowledge within a single location that is structured and easy to search.',
 "I need add a new column (status) on my main table (Table 1) on my bd Postgres, to do this , I need to compare two columns from table 1 on table 2 , if the two columns exist , I need to copy the status from table 2 to the new column on table 1 , if not exist , e need to fill the status column with 'New'. Do youw know if it´s possible ? how ?",
 'Table 1',
 'Table 2',
 'Table 1 with a new status column',
 'Thanks',
 'You are describing a LEFT join of Table1 to Table2:',
 'Thanks for contributing an answer to Stack Overflow!',
 'But avoid …',
 'To learn more, see our tips on writing great answers.',
 'Required, but never shown',
 'Required, but never shown',
 '\r\n                                                By clicking “Post Your Answer”, you agree to our terms of service

#### getting random CNA urls

https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup

In [22]:
url = 'https://www.channelnewsasia.com/'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
 
urls = []
for link in soup.find_all('a'):
    urls.append(link.get('href'))
urls

[]

In [23]:
while(True):
    ip0 = 'https://www.channelnewsasia.com/'
    for url in urls:
        url2 = ip0 + url
        try:
            urlContent = urlopen(url2).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
            pass

print("Found URL: " + url)

KeyboardInterrupt: 

#### getting random hardwareZone urls

In [64]:
url = 'https://www.hardwarezone.com.sg/home'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
 
urls = []
for link in soup.find_all('a'):
    urls.append(link.get('href'))
random.shuffle(urls)
urls

['/tech-news-possible-announcement-date-google-pixel-6-revealed-new-ads',
 '/review-razer-blade-14-impressive-gaming-ultraportable',
 '/product-guide/196-graphics-cards/home',
 'https://coupons.hardwarezone.com.sg/',
 '/feature-hwz-favourite-apple-iphone-ios-15-features',
 '/tech-news-kevin-lynch-is-reportedly-leading-apple-car-project',
 'https://coupons.hardwarezone.com.sg/grab-food-voucher-code',
 'https://forums.hardwarezone.com.sg/forumdisplay.php?f=225',
 'https://www.hardwarezone.com.sg/tech-news-blackpink-lisa-first-single-album-lalisa-spatial-audio-apple-music',
 '/tech-news-vivo-x70-series-phones-get-global-launch-packing-zeiss-optics-and-5g-support',
 '/product-guide/263-printers/home',
 '/product-guide/211-coolers/home',
 '/tech-news-final-trailer-no-time-die-farewell-daniel-craig-james-bond',
 '/product-guide/154-notebooks/home',
 '/product-guide/47835-gaming-entertainment-culture/home',
 '/tech-news-purported-storage-and-colour-options-apple-iphone-13-revealed',
 '/tech-n

In [65]:
while(True):
    for url in urls:
        print(url)
        try:
            url = 'https://www.hardwarezone.com.sg' + url
            urlContent = urlopen(url).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
    break
        

print("Found URL: " + url)

/tech-news-possible-announcement-date-google-pixel-6-revealed-new-ads
Found URL: https://www.hardwarezone.com.sg/tech-news-possible-announcement-date-google-pixel-6-revealed-new-ads


In [66]:
page1 = requests.get(url)
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
hwz_text1 = [txt.get_text() for txt in text]
hwz_text1

['',
 "Google's latest teaser ads give us the first real-world look at the Pixel 6 and a possible hint of its announcement date.\xa0",
 'In the 30-second teaser video, Google shows us the design of the Pixel 6, its colour options, the new Material You of Android 12, and refreshed widgets. The company also teases the capabilities of its Tensor chipset with terms such as "learning", "evolving" and "adapting".',
 '',
 'Google also posted a teaser on Instagram which shows off the different Material You Android 12 widgets on the home screen of the Pixel 6. Android Police took a closer look at the clock widget and discovered that the date is stated "Tue, 19".\xa0',
 'If this date seems familiar, tipster Jon Prosser reported a few days ago that pre-orders for the Pixel 6 will start on 19 October with retail availability slated for 28 October. Prosser also shared the specs of the Pixel 6 and Pixel 6 Pro in July.',
 'Source: Google (1) (2) via Android Police',
 'Trending Topics',
 'Our Sections

In [72]:
random.shuffle(urls)
while(True):
    for url in urls:
        print(url)
        try:
            url = 'https://www.hardwarezone.com.sg' + url
            urlContent = urlopen(url).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
    break
        

print("Found URL: " + url)

/tech-news-iphone-14-under-display-face-id-titanium-body
Found URL: https://www.hardwarezone.com.sg/tech-news-iphone-14-under-display-face-id-titanium-body


In [73]:
page1 = requests.get(url)
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
hwz_text2 = [txt.get_text() for txt in text]
hwz_text2

['',
 "Apple hasn't even announced this year's iPhones\xa0but that isn't stopping the rumour mill from speculating features that we could see on next year's iPhones.",
 "According to noted Apple leaker Dan Prosser, next year's iPhones could feature a notch-less display made possible by an under-display Face ID system.",
 "Renders show next year's iPhones having a hole-punch camera – not unlike the systems currently employed by high-end Android phones.",
 '',
 "Additionally, Prosser also said that next year's models will be thicker and that the rear camera bump will be eliminated. He also said that he expects the phones to have titanium frames.\xa0",
 "Given that the iPhone 14 is still so far away, I'd say take these rumours with a huge pinch of salt.\xa0",
 "That said, given Apple's history, it's likely that we won't see a huge redesign this year and that'll only come next year.\xa0",
 'Apple is expected to announce this year\'s iPhones at next week\'s "California streaming" event.',
 