In [25]:
# Import the libraries needed
from bs4 import BeautifulSoup, SoupStrainer
from collections import Counter
# import httplib2
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd 
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
import urllib.request
from urllib.request import urlopen, Request
import random
import re
import requests

In [26]:
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [27]:
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

## 3.2 Dataset Analysis

### Tokenisation and Stemming

In [28]:
# get reviews for a random business 
random_business = reviews.sample()
random_business_id = random_business.iloc[0]['business_id']
small_business_dataset = reviews.loc[reviews['business_id'] == random_business_id]
small_business_dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
3120,18wW_ZjWCkmCpIGSYlym4A,B19N0vr_Ff7hb5AyPxW_Qw,Rii85bzYKGC9P0zOyAem6A,2,0,0,0,"While the waitress we had was great, the resta...",2017-06-06 21:50:57
3129,CWVLFI0EblAcikZK2tCnvA,ypKZT3cHMUKv15JfKmxnkQ,Rii85bzYKGC9P0zOyAem6A,4,1,1,1,Simple chain steakhouse. Not a knock on their ...,2017-03-27 12:26:18
3133,I-7XrxlGpagxjBn1AOK9Og,sFXfLbJjxVag-ZW1pzjpvg,Rii85bzYKGC9P0zOyAem6A,4,0,0,0,Very good steaks! Place is always busy but the...,2015-08-19 18:19:12
3155,WBTgTv6RD3ZZaKFUdan3pg,IgiigXHxvmpCcVwWA0kbHA,Rii85bzYKGC9P0zOyAem6A,5,0,0,0,Was told 15 to 25 minutes for a table. Was sea...,2017-07-08 22:26:28
3163,Tp-59xgWhoyPE1xe9iffgA,6tDfetjsLRNS_RzgN74Qkg,Rii85bzYKGC9P0zOyAem6A,3,0,0,0,This was an above average restaurant experienc...,2017-09-09 22:36:53


In [29]:
small_business_dataset_reviews = list(small_business_dataset['text'])

In [30]:
# convert the reviews into a concatenated string 
b1_review = ''.join(small_business_dataset_reviews)
clean_review = re.sub(r"[^A-Za-z0-9\s]+", "", b1_review)
b1_review = nlp(clean_review)

In [31]:
# removed punctuation and get the top 10 most common words (including stopwords)
b1_review_words = [token.text for token in b1_review if token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('the', 420), ('and', 394), ('a', 287), ('I', 249), ('to', 244), ('was', 231), ('for', 144), ('of', 134), ('is', 115), ('we', 114)]


In [32]:
# removed punctuation and get the top 10 most common words (excluding stopwords)
b1_review_words = [token.text for token in b1_review if token.is_stop != True and token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('nt', 75), ('steak', 72), ('food', 66), ('great', 61), ('good', 60), ('service', 48), ('place', 44), ('time', 38), ('got', 37), ('Texas', 35)]


In [33]:
#TODO: plot log graph


In [34]:
# now we do some stemming after removing the stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

porter_st = PorterStemmer()
lancaster_st = LancasterStemmer()
snow_st = SnowballStemmer("english")

In [35]:
# Using Porter Stemmer
porter_stemmed_words = [porter_st.stem(word) for word in b1_review_words]
porter_freq = Counter(porter_stemmed_words)
porter_common = porter_freq.most_common(10)
print(porter_common)

[('steak', 105), ('nt', 75), ('great', 71), ('food', 70), ('order', 62), ('good', 62), ('servic', 56), ('place', 50), ('time', 47), ('seat', 44)]


In [36]:
# Using Lancaster Stemmer
lancaster_stemmed_words = [lancaster_st.stem(word) for word in b1_review_words]
lancaster_freq = Counter(lancaster_stemmed_words)
lancaster_common = lancaster_freq.most_common(10)
print(lancaster_common)

[('steak', 105), ('serv', 97), ('nt', 75), ('gre', 71), ('food', 71), ('ord', 62), ('good', 62), ('plac', 51), ('wait', 50), ('tim', 47)]


In [37]:
# Using Snowball Stemmer
snow_stemmed_words = [snow_st.stem(word) for word in b1_review_words]
snow_freq = Counter(snow_stemmed_words)
snow_common = snow_freq.most_common(10)
print(snow_common)

[('steak', 105), ('nt', 75), ('great', 71), ('food', 70), ('order', 62), ('good', 62), ('servic', 56), ('place', 50), ('time', 47), ('seat', 44)]


### POS Tagging

In [38]:
random_sentences = reviews.sample(5, random_state=42)
random_sentences = list(random_sentences['text'])

In [39]:
random_sentences[0]

"Que ce soit pour leurs délicieux bubbles tea/smooties, leurs ''Bánh mì'' , leurs petits snacks (viennoiseries, tapioca, ...), on adore Vua et aussi leurs prix très abordables. On y retourne lorsqu'on est dans le Quartier Latin !"

In [40]:
nltk_tagged = []
for sentence in random_sentences:
    nltk_tagged.append((nltk.pos_tag(word_tokenize(sentence))))
nltk_tagged

[[('Que', 'NNP'),
  ('ce', 'NN'),
  ('soit', 'VBD'),
  ('pour', 'JJ'),
  ('leurs', 'NNS'),
  ('délicieux', 'VBP'),
  ('bubbles', 'NNS'),
  ('tea/smooties', 'NNS'),
  (',', ','),
  ('leurs', 'VBZ'),
  ('``', '``'),
  ('Bánh', 'NNP'),
  ('mì', 'NN'),
  ("''", "''"),
  (',', ','),
  ('leurs', 'VBZ'),
  ('petits', 'NNS'),
  ('snacks', 'NNS'),
  ('(', '('),
  ('viennoiseries', 'NNS'),
  (',', ','),
  ('tapioca', 'NN'),
  (',', ','),
  ('...', ':'),
  (')', ')'),
  (',', ','),
  ('on', 'IN'),
  ('adore', 'IN'),
  ('Vua', 'NNP'),
  ('et', 'CC'),
  ('aussi', 'JJ'),
  ('leurs', 'NNS'),
  ('prix', 'VBP'),
  ('très', 'JJ'),
  ('abordables', 'NNS'),
  ('.', '.'),
  ('On', 'IN'),
  ('y', 'JJ'),
  ('retourne', 'JJ'),
  ("lorsqu'on", 'NN'),
  ('est', 'JJS'),
  ('dans', 'NNS'),
  ('le', 'VBP'),
  ('Quartier', 'NNP'),
  ('Latin', 'NNP'),
  ('!', '.')],
 [('As', 'IN'),
  ('I', 'PRP'),
  ("'ve", 'VBP'),
  ('said', 'VBD'),
  ('previously', 'RB'),
  ('...', ':'),
  ('we', 'PRP'),
  ("'ve", 'VBP'),
  ('been

In [93]:
nlp = spacy.load("en_core_web_sm")
spacy_tagged = []
for sentence in random_sentences:
    spacy_tagged.append(nlp(sentence))
for tagged in spacy_tagged:
    for token in tagged:
        print(f'{token.text:{8}} {token.pos_:{6}}')

Que      PROPN , NNP
ce       PROPN , NNP
soit     ADJ   , JJ
pour     NOUN  , NN
leurs    VERB  , VBZ
délicieux NOUN  , NN
bubbles  NOUN  , NNS
tea      NOUN  , NN
/        SYM   , SYM
smooties NOUN  , NNS
,        PUNCT , ,
leurs    VERB  , VBZ
''       PUNCT , ''
Bánh     PROPN , NNP
mì       INTJ  , UH
''       PUNCT , ''
,        PUNCT , ,
leurs    NOUN  , NNS
petits   VERB  , VBZ
snacks   NOUN  , NNS
(        PUNCT , -LRB-
viennoiseries NOUN  , NNS
,        PUNCT , ,
tapioca  INTJ  , UH
,        PUNCT , ,
...      PUNCT , NFP
)        PUNCT , -RRB-
,        PUNCT , ,
on       ADP   , IN
adore    PROPN , NNP
Vua      PROPN , NNP
et       PROPN , NNP
aussi    PROPN , NNP
leurs    VERB  , VBZ
prix     NOUN  , NN
très     ADJ   , JJ
abordables NOUN  , NNS
.        PUNCT , .
On       ADP   , IN
y        PROPN , NNP
retourne VERB  , VBN
lorsqu'on PROPN , NNP
est      PROPN , NNP
dans     PROPN , NNPS
le       X     , FW
Quartier PROPN , NNP
Latin    PROPN , NNP
!        PUNCT , .
As   

# WORK COMPLETED UP TILL HERE.

### Writing Style

In [42]:
def getSOFurl():
    while(True):
        ip0 = 'stackoverflow'
        ip1 = 'com'
        ip2 = 'questions'
        ip3 = str(random.randint(0, 100000000))
        url = 'https://' + ip0 + '.' + ip1 + '/'+ ip2 + '/'+ ip3
        try:
            urlContent = urlopen(url).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
            pass

    print("Found URL: " + url)
    page1 = requests.get(url)
    soup1 = BeautifulSoup(page1.content, "html.parser")
    text = list(soup1.find_all("p"))
    sof_text = [txt.get_text() for txt in text]
   
    return sof_text


In [43]:
def getHWZurl():
    url = 'https://www.hardwarezone.com.sg/home'
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a'):
        urls.append(link.get('href'))
    random.shuffle(urls)
    while(True):
        for url in urls:
            print(url)
            try:
                url = 'https://www.hardwarezone.com.sg' + url
                urlContent = urlopen(url).read()
                page1 = requests.get(url)
                soup1 = BeautifulSoup(page1.content, "html.parser")
                text = list(soup1.find_all("p"))
                hwz_text = [txt.get_text() for txt in text]
                if urlContent and len(hwz_text)>10:
                    break
            except Exception as e: 
                print(e)
        break
    print("Found URL: " + url)
    return hwz_text

In [48]:
from urllib.parse import urlparse, urljoin
def getCNAurl():
    url = 'https://www.channelnewsasia.com/'
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    linked_urls = set()

    def get_all_links(url):
        urls = set()
        domain_name = urlparse(url).netloc
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if href in urls:
                # already in the set
                continue
            if domain_name not in href:
                # external link
                continue
            if len(href) > 60:
                urls.add(href)
        return urls
    
    urls_cna = get_all_links('https://www.channelnewsasia.com/')
    url = random.sample(urls_cna, 1)
    print("Found URL: ", url[0])
    
    page1 = requests.get(url[0])
    soup1 = BeautifulSoup(page1.content, "html.parser")
    text = list(soup1.find_all("p"))
    cna_text = [txt.get_text() for txt in text]
   
    return cna_text


#### Cleaning text data

In [45]:
sof_text1 = getSOFurl()
sof_text2 = getSOFurl()

Found URL: https://stackoverflow.com/questions/67452202
Found URL: https://stackoverflow.com/questions/43866776


In [46]:
hwz_text1 = getHWZurl()
hwz_text2 = getHWZurl()

javascript:openDkSideNav()
nonnumeric port: 'openDkSideNav()'
/tech-news-tom-hanks-apple-tv-plus-apocalyptic-finch
Found URL: https://www.hardwarezone.com.sg/tech-news-tom-hanks-apple-tv-plus-apocalyptic-finch
javascript:void(0)
nonnumeric port: 'void(0)'
/product-guide/211-coolers/home
/product-guide/404-internet-cloud-ecommerce/home
/product-guide/403-apps-software/home
/home
/feature-nia-dacostas-candyman-beautifully-presented-falls-just-short-perfect-score
Found URL: https://www.hardwarezone.com.sg/feature-nia-dacostas-candyman-beautifully-presented-falls-just-short-perfect-score


In [49]:
cna_text1 = getCNAurl()
cna_text2 = getCNAurl()

since Python 3.9 and will be removed in a subsequent version.
  url = random.sample(urls_cna, 1)


Found URL:  https://www.channelnewsasia.com/business/exclusive-india-likely-block-chinese-investment-insurance-giant-lics-ipo-sources-2194451
Found URL:  https://www.channelnewsasia.com/business/eu-court-backs-altices-million-euro-fine-gun-jumping-merger-deal-2194441


##### Cleaning SOF text data

In [50]:
sof_text1 = sof_text1[4:]
sof_text1 = sof_text1[:-10]
temp_list = []
for line in sof_text1:
    temp_list += sent_tokenize(line)
sof_text1 = temp_list

In [51]:
sof_text1

['I have a method which returns pointer of one of three relative classes:',
 'And I need to create an object exactly of the class, which class` pointer method returns',
 'I tryed the line typeid(root->search_by_name(process_object)) myobject;But it`s obviously silly.',
 'Could you advise something?',
 "It's not possible to derive the static type of the polymorphic object so as to make a static declaration of type as in:",
 'There are a couple of things you could do.',
 'One is to put a clone() or create() style function in your polymorphic classes to return correctly typed dynamic objects.',
 'The other would be to manually query the possible types, which rather defeats the point of polymorphism:',
 'NOTE: Pointers used for exposition only, use std::unique_ptr etc.. in real code.']

In [52]:
sof_text2 = sof_text2[4:]
sof_text2 = sof_text2[:-10]
temp_list = []
for line in sof_text2:
    temp_list += sent_tokenize(line)
sof_text2 = temp_list

##### Cleaning CNA text data

In [53]:
cna_text1

['\n\n      Business\n  \n',
 '\n\n      Business\n  \n',
 'NEW DELHI :     New Delhi wants to block Chinese investors from buying shares in Indian insurance giant Life Insurance Corp (LIC) which is due to go public, four senior government officials and a banker told Reuters, underscoring tensions between the two nations.',
 "State-owned LIC is considered a strategic asset, commanding more than 60per cent of India's life insurance market with assets of more than US$500 billion. While the government is planning to allow foreign investors to participate in what is likely to be the country's biggest-ever IPO worth a potential US$12.2 billion, it is leery of Chinese ownership, the sources said.",
 'Political tensions between the countries rocketed last year after their soldiers clashed on the disputed Himalayan border and since then, India has sought to limit Chinese investment in sensitive companies and sectors, banned a raft of Chinese mobile apps and subjected imports of Chinese goods t

##### Cleaning HWZ text data

In [54]:
hwz_text1 = hwz_text1[:-5]
temp_list = []
for line in hwz_text1:
    temp_list += sent_tokenize(line)
hwz_text1 = temp_list

In [55]:
hwz_text2 = hwz_text2[:-5]
temp_list = []
for line in hwz_text2:
    temp_list += sent_tokenize(line)
hwz_text2 = temp_list

##### First word in sentence capitalized?

In [62]:
count=0
uppercount=0
for sent in sof_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for sof_text1: ", uppercount/count)

Fraction of first letter being capitalised for sof_text1:  1.0


['I have a method which returns pointer of one of three relative classes:',
 'And I need to create an object exactly of the class, which class` pointer method returns',
 'I tryed the line typeid(root->search_by_name(process_object)) myobject;But it`s obviously silly.',
 'Could you advise something?',
 "It's not possible to derive the static type of the polymorphic object so as to make a static declaration of type as in:",
 'There are a couple of things you could do.',
 'One is to put a clone() or create() style function in your polymorphic classes to return correctly typed dynamic objects.',
 'The other would be to manually query the possible types, which rather defeats the point of polymorphism:',
 'NOTE: Pointers used for exposition only, use std::unique_ptr etc.. in real code.']

In [63]:
count=0
uppercount=0
for sent in sof_text2:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for sof_text2: ", uppercount/count)

Fraction of first letter being capitalised for sof_text2:  0.6666666666666666


['I found on this site solution about move legend in fieldset from border to inside fieldset.',
 'http://codepen.io/vkjgr/pen/oFdBa When I want to do this i should use float left style on legend, but I should use clear both after legend ?',
 'This solution works on all browsers ?',
 'When I want clean I must use only clear both ?',
 'If you want to follow your own approach, it also works as shown below.',
 'Otherwise, width:100% on legend will also do.',
 'fieldset {\r\n  padding: 2em;\r\n}\r\n\r\nlegend {\r\n  float: left;\r\n}\r\n\r\n.clear {\r\n  clear: both;\r\n}\n<fieldset>\r\n  <legend>\r\n    Hello\r\n  </legend>\r\n  <div class="clear">\r\n\r\n  </div>\r\n  <p>\r\n    Lorem ipsum dolor sit amet, consectetur adipisicing elit.',
 'Neque sunt ad facilis fugiat perferendis et fugit quas, accusantium, quod atque provident natus facere animi sed accusamus qui doloribus illo nesciunt.',
 '</p>\r\n</fieldset>']

In [58]:
count=0
uppercount=0
for sent in hwz_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for hwz_text1: ", uppercount/count)

Fraction of first letter being capitalised for hwz_text1:  1.0


In [59]:
count=0
uppercount=0
for sent in hwz_text2:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for hwz_text2: ", uppercount/count)

Fraction of first letter being capitalised for hwz_text2:  1.0


In [60]:
count=0
uppercount=0
for sent in cna_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for cna_text1: ", uppercount/count)

IndexError: string index out of range

##### Length of articles?

In [61]:
print("No of sentences in sof_text1: ", len(sof_text1))
print("No of sentences in sof_text2: ", len(sof_text2))
print("No of sentences in hwz_text1: ", len(hwz_text1))
print("No of sentences in hwz_text2: ", len(hwz_text2))

No of sentences in sof_text1:  9
No of sentences in sof_text2:  9
No of sentences in hwz_text1:  9
No of sentences in hwz_text2:  50


##### Proper nouns capitalised?

In [64]:
tagged = []
uppercount = 0
count = 0
for sentence in sof_text1:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in sof_text1: ', uppercount/count)

Fraction of proper nouns capitalised in sof_text1:  0.0


In [65]:
tagged = []
uppercount = 0
count = 0
for sentence in sof_text2:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in sof_text2: ', uppercount/count)

Fraction of proper nouns capitalised in sof_text2:  0.1111111111111111


In [66]:
tagged = []
uppercount = 0
count = 0
for sentence in hwz_text1:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in hwz_text1: ', uppercount/count)

Fraction of proper nouns capitalised in hwz_text1:  1.0


In [67]:
tagged = []
uppercount = 0
count = 0
for sentence in hwz_text2:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in hwz_text2: ', uppercount/count)

Fraction of proper nouns capitalised in hwz_text2:  0.958904109589041


###### What kind of proper nouns used?

1. Stack Overflow

In [68]:
sof_tagged = []
sof_noun_dict = {}
for sentence in sof_text1:
    sof_tagged.append(nlp(sentence))
for sentence in sof_text2:
    sof_tagged.append(nlp(sentence))
for tagged in sof_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in sof_noun_dict.keys():
                sof_noun_dict[token.text] += 1
            else:
                sof_noun_dict[token.text] = 1

sof_noun_dict_sorted = sorted(sof_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in sof_noun_dict_sorted:
    print(i[0], i[1])

legend 6
method 2
pointer 2
classes 2
object 2
class 2
type 2
style 2
solution 2
fieldset 2
float 2
line 1
typeid(root->search_by_name(process_object 1
myobject;But 1
it`s 1
declaration 1
couple 1
things 1
clone 1
function 1
objects 1
types 1
point 1
polymorphism 1
NOTE 1
Pointers 1
exposition 1
std::unique_ptr 1
code 1
site 1
move 1
border 1
browsers 1
approach 1
width:100 1
% 1
padding 1
2em 1
Hello 1
div 1
class="clear 1
/div 1
p 1
Lorem 1
ipsum 1
dolor 1
amet 1
consectetur 1
adipisicing 1
elit 1
Neque 1
sunt 1
ad 1
facilis 1
fugiat 1
et 1
fugit 1
quas 1
accusantium 1
quod 1
atque 1
natus 1
facere 1
animi 1
accusamus 1
qui 1
doloribus 1
illo 1
nesciunt 1
/p 1


In [69]:
hwz_tagged = []
hwz_noun_dict = {}
for sentence in hwz_text1:
    hwz_tagged.append(nlp(sentence))
for sentence in hwz_text2:
    hwz_tagged.append(nlp(sentence))
for tagged in hwz_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in hwz_noun_dict.keys():
                hwz_noun_dict[token.text] += 1
            else:
                hwz_noun_dict[token.text] = 1

hwz_noun_dict_sorted = sorted(hwz_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in hwz_noun_dict_sorted:
    print(i[0], i[1])

Candyman 23
movie 16
film 9
Anthony 9
time 6
story 6
presentation 5
man 5
Finch 4
slasher 4
legend 4
hook 4
narrative 4
Brianna 4
exhibition 4
Tom 3
Goodyear 3
horror 3
DaCosta 3
injustice 3
day 3
hand 3
things 3
show 3
pacing 3
environments 3
Apple 2
content 2
Hanks 2
Jeff 2
death 2
trailer 2
world 2
Aftershock 2
custom 2
PC 2
curve 2
way 2
sequel 2
fact 2
flick 2
point 2
mirrors 2
gore 2
series 2
discrimination 2
artist 2
Cabrini 2
Green 2
art 2
ritual 2
origins 2
fans 2
background 2
twist 2
one 2
information 2
head 2
end 2
flow 2
example 2
Netflix 1
strategy 1
laser 1
award 1
titles 1
scale 1
service 1
Greyhound 1
Sony 1
year 1
robotics 1
engineer 1
Earth 1
surface 1
climate 1
flare 1
sun 1
companion 1
DIY 1
robot 1
heart 1
determination 1
caretaker 1
trio 1
journey 1
storm 1
safety 1
road 1
trip 1
challenges 1
humor 1
dangers 1
streaming 1
TV+ 1
November 1
record 1
films 1
prejudices 1
Nia 1
matter 1
component 1
view 1
hooks 1
theme 1
use 1
shadow 1
puppetry 1
audiences 1
concept 1

##### Good grammar?
1. Subject-verb agreement
2. Tense matching

#### Subject-verb agreement

In [None]:
doc = nlp("There are a lot of Korean dishes in the menu")
displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



In [85]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
def is_passive(sentence):
    doc = nlp(sentence)
    dict1 = {'DEP': 'nsubjpass'}
    dict2 = {'DEP': 'aux', 'OP': '*'}
    dict3 = {'DEP': 'auxpass'}
    dict4 = {'TAG': 'VBN'}
    passive_rule = [dict1, dict2, dict3, dict4]
    matcher.add("Passive", [passive_rule])
    matches = matcher(doc)
    if matches:
        return True
    else:
        return False

https://github.com/armsp/active_or_passive/blob/master/spacy_voices.py

### Short break to count the % of passive sentences in the text

In [91]:
sof_text1_passivecounts = 0
sof_text2_passivecounts = 0 

for sent in sof_text1:
    if is_passive(sent):
        sof_text1_passivecounts += 1
for sent in sof_text2:
    if is_passive(sent):
        sof_text2_passivecounts += 1

print("% of passive sentences in sof_text1: ", sof_text1_passivecounts/len(sof_text1))
print("% of passive sentences in sof_text2: ", sof_text2_passivecounts/len(sof_text2))

% of passive sentences in sof_text1:  0.0
% of passive sentences in sof_text2:  0.0


In [92]:
hwz_text1_passivecounts = 0
hwz_text2_passivecounts = 0 

for sent in hwz_text1:
    if is_passive(sent):
        hwz_text1_passivecounts += 1
for sent in hwz_text2:
    if is_passive(sent):
        hwz_text2_passivecounts += 1

print("% of passive sentences in hwz_text1: ", hwz_text1_passivecounts/len(hwz_text1))
print("% of passive sentences in hwz_text2: ", hwz_text2_passivecounts/len(hwz_text2))

% of passive sentences in hwz_text1:  0.1111111111111111
% of passive sentences in hwz_text2:  0.1


# Ok back to business

In [None]:
for sent in sof_text1:
    if not is_passive(sent):
        sent_tagged = nlp(sent)
        for word, tag in sent_tagged:
            if tag == 'NNS' or tag == 'AUX':
                
        
for sent in sof_text2:
    if is_passive(sent):
        

# HELP!

# So i learnt that this way is not good. i will attempt to try with spacy dependency tracker and then combine with nltk pos tagger to check if there is SVA. thx

# this is what i will do
# 1. choose 6 good urls (2 from each source) so that there can be comparisons
# 2. do the CNA preprocessing part
# 3. continue with the two parts for checking for good grammar (any more ideas?)
# 4. think n implement more things to analyse for writing style
