In [3]:
# Import the libraries needed
from bs4 import BeautifulSoup, SoupStrainer
from collections import Counter
# import httplib2
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd 
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
import urllib.request
from urllib.request import urlopen, Request
import random
import re
import requests

In [4]:
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [5]:
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

## 3.2 Dataset Analysis

### Tokenisation and Stemming

In [6]:
# get reviews for a random business 
random_business = reviews.sample()
random_business_id = random_business.iloc[0]['business_id']
small_business_dataset = reviews.loc[reviews['business_id'] == random_business_id]
small_business_dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
9308,ogULNfkkK9m0tsAf3uTpYg,EBLSMfHBAxg5wu-PHY9bdA,QxRKQ_Lzu4lMOejwVgcdAg,4,0,0,0,Their lounge fries are fantastic. Always get t...,2017-10-02 02:10:28
9316,dvGmTwBkKMHSfxUteJ1HMA,KeQpOId8uJhlbU_Jzv9Tew,QxRKQ_Lzu4lMOejwVgcdAg,5,0,0,0,My co-workers and I come here for lunch severa...,2017-01-14 21:26:31
9327,2Cii2hAnBi3_crTcRbLGNw,7nIzgNF7YzI-UAyYr3y9hw,QxRKQ_Lzu4lMOejwVgcdAg,4,0,0,0,NPL is a great restaurant and I go there weekl...,2013-09-04 04:02:47
9343,gciRG7KNEiAcNR5D9xYAGA,HdRdLgvD34CeCd7UAS5iEw,QxRKQ_Lzu4lMOejwVgcdAg,5,0,0,0,We have eaten here for years and really enjoy ...,2018-02-11 04:03:44
9352,TJlJkT5dJOofomrcvGDhVw,1wpILrjIBzJ5wUAa0_5TBQ,QxRKQ_Lzu4lMOejwVgcdAg,2,0,0,0,NPL is very close to home and I have come here...,2012-11-06 20:29:53


In [7]:
small_business_dataset_reviews = list(small_business_dataset['text'])

In [8]:
# convert the reviews into a concatenated string 
b1_review = ''.join(small_business_dataset_reviews)
clean_review = re.sub(r"[^A-Za-z0-9\s]+", "", b1_review)
b1_review = nlp(clean_review)

In [9]:
# removed punctuation and get the top 10 most common words (including stopwords)
b1_review_words = [token.text for token in b1_review if token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('the', 370), ('and', 356), ('a', 303), ('I', 206), ('to', 205), ('was', 184), ('of', 135), ('is', 133), ('for', 114), ('The', 99)]


In [10]:
# removed punctuation and get the top 10 most common words (excluding stopwords)
b1_review_words = [token.text for token in b1_review if token.is_stop != True and token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('food', 69), ('good', 68), ('nt', 68), ('place', 57), ('fries', 45), ('bar', 37), ('like', 33), ('great', 32), ('fish', 32), ('sandwich', 29)]


In [11]:
#TODO: plot log graph


In [12]:
# now we do some stemming after removing the stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

porter_st = PorterStemmer()
lancaster_st = LancasterStemmer()
snow_st = SnowballStemmer("english")

In [13]:
# Using Porter Stemmer
porter_stemmed_words = [porter_st.stem(word) for word in b1_review_words]
porter_freq = Counter(porter_stemmed_words)
porter_common = porter_freq.most_common(10)
print(porter_common)

[('food', 73), ('good', 72), ('nt', 68), ('place', 68), ('fri', 60), ('order', 49), ('sandwich', 43), ('great', 42), ('like', 41), ('bar', 39)]


In [14]:
# Using Lancaster Stemmer
lancaster_stemmed_words = [lancaster_st.stem(word) for word in b1_review_words]
lancaster_freq = Counter(lancaster_stemmed_words)
lancaster_common = lancaster_freq.most_common(10)
print(lancaster_common)

[('food', 73), ('good', 72), ('nt', 68), ('plac', 68), ('fri', 60), ('serv', 53), ('ord', 49), ('sandwich', 43), ('gre', 42), ('lik', 41)]


In [15]:
# Using Snowball Stemmer
snow_stemmed_words = [snow_st.stem(word) for word in b1_review_words]
snow_freq = Counter(snow_stemmed_words)
snow_common = snow_freq.most_common(10)
print(snow_common)

[('food', 73), ('good', 72), ('nt', 68), ('place', 68), ('fri', 60), ('order', 49), ('sandwich', 43), ('great', 42), ('like', 41), ('bar', 39)]


### POS Tagging

In [16]:
random_sentences = reviews.sample(5, random_state=42)
random_sentences = list(random_sentences['text'])

In [17]:
random_sentences[0]

"Que ce soit pour leurs délicieux bubbles tea/smooties, leurs ''Bánh mì'' , leurs petits snacks (viennoiseries, tapioca, ...), on adore Vua et aussi leurs prix très abordables. On y retourne lorsqu'on est dans le Quartier Latin !"

In [18]:
nltk_tagged = []
for sentence in random_sentences:
    nltk_tagged.append((nltk.pos_tag(word_tokenize(sentence))))
nltk_tagged

[[('Que', 'NNP'),
  ('ce', 'NN'),
  ('soit', 'VBD'),
  ('pour', 'JJ'),
  ('leurs', 'NNS'),
  ('délicieux', 'VBP'),
  ('bubbles', 'NNS'),
  ('tea/smooties', 'NNS'),
  (',', ','),
  ('leurs', 'VBZ'),
  ('``', '``'),
  ('Bánh', 'NNP'),
  ('mì', 'NN'),
  ("''", "''"),
  (',', ','),
  ('leurs', 'VBZ'),
  ('petits', 'NNS'),
  ('snacks', 'NNS'),
  ('(', '('),
  ('viennoiseries', 'NNS'),
  (',', ','),
  ('tapioca', 'NN'),
  (',', ','),
  ('...', ':'),
  (')', ')'),
  (',', ','),
  ('on', 'IN'),
  ('adore', 'IN'),
  ('Vua', 'NNP'),
  ('et', 'CC'),
  ('aussi', 'JJ'),
  ('leurs', 'NNS'),
  ('prix', 'VBP'),
  ('très', 'JJ'),
  ('abordables', 'NNS'),
  ('.', '.'),
  ('On', 'IN'),
  ('y', 'JJ'),
  ('retourne', 'JJ'),
  ("lorsqu'on", 'NN'),
  ('est', 'JJS'),
  ('dans', 'NNS'),
  ('le', 'VBP'),
  ('Quartier', 'NNP'),
  ('Latin', 'NNP'),
  ('!', '.')],
 [('As', 'IN'),
  ('I', 'PRP'),
  ("'ve", 'VBP'),
  ('said', 'VBD'),
  ('previously', 'RB'),
  ('...', ':'),
  ('we', 'PRP'),
  ("'ve", 'VBP'),
  ('been

In [19]:
nlp = spacy.load("en_core_web_sm")
spacy_tagged = []
for sentence in random_sentences:
    spacy_tagged.append(nlp(sentence))
for tagged in spacy_tagged:
    for token in tagged:
        print(f'{token.text:{8}} {token.pos_:{6}}')

Que      PROPN 
ce       PROPN 
soit     PROPN 
pour     PROPN 
leurs    VERB  
délicieux NOUN  
bubbles  NOUN  
tea      NOUN  
/        SYM   
smooties NOUN  
,        PUNCT 
leurs    NOUN  
'        PUNCT 
'        PUNCT 
Bánh     PROPN 
mì       PROPN 
'        PUNCT 
'        PUNCT 
,        PUNCT 
leurs    VERB  
petits   NOUN  
snacks   NOUN  
(        PUNCT 
viennoiseries NOUN  
,        PUNCT 
tapioca  NOUN  
,        PUNCT 
...      PUNCT 
)        PUNCT 
,        PUNCT 
on       ADP   
adore    PROPN 
Vua      PROPN 
et       PROPN 
aussi    PROPN 
leurs    PROPN 
prix     PROPN 
très     PROPN 
abordables NOUN  
.        PUNCT 
On       ADP   
y        PROPN 
retourne PROPN 
lorsqu'on PROPN 
est      PROPN 
dans     NOUN  
le       VERB  
Quartier PROPN 
Latin    PROPN 
!        PUNCT 
As       SCONJ 
I        PRON  
've      AUX   
said     VERB  
previously ADV   
...      PUNCT 
we've    PROPN 
been     AUX   
coming   VERB  
to       ADP   
LMAH     PROPN 
for      ADP 

# WORK COMPLETED UP TILL HERE.

### Writing Style

#### Getting SOF article data (originally random, source code in get_urls.py)

In [20]:
page1 = requests.get('https://stackoverflow.com/questions/5652693')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
sof_text1 = [txt.get_text() for txt in text]

In [21]:
page1 = requests.get('https://stackoverflow.com/questions/52832519')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
sof_text2 = [txt.get_text() for txt in text]

#### Getting HWZ article data (originally random, source code in get_urls.py)

In [22]:
page1 = requests.get('https://www.hardwarezone.com.sg/feature-apple-watchos-8-takes-first-serious-step-outdoor-cycling-fitness-workout')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
hwz_text1 = [txt.get_text() for txt in text]

In [23]:
page1 = requests.get('https://www.hardwarezone.com.sg/feature-apple-iphone-13-iphone-13-pro-2021-review-singapore-price-specs')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
hwz_text2 = [txt.get_text() for txt in text]

#### Getting CNA article data (originally random, source code in get_urls.py)

In [24]:
page1 = requests.get('https://www.channelnewsasia.com/singapore/de-beers-wong-tian-jun-sugar-daddy-appeal-sentence-2199481')
soup1 = BeautifulSoup(page1.content, 'html.parser')
text = list(soup1.find_all('p'))
print(text)
cna_text1 = [txt.get_text() for txt in text]
cna_text1

[]


[]

In [25]:
page1 = requests.get('https://www.channelnewsasia.com/singapore/de-beers-wong-tian-jun-sugar-daddy-appeal-sentence-2199481')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
cna_text2 = [txt.get_text() for txt in text]

##### Cleaning SOF text data

In [26]:
sof_text1 = sof_text1[4:]
sof_text1 = sof_text1[:-10]
temp_list = []
for line in sof_text1:
    temp_list += sent_tokenize(line)
sof_text1 = temp_list

In [27]:
sof_text1

["Let's say I have the XQuery code below:",
 'Can I use a counter, to count how many my code will enter inside the second for loop?',
 'I tried this:',
 'but I got compile errors.',
 'I also I need to sum some constraints like this:',
 "but of course this didn't work either :(.",
 'Any suggestions will be highly appreciated.',
 'Also, can you suggest a good book/tutorial/site for doing such stuff?',
 "You don't need it very often, but if you really do need a counter variable inside an XQuery for expression (analogous to position() in an xsl:for-each) you can use an at variable",
 "I don't think you have understood the basics of declarative paradigm.",
 'Total count and sum would be:',
 'Partial count and sum would be:',
 'To Display counter in loop the best and easier way is to add at $pos in your for loop.',
 '$pos will work as a counter.',
 'Code Snippet :',
 'Output:',
 "Here is another solution, if you don't want to count or act on every item of a collection, but you want to count/

In [28]:
sof_text2 = sof_text2[4:]
sof_text2 = sof_text2[:-10]
temp_list = []
for line in sof_text2:
    temp_list += sent_tokenize(line)
sof_text2 = temp_list

In [29]:
sof_text2

['In Oracle SQL Developer 3.1.07 The two subqueries below run correctly on their own.',
 "All I'm trying to do is union them together but Oracle gives an ORA-12704: charcter set mismatch error.",
 "I have checked similar posts to check the UNION syntax and I don't understand where the error is coming from.",
 'The select statements are selecting fields from the same columns in both queries so there should be no data type problems.',
 'Help would be greatly appreciated!',
 'Ok so credit to my manager Mr K who actually solved this.',
 'The problem was with the THIRDPARTY column which is the same data type in both subqueries.',
 'In order to solve the problem we used CAST (field AS VARCHAR(3)) in both subqueries.',
 'We identified the problem field by stripping all but the first field out of the queries and then adding them back in one by one until it broke.',
 "What we still don't understand is why this error occured so any information on this would still be apprieciated.",
 'Here is the

##### Cleaning CNA text data

In [30]:
cna_text1 = cna_text1[2:]
cna_text1 = cna_text1[:-7]
temp_list = []
for line in cna_text1:
    temp_list += sent_tokenize(line)
cna_text1 = temp_list

In [31]:
cna_text1

[]

In [32]:
cna_text2 = cna_text2[2:]
cna_text2 = cna_text2[:-8]
temp_list = []
for line in cna_text2:
    temp_list += sent_tokenize(line)
cna_text2 = temp_list

In [33]:
cna_text2

[]

##### Cleaning HWZ text data

In [34]:
hwz_text1 = hwz_text1[:-5]
temp_list = []
for line in hwz_text1:
    temp_list += sent_tokenize(line)
hwz_text1 = temp_list

In [35]:
hwz_text1

['When Apple revealed at its California Streaming event that its watchOS 8 update (available now) would bring with it some major quality of life upgrades for its Outdoor Cycle workout, cyclists like me sat up.',
 'Up until now, tracking features on the Apple watches for cycling have been pretty limited - functions like auto-pause and ride detection that most of us cyclists take for granted on our Garmin or Wahoo computers are absent.',
 'But watchOS 8 will change all of these and then some.',
 'That Apple showed and focused on a mountain biker during the presentation is a watershed moment for cyclists using an Apple Watch.',
 'It’s possibly the only major fitness segment that the company hasn’t quite dominate like it did with other mainstream fitness areas like jogging and yoga.',
 'With cycling facing its biggest “bike boom” during the pandemic (be prepared to wait for months if you’re looking to get a foldie or a roadbike now), it’s timely for Apple to jazz up the latest watchOS with

In [36]:
hwz_text2 = hwz_text2[:-5]
temp_list = []
for line in hwz_text2:
    temp_list += sent_tokenize(line)
hwz_text2 = temp_list

In [37]:
hwz_text2

["Note: This review was first published on 21 September 2021 and it's republished now because the new iPhones are available in retail.",
 'Despite reservations about the name, Apple has stuck with what’s sensible and went with the number 13 for their latest iPhones.',
 'You might have already heard, Apple’s newest iPhones are the iPhone 13 and iPhone 13 Mini, and the iPhone 13 Pro and iPhone 13 Pro Max .',
 'No surprises, there.',
 'However, there are some profound changes compared to last year’s lineup and that could affect your purchasing decision.',
 'Sit down, grab a cuppa, this is a long one.',
 'Last year’s iPhones\xa0got\xa0a major redesign so we knew we weren’t going to be getting phones that look drastically different this year.',
 'The basic recipe for this year’s iPhones is nearly identical to last year’s models.',
 'In fact, unless you know what to look out for, you’d be hard-pressed to tell the difference between this year’s models and last year’s.',
 'They all have flat s

##### First word in sentence capitalized?

In [140]:
def first_word_cap(text):
    count=0
    uppercount=0
    for sent in text:
        if sent[0].isupper():
            uppercount+=1
        count+=1
    return uppercount/count

In [144]:
print("Fraction of first letter being capitalised for sof_text1: ", first_word_cap(sof_text1))
print("Fraction of first letter being capitalised for sof_text1: ", first_word_cap(sof_text2))
print("Fraction of first letter being capitalised for hwz_text1: ", first_word_cap(hwz_text1))
print("Fraction of first letter being capitalised for hwz_text2: ", first_word_cap(hwz_text2))
print("Fraction of first letter being capitalised for cna_text1: ", first_word_cap(cna_text1))
print("Fraction of first letter being capitalised for cna_text2: ", first_word_cap(cna_text2))

Fraction of first letter being capitalised for sof_text1:  0.8095238095238095
Fraction of first letter being capitalised for sof_text1:  1.0


IndexError: string index out of range

##### Length of articles?

In [97]:
print("No of sentences in sof_text1: ", len(sof_text1))
print("No of sentences in sof_text2: ", len(sof_text2))
print("No of sentences in hwz_text1: ", len(hwz_text1))
print("No of sentences in hwz_text2: ", len(hwz_text2))
print("No of sentences in cna_text1: ", len(cna_text1))
print("No of sentences in cna_text2: ", len(cna_text2))

No of sentences in sof_text1:  15
No of sentences in sof_text2:  9
No of sentences in hwz_text1:  37
No of sentences in hwz_text2:  52
No of sentences in cna_text1:  0
No of sentences in cna_text2:  0


##### Proper nouns capitalised?

In [146]:
def prop_nouns_cap(text):
    tagged = []
    uppercount = 0
    count = 0
    for sentence in text:
        tagged.append(nlp(sentence))
    for tag in tagged:
        for token in tag:
            if token.pos_ == 'PROPN':
                if token.text[0].isupper():
                    uppercount += 1
                count += 1
    return uppercount/count

In [150]:
print('Fraction of proper nouns capitalised in sof_text1: ', prop_nouns_cap(sof_text1))
print('Fraction of proper nouns capitalised in sof_text2: ', prop_nouns_cap(sof_text2))
print('Fraction of proper nouns capitalised in hwz_text1: ', prop_nouns_cap(hwz_text1))
print('Fraction of proper nouns capitalised in hwz_text2: ', prop_nouns_cap(hwz_text2))
print('Fraction of proper nouns capitalised in cna_text1: ', prop_nouns_cap(cna_text1))
print('Fraction of proper nouns capitalised in cna_text2: ', prop_nouns_cap(cna_text2))

Fraction of proper nouns capitalised in sof_text1:  0.8888888888888888
Fraction of proper nouns capitalised in sof_text1:  0.8571428571428571
Fraction of proper nouns capitalised in sof_text1:  0.8513513513513513
Fraction of proper nouns capitalised in sof_text1:  0.7191011235955056


ZeroDivisionError: division by zero

###### What kind of proper nouns used?

1. Stack Overflow

In [151]:
sof_tagged = []
sof_noun_dict = {}
for sentence in sof_text1:
    sof_tagged.append(nlp(sentence))
for sentence in sof_text2:
    sof_tagged.append(nlp(sentence))
for tagged in sof_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in sof_noun_dict.keys():
                sof_noun_dict[token.text] += 1
            else:
                sof_noun_dict[token.text] = 1

sof_noun_dict_sorted = sorted(sof_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in sof_noun_dict_sorted:
    print(i[0], i[1])

XQuery 4
counter 4
code 3
loop 3
subqueries 3
error 3
problem 3
field 3
variable 2
expression 2
paradigm 2
count 2
sum 2
way 2
pos 2
solution 2
Oracle 2
queries 2
data 2
type 2
compile 1
errors 1
constraints 1
course 1
suggestions 1
book 1
tutorial 1
site 1
stuff 1
position 1
xsl 1
basics 1
Display 1
Code 1
Snippet 1
Output 1
item 1
collection 1
condition 1
case 1
programming 1
counting 1
function 1
calls 1
usage 1
XPath 1
SQL 1
Developer 1
3.1.07 1
run 1
charcter 1
set 1
mismatch 1
posts 1
UNION 1
syntax 1
statements 1
fields 1
columns 1
problems 1
Help 1
credit 1
manager 1
Mr 1
K 1
THIRDPARTY 1
column 1
order 1
CAST 1
information 1


In [152]:
hwz_tagged = []
hwz_noun_dict = {}
for sentence in hwz_text1:
    hwz_tagged.append(nlp(sentence))
for sentence in hwz_text2:
    hwz_tagged.append(nlp(sentence))
for tagged in hwz_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in hwz_noun_dict.keys():
                hwz_noun_dict[token.text] += 1
            else:
                hwz_noun_dict[token.text] = 1

hwz_noun_dict_sorted = sorted(hwz_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in hwz_noun_dict_sorted:
    print(i[0], i[1])

Apple 26
year 21
iPhone 16
Watch 12
Pro 12
iPhones 11
cycling 10
phones 9
watchOS 8
bike 8
cyclists 7
workout 6
detection 6
’s 6
fall 6
glass 6
Blue 6
Julz 5
Mini 5
Max 5
sides 5
update 4
call 4
impact 4
falls 4
models 4
Lightning 4
USB 4
colours 4
Sierra 4
time 3
watch 3
sensors 3
feature 3
motion 3
difference 3
port 3
C 3
Midnight 3
Pink 3
shade 3
Gold 3
finish 3
quality 2
Outdoor 2
features 2
functions 2
auto 2
pause 2
Garmin 2
Wahoo 2
fitness 2
company 2
video 2
people 2
computer 2
algorithms 2
data 2
heart 2
rate 2
traffic 2
watches 2
users 2
emergency 2
services 2
Series 2
cadence 2
power 2
customers 2
level 2
meter 2
feedback 2
article 2
team 2
Aftershock 2
custom 2
PC 2
curve 2
Topics 2
Sections 2
AWARDS 2
ACCOLADES 2
supporters 2
media 2
awards 2
industry 2
lineup 2
bodies 2
matte 2
backs 2
ports 2
point 2
thickness 2
battery 2
bit 2
cases 2
camera 2
notch 2
system 2
units 2
pink 2
process 2
model 2
California 1
Streaming 1
event 1
life 1
upgrades 1
Cycle 1
computers 1
mountai

In [153]:
cna_tagged = []
cna_noun_dict = {}
for sentence in cna_text1:
    cna_tagged.append(nlp(sentence))
for sentence in cna_text2:
    cna_tagged.append(nlp(sentence))
for tagged in cna_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in cna_noun_dict.keys():
                cna_noun_dict[token.text] += 1
            else:
                cna_noun_dict[token.text] = 1

cna_noun_dict_sorted = sorted(cna_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in cna_noun_dict_sorted:
    print(i[0], i[1])

##### Good grammar?
1. Subject-verb agreement
2. Tense matching

#### Subject-verb agreement

In [106]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
def is_passive(sentence):
    doc = nlp(sentence)
    dict1 = {'DEP': 'nsubjpass'}
    dict2 = {'DEP': 'aux', 'OP': '*'}
    dict3 = {'DEP': 'auxpass'}
    dict4 = {'TAG': 'VBN'}
    passive_rule = [dict1, dict2, dict3, dict4]
    matcher.add("Passive", [passive_rule])
    matches = matcher(doc)
    if matches:
        return True
    else:
        return False

https://github.com/armsp/active_or_passive/blob/master/spacy_voices.py

### Short break to count the % of passive sentences in the text

In [154]:
def passive_sents(text):
    passivecounts = 0
    for sent in text:
        if is_passive(sent):
            passivecounts += 1
    return passivecounts/len(text)

In [156]:
print("% of passive sentences in sof_text1: ", passive_sents(sof_text1))
print("% of passive sentences in sof_text2: ", passive_sents(sof_text2))
print("% of passive sentences in hwz_text1: ", passive_sents(hwz_text1))
print("% of passive sentences in hwz_text2: ", passive_sents(hwz_text2))
print("% of passive sentences in cna_text1: ", passive_sents(cna_text1))
print("% of passive sentences in cna_text2: ", passive_sents(cna_text2))

% of passive sentences in sof_text1:  0.0
% of passive sentences in sof_text2:  0.0
% of passive sentences in hwz_text1:  0.041666666666666664
% of passive sentences in hwz_text2:  0.125


ZeroDivisionError: division by zero

# Ok back to business

# I'm gonna see the percentage of short sentences first

## 1. Avg length of sentences 

In [157]:
def avg_sent_len(text):
    total_words = 0
    for sent in text:
        total_words += len(sent)
    return total_words/len(text)

In [159]:
print("Average length of sentences in sof_text1: ", avg_sent_len(sof_text1))
print("Average length of sentences in sof_text2: ", avg_sent_len(sof_text2))
print("Average length of sentences in hwz_text1: ", avg_sent_len(hwz_text1))
print("Average length of sentences in hwz_text2: ", avg_sent_len(hwz_text2))
print("Average length of sentences in cna_text1: ", avg_sent_len(cna_text1))
print("Average length of sentences in cna_text2: ", avg_sent_len(cna_text2))

Average length of sentences in sof_text1:  61.714285714285715
Average length of sentences in sof_text2:  87.54545454545455
Average length of sentences in hwz_text1:  233.45833333333334
Average length of sentences in hwz_text2:  168.625


ZeroDivisionError: division by zero

2. Short sentences in text

In [160]:
def num_short_sents(text):
    count = 0
    for sent in text:
        if len(sent) < 20:
            count += 1
    return count

In [163]:
print("Count of short sentences in sof_text1: ", num_short_sents(sof_text1), "out of", len(sof_text1), "sentences")
print("Count of short sentences in sof_text2: ", num_short_sents(sof_text2), "out of", len(sof_text2), "sentences")
print("Count of short sentences in hwz_text1: ", num_short_sents(hwz_text1), "out of", len(hwz_text1), "sentences")
print("Count of short sentences in hwz_text2: ", num_short_sents(hwz_text2), "out of", len(hwz_text2), "sentences")
print("Count of short sentences in cna_text1: ", num_short_sents(cna_text1), "out of", len(cna_text1), "sentences")
print("Count of short sentences in cna_text2: ", num_short_sents(cna_text2), "out of", len(cna_text2), "sentences")

Count of short sentences in sof_text1:  4 out of 21 sentences
Count of short sentences in sof_text2:  0 out of 11 sentences
Count of short sentences in hwz_text1:  7 out of 24 sentences
Count of short sentences in hwz_text2:  17 out of 32 sentences
Count of short sentences in cna_text1:  0 out of 0 sentences
Count of short sentences in cna_text2:  0 out of 0 sentences


#### Trying this package (language_tool_python)

In [121]:
import language_tool_python
tool = language_tool_python.LanguageTool('en-BR')

In [122]:
matches_sof = []
for sent in sof_text1:
    matches_sof += tool.check(sent)
print(matches_sof)

[Match({'ruleId': 'UPPERCASE_SENTENCE_START', 'message': 'This sentence does not start with an uppercase letter.', 'replacements': ['But'], 'offsetInContext': 0, 'context': 'but not exactly the same, because i dont wa...', 'offset': 0, 'errorLength': 3, 'category': 'CASING', 'ruleIssueType': 'typographical', 'sentence': 'but not exactly the same, because i dont want to add description, i want to add "alt" attribute to img.'}), Match({'ruleId': 'EN_CONTRACTION_SPELLING', 'message': 'Possible spelling mistake found', 'replacements': ["don't"], 'offsetInContext': 36, 'context': 'but not exactly the same, because i dont want to add description, i want to add ...', 'offset': 36, 'errorLength': 4, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'but not exactly the same, because i dont want to add description, i want to add "alt" attribute to img.'}), Match({'ruleId': 'I_LOWERCASE', 'message': 'The personal pronoun “I” should be uppercase.', 'replacements': ['I'], 'offsetInC

In [123]:
matches_hwz = []
for sent in hwz_text1:
    matches_hwz += tool.check(sent)
print(matches_hwz)

[Match({'ruleId': 'DASH_RULE', 'message': 'Consider using an m-dash if you do not want to join two words.', 'replacements': ['—'], 'offsetInContext': 43, 'context': '...es for cycling have been pretty limited - functions like auto-pause and ride dete...', 'offset': 90, 'errorLength': 1, 'category': 'PUNCTUATION', 'ruleIssueType': 'typographical', 'sentence': 'Up until now, tracking features on the Apple watches for cycling have been pretty limited - functions like auto-pause and ride detection that most of us cyclists take for granted on our Garmin or Wahoo computers are absent.'}), Match({'ruleId': 'DASH_RULE', 'message': 'Consider using an m-dash if you do not want to join two words.', 'replacements': ['—'], 'offsetInContext': 43, 'context': '...t analyse data from the watch’s sensors – the GPS, heart rate, accelerometer and ...', 'offset': 169, 'errorLength': 1, 'category': 'PUNCTUATION', 'ruleIssueType': 'typographical', 'sentence': "Now it can automatically detect when you begin c

In [None]:
matches_cna = []
for sent in cna_text1:
    matches_cna += tool.check(sent)
print(matches_cna)

# Formality

#### No of second-person pronoun 'you'

In [170]:
def num_sec_pronoun(text):
    count = 0 
    for sent in text:
        if 'you' in sent:
            count += 1 
    return count

In [172]:
print("Count of second person pronoun in sof_text1: ", num_sec_pronoun(sof_text1))
print("Count of second person pronoun in sof_text2: ", num_sec_pronoun(sof_text2))
print("Count of second person pronoun in hwz_text1: ", num_sec_pronoun(hwz_text1))
print("Count of second person pronoun in hwz_text2: ", num_sec_pronoun(hwz_text2))
print("Count of second person pronoun in cna_text1: ", num_sec_pronoun(cna_text1))
print("Count of second person pronoun in cna_text2: ", num_sec_pronoun(cna_text2))

Count of second person pronoun in sof_text1:  7
Count of second person pronoun in sof_text2:  0
Count of second person pronoun in hwz_text1:  8
Count of second person pronoun in hwz_text2:  5
Count of second person pronoun in cna_text1:  0
Count of second person pronoun in cna_text2:  0


# So i learnt that this way is not good. i will attempt to try with spacy dependency tracker and then combine with nltk pos tagger to check if there is SVA. thx

# this is what i will do
# 3. continue with the two parts for checking for good grammar (any more ideas?)
# 4. think n implement more things to analyse for writing style



Findings:
1. Length of articles
    - SOF consistently the shortest, HWZ and CNA are higher by a big margin (+30 sentences on avg)
2. Proper nouns
    - SOF has the lowest counts of capitalisation for proper nouns (expected)
3. Kind of proper nouns used 
    - SOF always uses language from programming/coding domain
    - HWZ always writes about either telco/pop culture (movies?)
    - CNA 
4. Length of sentences (CHECK THIS!!)
    - SOF always has the shortest sentences
    - HWZ and CNA has much longer sentences (approx 100 words)
5. Counts of extremely short sentences
    - 

In [47]:
# to get counts of extremely short sentences
THRESHOLD = 6
extremely_short = [sentences for sentences in hwz_text1 if len(word_tokenize(sentences)) < THRESHOLD]

len(extremely_short)

0