In [279]:
# Import the libraries needed
from bs4 import BeautifulSoup, SoupStrainer
from collections import Counter
# import httplib2
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd 
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
import urllib.request
from urllib.request import urlopen, Request
import random
import re
import requests

In [280]:
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [281]:
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

## 3.2 Dataset Analysis

### Tokenisation and Stemming

In [282]:
# get reviews for a random business 
random_business = reviews.sample()
random_business_id = random_business.iloc[0]['business_id']
small_business_dataset = reviews.loc[reviews['business_id'] == random_business_id]
small_business_dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
1805,GqwphH1Qi-8mJMEYAW_5vQ,X_526DZfpSriocKu9IJfWQ,AEx2SYEUJmTxVVB18LlCwA,4,0,0,0,"Small bologna and salami sandwich, toasted wit...",2013-12-28 20:36:26
1806,xd5cNOzx4jbWSgeIzJ_tVQ,vGohRKrqPgCJak3Jp6nGMA,AEx2SYEUJmTxVVB18LlCwA,4,2,0,2,"After a late breakfast at Faberge , decided to...",2014-07-28 02:10:34
1815,j7UP4zWrucADImEPVh4Brw,TYxVMGgyQR8SpXzdMZ_usQ,AEx2SYEUJmTxVVB18LlCwA,5,0,0,0,Classic place. Been around for 60+ years. Best...,2016-05-31 04:35:46
1818,XgC63yPcorK-M2KvxoytTw,0C_kTzETOV_1m-fSeYGuJQ,AEx2SYEUJmTxVVB18LlCwA,5,0,0,1,As the simple sandwich and egg creme soda goes...,2018-06-26 18:56:05
1820,MuYQSNxUFqlaLeR5w2TAUA,fpfidtaJqKlSeBASjJUsyQ,AEx2SYEUJmTxVVB18LlCwA,5,1,0,1,Damn this place is so great. The decor is very...,2017-06-17 19:08:10


In [283]:
small_business_dataset_reviews = list(small_business_dataset['text'])

In [284]:
# convert the reviews into a concatenated string 
b1_review = ''.join(small_business_dataset_reviews)
clean_review = re.sub(r"[^A-Za-z0-9\s]+", "", b1_review)
b1_review = nlp(clean_review)

In [285]:
# removed punctuation and get the top 10 most common words (including stopwords)
b1_review_words = [token.text for token in b1_review if token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('the', 394), ('and', 315), ('a', 281), ('is', 180), ('to', 175), ('I', 175), ('of', 154), ('in', 124), ('you', 122), ('it', 120)]


In [286]:
# removed punctuation and get the top 10 most common words (excluding stopwords)
b1_review_words = [token.text for token in b1_review if token.is_stop != True and token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('sandwich', 90), ('place', 84), ('nt', 77), ('mustard', 50), ('special', 41), ('like', 40), ('soda', 36), ('bologna', 35), ('cheese', 35), ('Montreal', 35)]


In [287]:
#TODO: plot log graph


In [288]:
# now we do some stemming after removing the stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

porter_st = PorterStemmer()
lancaster_st = LancasterStemmer()
snow_st = SnowballStemmer("english")

In [289]:
# Using Porter Stemmer
porter_stemmed_words = [porter_st.stem(word) for word in b1_review_words]
porter_freq = Counter(porter_stemmed_words)
porter_common = porter_freq.most_common(10)
print(porter_common)

[('sandwich', 108), ('place', 89), ('nt', 77), ('special', 76), ('wilenski', 64), ('mustard', 51), ('soda', 48), ('order', 45), ('de', 42), ('like', 41)]


In [290]:
# Using Lancaster Stemmer
lancaster_stemmed_words = [lancaster_st.stem(word) for word in b1_review_words]
lancaster_freq = Counter(lancaster_stemmed_words)
lancaster_common = lancaster_freq.most_common(10)
print(lancaster_common)

[('sandwich', 108), ('plac', 89), ('nt', 77), ('spec', 76), ('wilensky', 63), ('mustard', 51), ('ord', 45), ('mont', 43), ('bologn', 41), ('lik', 41)]


In [291]:
# Using Snowball Stemmer
snow_stemmed_words = [snow_st.stem(word) for word in b1_review_words]
snow_freq = Counter(snow_stemmed_words)
snow_common = snow_freq.most_common(10)
print(snow_common)

[('sandwich', 108), ('place', 89), ('nt', 77), ('special', 76), ('wilenski', 64), ('mustard', 51), ('soda', 48), ('order', 45), ('like', 41), ('bologna', 38)]


### POS Tagging

In [292]:
random_sentences = reviews.sample(5, random_state=42)
random_sentences = list(random_sentences['text'])

In [293]:
random_sentences[0]

"Que ce soit pour leurs délicieux bubbles tea/smooties, leurs ''Bánh mì'' , leurs petits snacks (viennoiseries, tapioca, ...), on adore Vua et aussi leurs prix très abordables. On y retourne lorsqu'on est dans le Quartier Latin !"

In [294]:
nltk_tagged = []
for sentence in random_sentences:
    nltk_tagged.append((nltk.pos_tag(word_tokenize(sentence))))
nltk_tagged

[[('Que', 'NNP'),
  ('ce', 'NN'),
  ('soit', 'VBD'),
  ('pour', 'JJ'),
  ('leurs', 'NNS'),
  ('délicieux', 'VBP'),
  ('bubbles', 'NNS'),
  ('tea/smooties', 'NNS'),
  (',', ','),
  ('leurs', 'VBZ'),
  ('``', '``'),
  ('Bánh', 'NNP'),
  ('mì', 'NN'),
  ("''", "''"),
  (',', ','),
  ('leurs', 'VBZ'),
  ('petits', 'NNS'),
  ('snacks', 'NNS'),
  ('(', '('),
  ('viennoiseries', 'NNS'),
  (',', ','),
  ('tapioca', 'NN'),
  (',', ','),
  ('...', ':'),
  (')', ')'),
  (',', ','),
  ('on', 'IN'),
  ('adore', 'IN'),
  ('Vua', 'NNP'),
  ('et', 'CC'),
  ('aussi', 'JJ'),
  ('leurs', 'NNS'),
  ('prix', 'VBP'),
  ('très', 'JJ'),
  ('abordables', 'NNS'),
  ('.', '.'),
  ('On', 'IN'),
  ('y', 'JJ'),
  ('retourne', 'JJ'),
  ("lorsqu'on", 'NN'),
  ('est', 'JJS'),
  ('dans', 'NNS'),
  ('le', 'VBP'),
  ('Quartier', 'NNP'),
  ('Latin', 'NNP'),
  ('!', '.')],
 [('As', 'IN'),
  ('I', 'PRP'),
  ("'ve", 'VBP'),
  ('said', 'VBD'),
  ('previously', 'RB'),
  ('...', ':'),
  ('we', 'PRP'),
  ("'ve", 'VBP'),
  ('been

In [295]:
nlp = spacy.load("en_core_web_sm")
spacy_tagged = []
for sentence in random_sentences:
    spacy_tagged.append(nlp(sentence))
for tagged in spacy_tagged:
    for token in tagged:
        print(f'{token.text:{8}} {token.pos_:{6}}')

Que      PROPN 
ce       PROPN 
soit     ADJ   
pour     NOUN  
leurs    VERB  
délicieux NOUN  
bubbles  NOUN  
tea      NOUN  
/        SYM   
smooties NOUN  
,        PUNCT 
leurs    VERB  
''       PUNCT 
Bánh     PROPN 
mì       INTJ  
''       PUNCT 
,        PUNCT 
leurs    NOUN  
petits   VERB  
snacks   NOUN  
(        PUNCT 
viennoiseries NOUN  
,        PUNCT 
tapioca  INTJ  
,        PUNCT 
...      PUNCT 
)        PUNCT 
,        PUNCT 
on       ADP   
adore    PROPN 
Vua      PROPN 
et       PROPN 
aussi    PROPN 
leurs    VERB  
prix     NOUN  
très     ADJ   
abordables NOUN  
.        PUNCT 
On       ADP   
y        PROPN 
retourne VERB  
lorsqu'on PROPN 
est      PROPN 
dans     PROPN 
le       X     
Quartier PROPN 
Latin    PROPN 
!        PUNCT 
As       ADP   
I        PRON  
've      AUX   
said     VERB  
previously ADV   
...      PUNCT 
we've    PROPN 
been     AUX   
coming   VERB  
to       ADP   
LMAH     PROPN 
for      ADP   
over     ADP   
10       NUM 

# WORK COMPLETED UP TILL HERE.

### Writing Style

#### Getting SOF article data (originally random, source code in get_urls.py)

In [409]:
page1 = requests.get('https://stackoverflow.com/questions/41306813')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
sof_text1 = [txt.get_text() for txt in text]

In [410]:
page1 = requests.get('https://stackoverflow.com/questions/16498749')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
sof_text2 = [txt.get_text() for txt in text]

#### Getting HWZ article data (originally random, source code in get_urls.py)

In [411]:
page1 = requests.get('https://www.hardwarezone.com.sg/tech-news-shang-chi-legend-ten-rings-available-streaming-disney-plus-day')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
hwz_text1 = [txt.get_text() for txt in text]

In [412]:
page1 = requests.get('https://www.hardwarezone.com.sg/feature-apple-iphone-13-iphone-13-pro-2021-review-singapore-price-specs')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
hwz_text2 = [txt.get_text() for txt in text]

#### Getting CNA article data (originally random, source code in get_urls.py)

In [413]:
page1 = requests.get('https://www.channelnewsasia.com/world/macron-tells-europe-stop-being-naive-after-france-signs-defence-deal-greece-2207351')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
cna_text1 = [txt.get_text() for txt in text]

In [414]:
page1 = requests.get('https://www.channelnewsasia.com/singapore/de-beers-wong-tian-jun-sugar-daddy-appeal-sentence-2199481')
soup1 = BeautifulSoup(page1.content, "html.parser")
text = list(soup1.find_all("p"))
cna_text2 = [txt.get_text() for txt in text]

##### Cleaning SOF text data

In [415]:
sof_text1 = sof_text1[4:]
sof_text1 = sof_text1[:-10]
temp_list = []
for line in sof_text1:
    temp_list += sent_tokenize(line)
sof_text1 = temp_list

In [416]:
sof_text1

['I am using a BlueImp Gallery to add lightboxes to my image gallery.',
 'So, when you click on an image thumbnail, it launches a lightbox with a larger version of the image etc.',
 'I also want to add in alt attribute to larger lightbox image, but I am having trouble making it work.',
 "It won't show the alt attributes that I have added in.",
 "Here's what I have so far;",
 'HTML:',
 'CSS:',
 'JS:',
 "And in my body (gallery.js is the file where I've added the above JS):",
 'And some fiddle about this',
 'http://jsfiddle.net/LXp76/70/',
 "Any pointers on where I've gone wrong would be much appreciated!",
 'My question is similar with this topic:',
 'Adding descriptions inside a blueimp gallery',
 'but not exactly the same, because i dont want to add description, i want to add "alt" attribute to img.']

In [417]:
sof_text2 = sof_text2[4:]
sof_text2 = sof_text2[:-10]
temp_list = []
for line in sof_text2:
    temp_list += sent_tokenize(line)
sof_text2 = temp_list

In [418]:
sof_text2

['I have several files for an app in image processing.',
 'As the number of rows and colums for an image does not change while doing some image processing algorithm I was trying to put those values in constant memory.',
 'My app looks like:',
 'Imageproc.cuh',
 'Imageproc.cu',
 'It compiles well but when trying to run the program I get invalid device symbol cudaMemcpyToSymbol(&c_rows, &rows, sizeof(int))',
 "Can't I put those variables in constant memory or what am I missing?",
 'If your symbol is declared like this:',
 'then the correct call to cudaMemcpyToSymbol is just']

##### Cleaning CNA text data

In [419]:
cna_text1 = cna_text1[2:]
cna_text1 = cna_text1[:-7]
temp_list = []
for line in cna_text1:
    temp_list += sent_tokenize(line)
cna_text1 = temp_list

In [420]:
cna_text1

['French President Emmanuel Macron gives a press conference with Greek Prime Minister during a signing ceremony of a new defence deal at The Elysee Palace in Paris, France September 28, 2021.',
 'Ludovic Marin/Pool via REUTERS',
 'PARIS: Europe needs to stop being naive when it comes to defending its interests and build its own military capacity, French President Emmanuel Macron said on Tuesday (Sep 28) after Greece sealed a deal for French frigates worth about €3 billion (US$3.51 billion).',
 'France was plunged into an unprecedented diplomatic crisis with the United States, Australia and Britain earlier this month over a trilateral nuclear security deal which sank a multi-billion dollar French-designed submarine contract with Canberra.',
 'That has caused much soul searching in Paris over its traditional alliances.',
 'Speaking for the first time on the issue, Macron on Tuesday seized the opportunity to urge for more European autonomy as Washington increasingly reorientates its inter

In [421]:
cna_text2 = cna_text2[2:]
cna_text2 = cna_text2[:-8]
temp_list = []
for line in cna_text2:
    temp_list += sent_tokenize(line)
cna_text2 = temp_list

In [422]:
cna_text2

[]

##### Cleaning HWZ text data

In [423]:
hwz_text1 = hwz_text1[:-5]
temp_list = []
for line in hwz_text1:
    temp_list += sent_tokenize(line)
hwz_text1 = temp_list

In [424]:
hwz_text1

['On 12 November, The Walt Disney Company will host Disney+ Day, a global celebration that will also see\xa0subscribers in Singapore being\xa0treated to new content releases across the service’s brands, Disney, Pixar, Marvel, Star Wars, National Geographic, and Star, along with a special presentation on Disney+ with sneak peeks into what’s to come.',
 'In a press release, Disney also revealed that subscribers will have access to promotions and experiences across the Company.',
 'For instance, Disney Parks and Resorts around the world and Disney Cruise Line will "roll out the blue carpet" for Disney+ fans with some surprise and delight moments including photo opportunities, character moments and more.',
 "Obviously, these won't be available to Singapore-based subscribers\xa0unless you happen to be in the US, Paris or Tokyo, where the theme parks and resorts are available.",
 'Disney, however, did say that more localised details will be shared in the coming weeks.',
 '“This day of apprec

In [425]:
hwz_text2 = hwz_text2[:-5]
temp_list = []
for line in hwz_text2:
    temp_list += sent_tokenize(line)
hwz_text2 = temp_list

In [426]:
hwz_text2

["Note: This review was first published on 21 September 2021 and it's republished now because the new iPhones are available in retail.",
 'Despite reservations about the name, Apple has stuck with what’s sensible and went with the number 13 for their latest iPhones.',
 'You might have already heard, Apple’s newest iPhones are the iPhone 13 and iPhone 13 Mini, and the iPhone 13 Pro and iPhone 13 Pro Max .',
 'No surprises, there.',
 'However, there are some profound changes compared to last year’s lineup and that could affect your purchasing decision.',
 'Sit down, grab a cuppa, this is a long one.',
 'Last year’s iPhones\xa0got\xa0a major redesign so we knew we weren’t going to be getting phones that look drastically different this year.',
 'The basic recipe for this year’s iPhones is nearly identical to last year’s models.',
 'In fact, unless you know what to look out for, you’d be hard-pressed to tell the difference between this year’s models and last year’s.',
 'They all have flat s

##### First word in sentence capitalized?

In [427]:
count=0
uppercount=0
for sent in sof_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for sof_text1: ", uppercount/count)

Fraction of first letter being capitalised for sof_text1:  0.8666666666666667


In [428]:
count=0
uppercount=0
for sent in sof_text2:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for sof_text2: ", uppercount/count)

Fraction of first letter being capitalised for sof_text2:  0.8888888888888888


In [433]:
count=0
uppercount=0
for sent in hwz_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for hwz_text1: ", uppercount/count)

Fraction of first letter being capitalised for hwz_text1:  0.75


In [430]:
count=0
uppercount=0
for sent in hwz_text2:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for hwz_text2: ", uppercount/count)

Fraction of first letter being capitalised for hwz_text2:  1.0


In [431]:
count=0
uppercount=0
for sent in cna_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for cna_text1: ", uppercount/count)

Fraction of first letter being capitalised for cna_text1:  0.75


In [432]:
count=0
uppercount=0
for sent in cna_text2:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for cna_text2: ", uppercount/count)

ZeroDivisionError: division by zero

##### Length of articles?

In [None]:
print("No of sentences in sof_text1: ", len(sof_text1))
print("No of sentences in sof_text2: ", len(sof_text2))
print("No of sentences in hwz_text1: ", len(hwz_text1))
print("No of sentences in hwz_text2: ", len(hwz_text2))
print("No of sentences in cna_text1: ", len(cna_text1))
print("No of sentences in cna_text2: ", len(cna_text2))

##### Proper nouns capitalised?

In [None]:
tagged = []
uppercount = 0
count = 0
for sentence in sof_text1:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in sof_text1: ', uppercount/count)

In [None]:
tagged = []
uppercount = 0
count = 0
for sentence in sof_text2:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in sof_text2: ', uppercount/count)

In [None]:
tagged = []
uppercount = 0
count = 0
for sentence in hwz_text1:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in hwz_text1: ', uppercount/count)

In [None]:
tagged = []
uppercount = 0
count = 0
for sentence in hwz_text2:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in hwz_text2: ', uppercount/count)

In [None]:
tagged = []
uppercount = 0
count = 0
for sentence in cna_text1:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in cna_text1: ', uppercount/count)

In [None]:
tagged = []
uppercount = 0
count = 0
for sentence in cna_text2:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in cna_text2: ', uppercount/count)

###### What kind of proper nouns used?

1. Stack Overflow

In [None]:
sof_tagged = []
sof_noun_dict = {}
for sentence in sof_text1:
    sof_tagged.append(nlp(sentence))
for sentence in sof_text2:
    sof_tagged.append(nlp(sentence))
for tagged in sof_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in sof_noun_dict.keys():
                sof_noun_dict[token.text] += 1
            else:
                sof_noun_dict[token.text] = 1

sof_noun_dict_sorted = sorted(sof_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in sof_noun_dict_sorted:
    print(i[0], i[1])

In [None]:
hwz_tagged = []
hwz_noun_dict = {}
for sentence in hwz_text1:
    hwz_tagged.append(nlp(sentence))
for sentence in hwz_text2:
    hwz_tagged.append(nlp(sentence))
for tagged in hwz_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in hwz_noun_dict.keys():
                hwz_noun_dict[token.text] += 1
            else:
                hwz_noun_dict[token.text] = 1

hwz_noun_dict_sorted = sorted(hwz_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in hwz_noun_dict_sorted:
    print(i[0], i[1])

In [None]:
cna_tagged = []
cna_noun_dict = {}
for sentence in cna_text1:
    cna_tagged.append(nlp(sentence))
for sentence in cna_text2:
    cna_tagged.append(nlp(sentence))
for tagged in cna_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in cna_noun_dict.keys():
                cna_noun_dict[token.text] += 1
            else:
                cna_noun_dict[token.text] = 1

cna_noun_dict_sorted = sorted(cna_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in cna_noun_dict_sorted:
    print(i[0], i[1])

##### Good grammar?
1. Subject-verb agreement
2. Tense matching

#### Subject-verb agreement

In [None]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
def is_passive(sentence):
    doc = nlp(sentence)
    dict1 = {'DEP': 'nsubjpass'}
    dict2 = {'DEP': 'aux', 'OP': '*'}
    dict3 = {'DEP': 'auxpass'}
    dict4 = {'TAG': 'VBN'}
    passive_rule = [dict1, dict2, dict3, dict4]
    matcher.add("Passive", [passive_rule])
    matches = matcher(doc)
    if matches:
        return True
    else:
        return False

https://github.com/armsp/active_or_passive/blob/master/spacy_voices.py

### Short break to count the % of passive sentences in the text

In [None]:
sof_text1_passivecounts = 0
sof_text2_passivecounts = 0 

for sent in sof_text1:
    if is_passive(sent):
        sof_text1_passivecounts += 1
for sent in sof_text2:
    if is_passive(sent):
        sof_text2_passivecounts += 1

print("% of passive sentences in sof_text1: ", sof_text1_passivecounts/len(sof_text1))
print("% of passive sentences in sof_text2: ", sof_text2_passivecounts/len(sof_text2))

In [None]:
hwz_text1_passivecounts = 0
hwz_text2_passivecounts = 0 

for sent in hwz_text1:
    if is_passive(sent):
        hwz_text1_passivecounts += 1
for sent in hwz_text2:
    if is_passive(sent):
        hwz_text2_passivecounts += 1

print("% of passive sentences in hwz_text1: ", hwz_text1_passivecounts/len(hwz_text1))
print("% of passive sentences in hwz_text2: ", hwz_text2_passivecounts/len(hwz_text2))

In [None]:
cna_text1_passivecounts = 0
cna_text2_passivecounts = 0 

for sent in cna_text1:
    if is_passive(sent):
        cna_text1_passivecounts += 1
for sent in cna_text2:
    if is_passive(sent):
        cna_text2_passivecounts += 1

print("% of passive sentences in cna_text1: ", cna_text1_passivecounts/len(cna_text1))
print("% of passive sentences in cna_text2: ", cna_text2_passivecounts/len(cna_text2))

# Ok back to business

# I'm gonna see the percentage of short sentences first

## 1. Avg length of sentences 

In [None]:
total_words = 0
for sent in sof_text1:
    total_words += len(sent)
print("Average length of sentences in sof_text1: ", total_words/len(sof_text1))

In [None]:
total_words = 0
for sent in sof_text2:
    total_words += len(sent)
print("Average length of sentences in sof_text2: ", total_words/len(sof_text2))

In [None]:
total_words = 0
for sent in hwz_text1:
    total_words += len(sent)
print("Average length of sentences in hwz_text1: ", total_words/len(hwz_text1))

In [None]:
total_words = 0
for sent in hwz_text2:
    total_words += len(sent)
print("Average length of sentences in hwz_text2: ", total_words/len(hwz_text2))

In [None]:
total_words = 0
for sent in cna_text1:
    total_words += len(sent)
print("Average length of sentences in cna_text1: ", total_words/len(cna_text1))

In [None]:
total_words = 0
for sent in cna_text2:
    total_words += len(sent)
print("Average length of sentences in cna_text2: ", total_words/len(cna_text2))

2. Short sentences in text

In [None]:
count = 0
for sent in sof_text1:
    if len(sent) < 20:
        count += 1
print("Count of short sentences in sof_text1: ", count)
print("Total no of sentences in sof_text1: ", len(sof_text1))

In [None]:
count = 0
for sent in sof_text2:
    if len(sent) < 20:
        count += 1
print("Count of short sentences in sof_text2: ", count)
print("Total no of sentences in sof_text2: ", len(sof_text2))

In [None]:
count = 0
for sent in hwz_text1:
    if len(sent) < 20:
        count += 1
print("Count of short sentences in hwz_text1: ", count)
print("Total no of sentences in hwz_text1: ", len(hwz_text1))

In [None]:
count = 0
for sent in hwz_text2:
    if len(sent) < 20:
        count += 1
print("Count of short sentences in hwz_text2: ", count)
print("Total no of sentences in hwz_text2: ", len(hwz_text2))

In [None]:
count = 0
for sent in cna_text1:
    if len(sent) < 20:
        count += 1
print("Count of short sentences in cna_text1: ", count)
print("Total no of sentences in cna_text1: ", len(cna_text1))
print(cna_text1)

In [None]:
count = 0
for sent in cna_text2:
    if len(sent) < 20:
        count += 1
print("Count of short sentences in cna_text2: ", count)
print("Total no of sentences in cna_text2: ", len(cna_text2))
print(cna_text2)

#### Trying this package (language_tool_python)

In [441]:
import language_tool_python
tool = language_tool_python.LanguageTool('en-US')

Downloading LanguageTool: 100%|█████████████████████████████████████████████████████| 203M/203M [00:22<00:00, 9.17MB/s]
Unzipping C:\Users\colot\AppData\Local\Temp\tmprlqli8ip.zip to C:\Users\colot\.cache\language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-5.4.zip to C:\Users\colot\.cache\language_tool_python.


In [446]:
matches_sof = []
for sent in sof_text1:
    matches_sof += tool.check(sent)
print(matches_sof)

[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['Blimp', 'Blueish'], 'offsetInContext': 13, 'context': 'I am using a BlueImp Gallery to add lightboxes to my image g...', 'offset': 13, 'errorLength': 7, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'I am using a BlueImp Gallery to add lightboxes to my image gallery.'}), Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['light boxes'], 'offsetInContext': 36, 'context': 'I am using a BlueImp Gallery to add lightboxes to my image gallery.', 'offset': 36, 'errorLength': 10, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'I am using a BlueImp Gallery to add lightboxes to my image gallery.'}), Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['light box'], 'offsetInContext': 43, 'context': '...ck on an image thumbnail, it launches a light

In [447]:
matches_hwz = []
for sent in hwz_text1:
    matches_hwz += tool.check(sent)
print(matches_hwz)

[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake. ‘localised’ is British English.', 'replacements': ['localized'], 'offsetInContext': 35, 'context': 'Disney, however, did say that more localised details will be shared in the coming we...', 'offset': 35, 'errorLength': 9, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'Disney, however, did say that more localised details will be shared in the coming weeks.'}), Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['tent pole'], 'offsetInContext': 43, 'context': '...storytelling, and will become an annual tentpole event to be amplified across our global...', 'offset': 196, 'errorLength': 8, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': '“This day of appreciation brings to life our mission to entertain, inform, and inspire fans and families around the globe through the power of unparalleled storytelling, and will become 

In [449]:
matches_cna = []
for sent in cna_text1:
    matches_cna += tool.check(sent)
print(matches_cna)

[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake. ‘defence’ is British English.', 'replacements': ['defense'], 'offsetInContext': 43, 'context': '...ster during a signing ceremony of a new defence deal at The Elysee Palace in Paris, Fra...', 'offset': 119, 'errorLength': 7, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'French President Emmanuel Macron gives a press conference with Greek Prime Minister during a signing ceremony of a new defence deal at The Elysee Palace in Paris, France September 28, 2021.'}), Match({'ruleId': 'EN_DIACRITICS_REPLACE', 'message': '‘Elysee’ is an imported foreign expression, which originally has a diacritic. Consider using “Élysée”', 'replacements': ['Élysée'], 'offsetInContext': 43, 'context': '...g ceremony of a new defence deal at The Elysee Palace in Paris, France September 28, 2...', 'offset': 139, 'errorLength': 6, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'French President E

# Formality

#### No of second-person pronoun 'you'

In [434]:
count = 0 
for sent in sof_text1:
    if 'you' in sent:
        count += 1
print(count)

1


In [436]:
count = 0 
for sent in sof_text2:
    if 'you' in sent:
        count += 1
print(count)

1


In [437]:
count = 0 
for sent in hwz_text1:
    if 'you' in sent:
        count += 1
print(count)

1


In [438]:
count = 0 
for sent in hwz_text2:
    if 'you' in sent:
        count += 1
print(count)

5


In [439]:
count = 0 
for sent in cna_text1:
    if 'you' in sent:
        count += 1
print(count)

0


In [440]:
count = 0 
for sent in cna_text2:
    if 'you' in sent:
        count += 1
print(count)

0


# So i learnt that this way is not good. i will attempt to try with spacy dependency tracker and then combine with nltk pos tagger to check if there is SVA. thx

# this is what i will do
# 3. continue with the two parts for checking for good grammar (any more ideas?)
# 4. think n implement more things to analyse for writing style
# Formality vs informality: ??


Findings:
1. Length of articles
    - SOF consistently the shortest, HWZ and CNA are higher by a big margin (+30 sentences on avg)
2. Proper nouns
    - SOF has the lowest counts of capitalisation for proper nouns (expected)
3. Kind of proper nouns used 
    - SOF always uses language from programming/coding domain
    - HWZ always writes about either telco/pop culture (movies?)
    - CNA 
4. Length of sentences (CHECK THIS!!)
    - SOF always has the shortest sentences
    - HWZ and CNA has much longer sentences (approx 100 words)
5. Counts of extremely short sentences
    - 