In [152]:
# Import the libraries needed
from bs4 import BeautifulSoup, SoupStrainer
from collections import Counter
# import httplib2
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd 
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
import urllib.request
from urllib.request import urlopen, Request
import random
import re
import requests

In [153]:
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [154]:
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

## 3.2 Dataset Analysis

### Tokenisation and Stemming

In [155]:
# get reviews for a random business 
random_business = reviews.sample()
random_business_id = random_business.iloc[0]['business_id']
small_business_dataset = reviews.loc[reviews['business_id'] == random_business_id]
small_business_dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
3101,kSylHWOYcVoo8UcLcYZFpg,gm3J6L35xYUMgwfrQOas9A,kAmNIcJpAZIZIp5uSMAYLA,1,18,3,3,BEWARE!!! DO NOT LIVE HERE!!! REPEAT DO NOT LI...,2015-08-06 20:48:47
3108,nrVucC_UZQgPDaAdW6f-_w,OoyQYSeYNyRVOmdO3tsxYA,kAmNIcJpAZIZIp5uSMAYLA,2,11,7,3,"PROS: Student living is the worst anywhere, bu...",2013-12-07 06:45:50
3127,-o1FmGFSfe6mbDTYk-PHag,0Qku9zA5ytwzH6dYt2cg8Q,kAmNIcJpAZIZIp5uSMAYLA,1,10,3,1,Rebel Place Apartments WAS a nice place to liv...,2015-07-03 23:24:05
3142,j3FmbADiyjW8AYqcnJG9Aw,Lb12DRCZgRuiKtlwC1-WAA,kAmNIcJpAZIZIp5uSMAYLA,1,2,0,0,"If I could give no stars, i would. the manager...",2018-09-03 05:28:38
3150,4rTk9J67QTbtfKOVQxMBNQ,piQb_aEq6aimVfZAXyqmuA,kAmNIcJpAZIZIp5uSMAYLA,3,0,0,0,I lived here a few years ago and it was okay. ...,2017-05-26 15:12:32


In [156]:
small_business_dataset_reviews = list(small_business_dataset['text'])

In [157]:
# convert the reviews into a concatenated string 
b1_review = ''.join(small_business_dataset_reviews)
clean_review = re.sub(r"[^A-Za-z0-9\s]+", "", b1_review)
b1_review = nlp(clean_review)

In [158]:
# removed punctuation and get the top 10 most common words (including stopwords)
b1_review_words = [token.text for token in b1_review if token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('the', 866), ('and', 678), ('to', 668), ('I', 530), ('a', 496), ('in', 382), ('is', 308), ('of', 307), ('you', 287), ('that', 284)]


In [159]:
# removed punctuation and get the top 10 most common words (excluding stopwords)
b1_review_words = [token.text for token in b1_review if token.is_stop != True and token.is_alpha == True] 
b1_word_freq = Counter(b1_review_words)
common_words = b1_word_freq.most_common(10)
print(common_words)

[('nt', 175), ('place', 151), ('apartment', 82), ('management', 79), ('room', 60), ('people', 58), ('Rebel', 56), ('moved', 54), ('office', 53), ('living', 51)]


In [160]:
#TODO: plot log graph


In [161]:
# now we do some stemming after removing the stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

porter_st = PorterStemmer()
lancaster_st = LancasterStemmer()
snow_st = SnowballStemmer("english")

In [162]:
# Using Porter Stemmer
porter_stemmed_words = [porter_st.stem(word) for word in b1_review_words]
porter_freq = Counter(porter_stemmed_words)
porter_common = porter_freq.most_common(10)
print(porter_common)

[('place', 208), ('nt', 175), ('live', 128), ('manag', 115), ('apart', 113), ('move', 94), ('rebel', 83), ('room', 76), ('time', 72), ('peopl', 67)]


In [163]:
# Using Lancaster Stemmer
lancaster_stemmed_words = [lancaster_st.stem(word) for word in b1_review_words]
lancaster_freq = Counter(lancaster_stemmed_words)
lancaster_common = lancaster_freq.most_common(10)
print(lancaster_common)

[('plac', 208), ('nt', 175), ('room', 132), ('liv', 129), ('man', 119), ('apart', 113), ('mov', 94), ('rebel', 83), ('tim', 72), ('peopl', 67)]


In [164]:
# Using Snowball Stemmer
snow_stemmed_words = [snow_st.stem(word) for word in b1_review_words]
snow_freq = Counter(snow_stemmed_words)
snow_common = snow_freq.most_common(10)
print(snow_common)

[('place', 208), ('nt', 175), ('live', 128), ('manag', 115), ('apart', 113), ('move', 94), ('rebel', 83), ('room', 76), ('time', 72), ('peopl', 67)]


### POS Tagging

In [165]:
random_sentences = reviews.sample(5, random_state=42)
random_sentences = list(random_sentences['text'])

In [166]:
random_sentences[0]

"Que ce soit pour leurs délicieux bubbles tea/smooties, leurs ''Bánh mì'' , leurs petits snacks (viennoiseries, tapioca, ...), on adore Vua et aussi leurs prix très abordables. On y retourne lorsqu'on est dans le Quartier Latin !"

In [167]:
nltk_tagged = []
for sentence in random_sentences:
    nltk_tagged.append((nltk.pos_tag(word_tokenize(sentence))))
nltk_tagged

[[('Que', 'NNP'),
  ('ce', 'NN'),
  ('soit', 'VBD'),
  ('pour', 'JJ'),
  ('leurs', 'NNS'),
  ('délicieux', 'VBP'),
  ('bubbles', 'NNS'),
  ('tea/smooties', 'NNS'),
  (',', ','),
  ('leurs', 'VBZ'),
  ('``', '``'),
  ('Bánh', 'NNP'),
  ('mì', 'NN'),
  ("''", "''"),
  (',', ','),
  ('leurs', 'VBZ'),
  ('petits', 'NNS'),
  ('snacks', 'NNS'),
  ('(', '('),
  ('viennoiseries', 'NNS'),
  (',', ','),
  ('tapioca', 'NN'),
  (',', ','),
  ('...', ':'),
  (')', ')'),
  (',', ','),
  ('on', 'IN'),
  ('adore', 'IN'),
  ('Vua', 'NNP'),
  ('et', 'CC'),
  ('aussi', 'JJ'),
  ('leurs', 'NNS'),
  ('prix', 'VBP'),
  ('très', 'JJ'),
  ('abordables', 'NNS'),
  ('.', '.'),
  ('On', 'IN'),
  ('y', 'JJ'),
  ('retourne', 'JJ'),
  ("lorsqu'on", 'NN'),
  ('est', 'JJS'),
  ('dans', 'NNS'),
  ('le', 'VBP'),
  ('Quartier', 'NNP'),
  ('Latin', 'NNP'),
  ('!', '.')],
 [('As', 'IN'),
  ('I', 'PRP'),
  ("'ve", 'VBP'),
  ('said', 'VBD'),
  ('previously', 'RB'),
  ('...', ':'),
  ('we', 'PRP'),
  ("'ve", 'VBP'),
  ('been

In [168]:
nlp = spacy.load("en_core_web_sm")
spacy_tagged = []
for sentence in random_sentences:
    spacy_tagged.append(nlp(sentence))
for tagged in spacy_tagged:
    for token in tagged:
        print(f'{token.text:{8}} {token.pos_:{6}}')

Que      PROPN 
ce       PROPN 
soit     ADJ   
pour     NOUN  
leurs    VERB  
délicieux NOUN  
bubbles  NOUN  
tea      NOUN  
/        SYM   
smooties NOUN  
,        PUNCT 
leurs    VERB  
''       PUNCT 
Bánh     PROPN 
mì       INTJ  
''       PUNCT 
,        PUNCT 
leurs    NOUN  
petits   VERB  
snacks   NOUN  
(        PUNCT 
viennoiseries NOUN  
,        PUNCT 
tapioca  INTJ  
,        PUNCT 
...      PUNCT 
)        PUNCT 
,        PUNCT 
on       ADP   
adore    PROPN 
Vua      PROPN 
et       PROPN 
aussi    PROPN 
leurs    VERB  
prix     NOUN  
très     ADJ   
abordables NOUN  
.        PUNCT 
On       ADP   
y        PROPN 
retourne VERB  
lorsqu'on PROPN 
est      PROPN 
dans     PROPN 
le       X     
Quartier PROPN 
Latin    PROPN 
!        PUNCT 
As       ADP   
I        PRON  
've      AUX   
said     VERB  
previously ADV   
...      PUNCT 
we've    PROPN 
been     AUX   
coming   VERB  
to       ADP   
LMAH     PROPN 
for      ADP   
over     ADP   
10       NUM 

# WORK COMPLETED UP TILL HERE.

### Writing Style

In [169]:
def getSOFurl():
    while(True):
        ip0 = 'stackoverflow'
        ip1 = 'com'
        ip2 = 'questions'
        ip3 = str(random.randint(0, 100000000))
        url = 'https://' + ip0 + '.' + ip1 + '/'+ ip2 + '/'+ ip3
        try:
            urlContent = urlopen(url).read()
            if urlContent:
                break
        except Exception as e: 
            print(e)
            pass

    print("Found URL: " + url)
    page1 = requests.get(url)
    soup1 = BeautifulSoup(page1.content, "html.parser")
    text = list(soup1.find_all("p"))
    sof_text = [txt.get_text() for txt in text]
   
    return sof_text


In [170]:
def getHWZurl():
    url = 'https://www.hardwarezone.com.sg/home'
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    urls = []
    for link in soup.find_all('a'):
        urls.append(link.get('href'))
    random.shuffle(urls)
    while(True):
        for url in urls:
            print(url)
            try:
                url = 'https://www.hardwarezone.com.sg' + url
                urlContent = urlopen(url).read()
                page1 = requests.get(url)
                soup1 = BeautifulSoup(page1.content, "html.parser")
                text = list(soup1.find_all("p"))
                hwz_text = [txt.get_text() for txt in text]
                if urlContent and len(hwz_text)>10:
                    break
            except Exception as e: 
                print(e)
        break
    print("Found URL: " + url)
    return hwz_text

In [171]:
from urllib.parse import urlparse, urljoin
def getCNAurl():
    url = 'https://www.channelnewsasia.com/'
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    linked_urls = set()

    def get_all_links(url):
        urls = set()
        domain_name = urlparse(url).netloc
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if href in urls:
                # already in the set
                continue
            if domain_name not in href:
                # external link
                continue
            if len(href) > 60:
                urls.add(href)
        return urls
    
    urls_cna = get_all_links('https://www.channelnewsasia.com/')
    url = random.sample(urls_cna, 1)
    print("Found URL: ", url[0])
    
    page1 = requests.get(cna_text1)
    soup1 = BeautifulSoup(page1.content, "html.parser")
    text = list(soup1.find_all("p"))
    cna_text = [txt.get_text() for txt in text]
   
    return cna_text


#### Cleaning text data

In [172]:
sof_text1 = getSOFurl()
sof_text2 = getSOFurl()

HTTP Error 404: Not Found
Found URL: https://stackoverflow.com/questions/50282765
HTTP Error 404: Not Found
Found URL: https://stackoverflow.com/questions/15773218


In [173]:
hwz_text1 = getHWZurl()
hwz_text2 = getHWZurl()

https://forums.hardwarezone.com.sg/login/
<urlopen error [Errno 11001] getaddrinfo failed>
/tech-news/list
/feature-review-netflixs-squid-game-throws-likeable-characters-battle-royale
Found URL: https://www.hardwarezone.com.sg/feature-review-netflixs-squid-game-throws-likeable-characters-battle-royale
/product-guide/203-photography/home
https://forums.hardwarezone.com.sg/forumdisplay.php?f=23
<urlopen error [Errno 11001] getaddrinfo failed>
/feature-review-netflixs-squid-game-throws-likeable-characters-battle-royale
Found URL: https://www.hardwarezone.com.sg/feature-review-netflixs-squid-game-throws-likeable-characters-battle-royale


In [175]:
cna_text1 = getCNAurl()
cna_text2 = getCNAurl()

Found URL:  https://www.channelnewsasia.com/commentary/malaysia-covid-19-kita-jaga-kita-white-flag-help-politics-2179596


since Python 3.9 and will be removed in a subsequent version.
  url = random.sample(urls_cna, 1)


InvalidSchema: No connection adapters were found for '[\'\\n\\n      World\\n  \\n\', \'\\n\\n      World\\n  \\n\', \'Seventh grade student Ryza Delos Santos, 10, works on her modules at home as her cousin observes, after a session at the makeshift rickshaw distance learning centre for the Aeta community in Porac, Pampanga, Philippines, Oct 12, 2020. (File photo: REUTERS/Eloisa Lopez)\', "MANILA: The United Nations children\'s agency UNICEF has urged education authorities to reopen schools as soon as possible in countries where millions of students are still not allowed to return to classrooms 18 months into the COVID-19 pandemic.", \'Schools in around 17 countries remain fully closed, while those in 39 countries remain partially closed, according to a report released by UNICEF on Thursday (Sep 16).\', \'Among those "almost completely closed" are schools usually attended by nearly 77 million students in the Philippines, Bangladesh, Venezuela, Saudi Arabia, Panama and Kuwait.\', "Nearly a third of this figure is accounted for by the Philippines, which is fighting one of Asia\'s worst COVID-19 outbreaks and where a new school year started this week.", \'Pupils from the six countries represent more than half of the 131 million students worldwide that have missed more than three-quarters of their in-person learning, UNICEF said.\', \'"The education crisis is still here, and with each passing day that classrooms remain dark, the devastation worsens," said UNICEF Executive Director Henrietta Fore.\', \'The report said teachers should be prioritised for COVID-19 vaccines, after health workers and those most at risk, to protect them from community transmission.\', \'Students may be safer at home, but the availability of computers, mobile phones and Internet, and the uneven quality of education, are among challenges they continue to face.\', \'In the Philippines, some children have been forced to climb onto roofs just to get an Internet signal.\', \'In June, President Rodrigo Duterte rejected a proposal to allow face-to-face classes to resume in some areas, saying: "I cannot gamble on the health of the children."\', "In a report released in April, the Asian Development Bank estimated school closures lasting more than a year could slash future earnings among the region\'s students by as much as US$1.25 trillion, or equivalent to 5.4 per cent of GDP in 2020.", \'UNICEF and its partners will shut down their digital channels for 18 hours on Thursday to draw attention to the crisis and the "18 months of lost learning".\', \'"This is a crisis we will not allow the world to ignore," UNICEF\\\'s Fore said. "Our channels are silent, but our message is loud: Every community, everywhere must reopen schools as soon as possible."\', \'\\xa0\', "\\n      This service is not intended for persons residing in the E.U. By clicking subscribe, I agree to receive news updates and promotional material from Mediacorp and Mediacorp\'s partners. \\n  ", \' Copyright© Mediacorp 2021. Mediacorp Pte Ltd. All rights reserved. \', "We know it\'s a hassle to switch browsers but we want your experience with CNA to be fast, secure and the best it can possibly be.", \'To continue, upgrade to a supported browser or, for the finest experience, download the mobile app.\', \'Upgraded but still having issues? Contact us\']'

##### Cleaning SOF text data

In [176]:
sof_text1 = sof_text1[4:]
sof_text1 = sof_text1[:-10]
temp_list = []
for line in sof_text1:
    temp_list += sent_tokenize(line)
sof_text1 = temp_list

In [177]:
sof_text1

["I'm trying to make subplots in pandas of certain graphs and i keep getting this error: float() argument must be a string or a number, not 'AxesSubplot' any ideas on what fix could be used?",
 'Code preview here',
 "Well, according to exception traceback, you'r trying to convert graph_sex_and_wellbeing tofloat.",
 'And float in Python can accept either string or number argument.',
 'graph_sex_and_wellbeing has typeAxesSubplot.',
 'You need to get a string or number from this variable and pass it to a float.']

In [178]:
sof_text2 = sof_text2[4:]
sof_text2 = sof_text2[:-10]
temp_list = []
for line in sof_text2:
    temp_list += sent_tokenize(line)
sof_text2 = temp_list

##### Cleaning CNA text data

##### Cleaning HWZ text data

In [179]:
hwz_text1 = hwz_text1[:-5]
temp_list = []
for line in hwz_text1:
    temp_list += sent_tokenize(line)
hwz_text1 = temp_list

In [180]:
hwz_text2 = hwz_text2[:-5]
temp_list = []
for line in hwz_text2:
    temp_list += sent_tokenize(line)
hwz_text2 = temp_list

##### First word in sentence capitalized?

In [181]:
count=0
uppercount=0
for sent in sof_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for sof_text1: ", uppercount/count)

Fraction of first letter being capitalised for sof_text1:  0.8333333333333334


In [182]:
count=0
uppercount=0
for sent in sof_text2:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for sof_text2: ", uppercount/count)

Fraction of first letter being capitalised for sof_text2:  1.0


In [183]:
count=0
uppercount=0
for sent in hwz_text1:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for hwz_text1: ", uppercount/count)

Fraction of first letter being capitalised for hwz_text1:  0.9523809523809523


In [184]:
count=0
uppercount=0
for sent in hwz_text2:
    if sent[0].isupper():
        uppercount+=1
    count+=1
print("Fraction of first letter being capitalised for hwz_text2: ", uppercount/count)

Fraction of first letter being capitalised for hwz_text2:  0.9523809523809523


##### Length of articles?

In [185]:
print("No of sentences in sof_text1: ", len(sof_text1))
print("No of sentences in sof_text2: ", len(sof_text2))
print("No of sentences in hwz_text1: ", len(hwz_text1))
print("No of sentences in hwz_text2: ", len(hwz_text2))

No of sentences in sof_text1:  6
No of sentences in sof_text2:  11
No of sentences in hwz_text1:  42
No of sentences in hwz_text2:  42


##### Proper nouns capitalised?

In [186]:
tagged = []
uppercount = 0
count = 0
for sentence in sof_text1:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in sof_text1: ', uppercount/count)

Fraction of proper nouns capitalised in sof_text1:  1.0


In [187]:
tagged = []
uppercount = 0
count = 0
for sentence in sof_text2:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in sof_text2: ', uppercount/count)

Fraction of proper nouns capitalised in sof_text2:  0.0


In [188]:
tagged = []
uppercount = 0
count = 0
for sentence in hwz_text1:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in hwz_text1: ', uppercount/count)

Fraction of proper nouns capitalised in hwz_text1:  0.9230769230769231


In [189]:
tagged = []
uppercount = 0
count = 0
for sentence in hwz_text2:
    tagged.append(nlp(sentence))
for tag in tagged:
    for token in tag:
        if token.pos_ == 'PROPN':
            if token.text[0].isupper():
                uppercount += 1
            count += 1
print('Fraction of proper nouns capitalised in hwz_text2: ', uppercount/count)

Fraction of proper nouns capitalised in hwz_text2:  0.9230769230769231


###### What kind of proper nouns used?

1. Stack Overflow

In [190]:
sof_tagged = []
sof_noun_dict = {}
for sentence in sof_text1:
    sof_tagged.append(nlp(sentence))
for sentence in sof_text2:
    sof_tagged.append(nlp(sentence))
for tagged in sof_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in sof_noun_dict.keys():
                sof_noun_dict[token.text] += 1
            else:
                sof_noun_dict[token.text] = 1

sof_noun_dict_sorted = sorted(sof_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in sof_noun_dict_sorted:
    print(i[0], i[1])

form 5
string 3
number 3
line 3
float 2
argument 2
way 2
subplots 1
pandas 1
graphs 1
error 1
AxesSubplot 1
ideas 1
fix 1
Code 1
preview 1
exception 1
traceback 1
you'r 1
tofloat 1
Python 1
graph_sex_and_wellbeing 1
typeAxesSubplot 1
variable 1
icons 1
search 1
php 1
example 1
styles 1
style.css 1
searchform.php 1
element 1
display 1
inline 1
making 1


In [191]:
hwz_tagged = []
hwz_noun_dict = {}
for sentence in hwz_text1:
    hwz_tagged.append(nlp(sentence))
for sentence in hwz_text2:
    hwz_tagged.append(nlp(sentence))
for tagged in hwz_tagged:
    for token in tagged:
        if token.pos_ in ("PROPN", "NOUN"):
            if token.text in hwz_noun_dict.keys():
                hwz_noun_dict[token.text] += 1
            else:
                hwz_noun_dict[token.text] = 1

hwz_noun_dict_sorted = sorted(hwz_noun_dict.items(), key=lambda x: x[1], reverse=True)

for i in hwz_noun_dict_sorted:
    print(i[0], i[1])

Squid 20
Game 20
people 16
game 12
games 12
show 10
episodes 10
money 8
story 8
Netflix 6
premise 6
prize 6
time 6
drama 4
week 4
themes 4
characters 4
lives 4
location 4
episode 4
’s 4
Light 4
center 4
Gi 4
hun 4
cast 4
one 4
winner 4
person 4
Fall 2
Guys 2
Hwang 2
Dong 2
hyuk 2
Fortress 2
Miss 2
Granny 2
parade 2
Originals 2
production 2
design 2
setpieces 2
battle 2
royale 2
gauntlet 2
minigames 2
sum 2
subtlety 2
sledgehammer 2
thanks 2
choices 2
expectations 2
landing 2
centers 2
group 2
survival 2
cash 2
brink 2
need 2
unfold 2
pilot 2
tour 2
motivation 2
sneak 2
peek 2
brutality 2
participants 2
fun 2
air 2
sea 2
terms 2
participation 2
trailer 2
sense 2
unease 2
point 2
environments 2
numbers 2
humans 2
faceless 2
grunts 2
suits 2
guns 2
arena 2
field 2
doll 2
kid 2
Red 2
Green 2
bullets 2
winning 2
cost 2
chase 2
stakes 2
morbid 2
entertainment 2
thing 2
protagonist 2
gambling 2
habit 2
debt 2
loan 2
sharks 2
family 2
mother 2
daughter 2
heart 2
tale 2
humour 2
levity 2
situat

##### Good grammar?
1. Tense matching
2. Subject-verb agreement

#### Subject-verb agreement

In [196]:
# from https://github.com/pes10k/cs412-scorer/blob/master/agreement_utils.py
singular_noun_tags = ('NN', 'NNP')
plural_noun_tags = ('NNS', 'NNPS')

noun_tags = singular_noun_tags + plural_noun_tags + ('PRP',)

plural_verb_tags = ('VBZ',)
singular_verb_tags = ('VBP',)
general_verb_tags = ('VBD', 'VB')

verb_tags = singular_verb_tags + plural_verb_tags + general_verb_tags

plural_prop_nouns = ('they', 'we', 'them', 'themselves', 'us', 'those')
singular_prop_nouns = ('he', 'she', 'i', 'him', 'me', 'myself', 'it')

singulars = singular_noun_tags + singular_verb_tags + singular_prop_nouns
plurals = plural_noun_tags + plural_verb_tags + plural_prop_nouns

# HELP!

In [206]:
agreed = 0
disagreed = 0
agree_pairs = []
disagree_pairs = []
for sentence in sof_text1:
    sentence = sentence.lower().split()
    tagged = nltk.pos_tag(sentence)
    for index, word_tag_pair in enumerate(tagged):
        if word_tag_pair[1] in singulars:
            for j in range(index, len(tagged)):
                if tagged[j][1] in singulars:
                    agree_pairs.append((word_tag_pair[0], tagged[j][0]))
                    agreed += 1
                elif tagged[j][1] in plurals:
                    disagree_pairs.append((word_tag_pair[0], tagged[j][0]))
                    disagreed += 1
        elif word_tag_pair[1] in plurals:
            for j in range(index, len(tagged)):
                if tagged[j][1] in singulars:
                    disagree_pairs.append((word_tag_pair[0], tagged[j][0]))
                    disagreed += 1
                elif tagged[j][1] in plurals:
                    agree_pairs.append((word_tag_pair[0], tagged[j][0]))
                    agreed += 1
print(agree_pairs)
print(disagree_pairs)
print("SVA ratio in sof_text1: ", agreed/(agreed+disagreed))
                    
            

[("i'm", "i'm"), ("i'm", 'pandas'), ("i'm", 'graphs'), ("i'm", 'keep'), ("i'm", 'error:'), ("i'm", 'float()'), ("i'm", 'argument'), ("i'm", 'string'), ("i'm", 'fix'), ('subplots', 'subplots'), ('subplots', 'ideas'), ('pandas', 'pandas'), ('pandas', 'graphs'), ('pandas', 'keep'), ('pandas', 'error:'), ('pandas', 'float()'), ('pandas', 'argument'), ('pandas', 'string'), ('pandas', 'fix'), ('graphs', 'graphs'), ('graphs', 'keep'), ('graphs', 'error:'), ('graphs', 'float()'), ('graphs', 'argument'), ('graphs', 'string'), ('graphs', 'fix'), ('keep', 'keep'), ('keep', 'error:'), ('keep', 'float()'), ('keep', 'argument'), ('keep', 'string'), ('keep', 'fix'), ('error:', 'error:'), ('error:', 'float()'), ('error:', 'argument'), ('error:', 'string'), ('error:', 'fix'), ('float()', 'float()'), ('float()', 'argument'), ('float()', 'string'), ('float()', 'fix'), ('argument', 'argument'), ('argument', 'string'), ('argument', 'fix'), ('string', 'string'), ('string', 'fix'), ('ideas', 'ideas'), ('fix'