In [2]:
# Import the libraries needed
from bs4 import BeautifulSoup
from collections import Counter
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
from urllib.request import urlopen
import re
import requests

In [6]:
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [4]:
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

# 3.3 Extraction of Indicative Adjective Phrases

# Looking at the Data

I think instead of looking at the data and just blindly applying some ML techniques. The first thing that we can do is to consider three things namely:

- Length of the review
- Grammar
- References, particularly inter and intra sentences

## How to do?

Firstly, we need to come up with a pipeline in which i think we should do the following:

1. Cleaning of data
2. Sentence segmentation
3. Fixing the references.. hmm we can do either de-referencing or we can do some coreferencing based on the dependencies

In [5]:
adjective_phrases = [
    "some extremely sunny", 
    "covered in mud", 
    "beautifully clear",
    "full of wonder",
    "overly sensitive",
    "fairly intelligent",
    "sick of these tirades",
    "more talented than you",
    "better than you",
    "pleased with himself",
    "angry with the high prices"
]

In [6]:
doc = nlp("The beauty of this song is marvelously clear")
displacy.render(doc, style="dep")

In [7]:
doc = nlp("The beauty of this song is amazing and marvelously clear")
displacy.render(doc, style="dep")

In [8]:
doc = nlp("The beauty of this song is amazingly and marvelously clear")
displacy.render(doc, style="dep")

In [9]:
doc = nlp("I am tired of dealing with all of the mess that has been created by those people")
displacy.render(doc, style="dep")

In [10]:
doc = nlp("he is sick of all the changes to the weather")
displacy.render(doc, style="dep")

In [11]:
for token in doc:
    print(token.text, token.dep_)
    for child in token.children:
        print("Token", token.text, "Child", child.text, child.dep_)

he nsubj
is ROOT
Token is Child he nsubj
Token is Child sick acomp
sick acomp
Token sick Child of prep
of prep
Token of Child changes pobj
all predet
the det
changes pobj
Token changes Child all predet
Token changes Child the det
Token changes Child to prep
to prep
Token to Child weather pobj
the det
weather pobj
Token weather Child the det


In [12]:
for token in nlp("I am tired of dealing with all of the mess that has been created by those people"):
    print(token.text, token.dep_)
    for child in token.children:
        print("Token", token.text, "Child", child.text, child.dep_)

I nsubj
am ROOT
Token am Child I nsubj
Token am Child tired acomp
tired acomp
Token tired Child of prep
of prep
Token of Child dealing pcomp
dealing pcomp
Token dealing Child with prep
with prep
Token with Child all pobj
all pobj
Token all Child of prep
of prep
Token of Child mess pobj
the det
mess pobj
Token mess Child the det
Token mess Child created relcl
that nsubjpass
has aux
been auxpass
created relcl
Token created Child that nsubjpass
Token created Child has aux
Token created Child been auxpass
Token created Child by agent
by agent
Token by Child people pobj
those det
people pobj
Token people Child those det


From the results that we see above, we can see that these adjective phrases appear in a pattern of:
- ADV + ADJ
- ADJ + ADP + NOUN (the adjective was marked as a verb in the examples above... need to check)
- ADV + ADJ + DET/SCONJ/
in terms of their POS

So we can say that these adjective phrases have some rules of the form
- ADV/EPSILON + ADJ + ADP/EPSILON + DET/SCONJ/EPSILON + NOUN/PROPN/PRON/EPSILON

^^ however this would mean that a single adjective is an adjective phrase... need to think more

In terms of their dependencies,
- in the presence of an adverb, the adjective is the root and the adverb has an `advmod` dependency (since it modifies the adjective)
- since an adjective phrase is concerned with the noun insofar as it tells us more about the adjective, something like "sick cat" is not an adjective phrase
- thus, if a noun is included in an adjective phrase, there needs to be a preposition or a subordinating conjunction between the adjective and the noun
- in this case, the adjective will be the `root` and the preposition has a `prep` dependency, the noun will have a dependency of `pobj` with regards to the preposition or the subordinating conjunction

In [19]:
def get_adjective_phrases(text) -> list:

	def get_to_object(token) -> int:
        for child in token.children:
            if (child.pos_ in ['NOUN', 'PRON', 'PROPN']) and (child.dep_ == 'pobj'):
                return child.i
            else:
                return get_to_object(child)
			# if (child.pos_ in ['NOUN', 'PRON', 'PROPN']) and (child.dep_ == 'pobj') :
			# subtoken = list(child.children)
			# subtoken = [token for token in subtoken if token.dep_ == 'prep']
			# if subtoken:
			# return get_to_object(subtoken[0])
			# else:

    doc = nlp(text)
    phrases = []
    for token in doc:
        phrase = ''
        """
        An adjective has the POS - ADJ and it can be the root word in a sentence (rare),
        it is more often in the form of `acomp` or `amod`, we immediately add this to our phrase
        since this is the main constituent of the adjective phrase
        """
        if (token.pos_ == 'ADJ') and (token.dep_ in ['ROOT', 'acomp', 'amod']):
            phrase += token.text
            adjective_position = token.i
            for subtoken in token.children:
                # first rule: if there is an adverb that modifies the adjective
                # we add it to the phrase in front of the adjective
                if (subtoken.pos_ == 'ADV') and (subtoken.dep_ == 'advmod'):
                    phrase = subtoken.text + ' ' + phrase
                # second rule: if there is a preposition - indicating that
                # there is an object that gives us more info about the prep
                # we add all tokens up until the noun
                if (subtoken.pos_ in ['ADP', 'SCONJ']) and (subtoken.dep_ == 'prep'):
                    noun_position = get_to_object(subtoken)
                    for i in range(adjective_position + 1, noun_position + 1):
                        phrase += ' ' + doc[i].text
        # need to fix so that it does not get random adverbs and adjectives
        """elif (token.pos_ == 'AUX') and (token.dep_ == 'ROOT'):
            for subtoken in token.children:
                if subtoken.dep_ == 'advmod':
                    phrase += subtoken.text + ' '
                if subtoken.dep_ == 'acomp':
                    phrase += subtoken.text + ' '"""
        # since it is a phrase, it needs to have more than one word
        # i.e. a lone adjective does not constitute an adjective phrase
        if  len(phrase.split()) > 1:
            phrases.append(phrase)
    return phrases


def get_adjective_phrases(text) -> list:

	def get_to_object(token) -> int:
		for child in token.children:
			# if (child.pos_ in ['NOUN', 'PRON', 'PROPN']) and (child.dep_ == 'pobj') :
			# subtoken = list(child.children)
			# subtoken = [token for token in subtoken if token.dep_ == 'prep']
			# if subtoken:
			# return get_to_object(subtoken[0])
			# else:
			if (child.pos_ in ['NOUN', 'PRON', 'PROPN']) and (child.dep_ == 'pobj') :
				return child.i
			else:
				return get_to_object(child)

	doc = nlp(text)
	phrases = []
	for token in doc:
		phrase = ''
		"""
		An adjective has the POS - ADJ and it can be the root word in a sentence (rare),
		it is more often in the form of `acomp` or `amod`, we immediately add this to our phrase
		since this is the main constituent of the adjective phrase
		"""
		if (token.pos_ == 'ADJ') and (token.dep_ in ['ROOT', 'acomp', 'amod']):
			phrase += token.text
			adjective_position = token.i
			for subtoken in token.children:
				# first rule: if there is an adverb that modifies the adjective
				# we add it to the phrase in front of the adjective
				if (subtoken.pos_ == 'ADV') and (subtoken.dep_ == 'advmod'):
					phrase = subtoken.text + ' ' + phrase
				# second rule: if there is a preposition - indicating that
				# there is an object that gives us more info about the prep
				# we add all tokens up until the noun
				if (subtoken.pos_ in ['ADP', 'SCONJ']) and (subtoken.dep_ == 'prep'):
					noun_position = get_to_object(subtoken)
					for i in range(adjective_position + 1, noun_position + 1):
						phrase += ' ' + doc[i].text
		# need to fix so that it does not get random adverbs and adjectives
		"""elif (token.pos_ == 'AUX') and (token.dep_ == 'ROOT'):
			for subtoken in token.children:
				if subtoken.dep_ == 'advmod':
					phrase += subtoken.text + ' '
				if subtoken.dep_ == 'acomp':
					phrase += subtoken.text + ' '"""
		# since it is a phrase, it needs to have more than one word
		# i.e. a lone adjective does not constitute an adjective phrase
		if  len(phrase.split()) > 1:
			phrases.append(phrase)
	return phrases

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 16)

# Pair Ranking

In order to do the pair ranking, we can look the top most frequent pairs -> potential downside is that useless adjective phrases will be generated which does not actually tell us anything about the shop

TF-IDF: this is commonly used for text summarization but we can use it to rank too. IDF is how common a particular phrase is in the entire corpus. Thus, a high TF-IDF would mean that this is particular to this shop

LDA: which is used for topic modelling, it can generate topics -> may be tried to see if an adjective phrase is particular

## Clustering

Clustering may be an interesting thing that we can try, especially since I doubt the things we see above will give us good results.

To do clustering, we need a continuous representation of the phrase. This can be done by generating word embeddings (word2vec, GloVe, BERT) or any other methods

We can then do K-Means clustering (benchmark comparison to the discrete methods above)

We can also do other clustering algos e.g. Hierarchical


In [22]:
sentences = ["I guess it's classified as fast food since the whole process is similar to what you'd experience at subway but the taste is far beyond your typical tofu wrap."]
# need to find a way to handle the fourth case
adjective_phrases = [get_adjective_phrases(text) for text in sentences]
adjective_phrases
# according to this.. acquaintance of mind, it's fine to not extract long phrases since they won't be that common anyway

to
experience
what


TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

In [13]:
def link_children(token, positions, phrases):
    for child in token.children:
        if positions[child.i][1] == 'ADV' and positions[child.i + 1][1] == 'ADJ':
            adv = positions[child.i][0].text
            adj = positions[child.i+1][0].text
            phrases.append(adv + ' ' + adj)
    return phrases
doc = "this is an incredibly rare opportunity"
positions = {token.i: [token, token.pos_] for token in nlp(doc)}
phrases = []
for token in nlp(doc):
    print(positions[token.i][1])
    phrases = link_children(token, positions, phrases)
phrases

DET
AUX
DET
ADV
ADJ
NOUN


['incredibly rare']