In [147]:
# Import the libraries needed
from bs4 import BeautifulSoup
from collections import Counter
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
from urllib.request import urlopen
import re
import requests

In [148]:
# Shared spaCy pipeline used by every cell below (small English model).
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [149]:
# Read the review dataset as line-delimited JSON (one record per line),
# decoding the file as ISO-8859-1.
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

# 3.3 Extraction of Indicative Adjective Phrases

# Looking at the Data

Instead of looking at the data and just blindly applying some ML techniques, the first thing that we can do is to consider three things, namely:

- Length of the review
- Grammar
- References, particularly inter and intra sentences

## How to do?

Firstly, we need to come up with a pipeline, in which I think we should do the following:

1. Cleaning of data
2. Sentence segmentation
3. Fixing the references: we can either do de-referencing, or do some coreference resolution based on the dependencies

In [150]:
# Hand-picked example adjective phrases; the cells below parse them one by one
# to inspect the POS tags and dependency labels spaCy assigns.
adjective_phrases = [
    "some extremely sunny", 
    "covered in mud", 
    "beautifully clear",
    "full of wonder",
    "overly sensitive",
    "fairly intelligent",
    "sick of these tirades",
    "more talented than you",
    "better than you",
    "pleased with himself",
    "angry with the high prices"
]

In [151]:
# Draw the dependency parse of example 0 ("some extremely sunny").
doc = nlp(adjective_phrases[0])
displacy.render(doc, style="dep")

In [152]:
# Draw the dependency parse of example 1 ("covered in mud").
doc = nlp(adjective_phrases[1])
displacy.render(doc, style="dep")

In [153]:
# Draw the dependency parse of example 2 ("beautifully clear").
doc = nlp(adjective_phrases[2])
displacy.render(doc, style="dep")

In [154]:
# Draw the dependency parse of example 3 ("full of wonder").
doc = nlp(adjective_phrases[3])
displacy.render(doc, style="dep")

In [155]:
# Draw the dependency parse of example 4 ("overly sensitive").
doc = nlp(adjective_phrases[4])
displacy.render(doc, style="dep")

In [156]:
# Draw the dependency parse of example 5 ("fairly intelligent").
doc = nlp(adjective_phrases[5])
displacy.render(doc, style="dep")

In [157]:
# Draw the dependency parse of example 6 ("sick of these tirades").
doc = nlp(adjective_phrases[6])
displacy.render(doc, style="dep")

In [158]:
# Draw the dependency parse of example 7 ("more talented than you").
doc = nlp(adjective_phrases[7])
displacy.render(doc, style="dep")

In [159]:
# Draw the dependency parse of example 8 ("better than you").
doc = nlp(adjective_phrases[8])
displacy.render(doc, style="dep")

In [160]:
# Draw the dependency parse of "angry with the high prices", passed here as a
# literal rather than looked up in the example list.
doc = nlp("angry with the high prices")
displacy.render(doc, style="dep")

In [161]:
# Draw the dependency parse of a full sentence that ends on the adverb.
# NOTE(review): the sentence has no adjective after "marvelously" (the next
# cell uses "... marvelously clear") -- presumably intentional; confirm.
doc = nlp("The beauty of this song is marvelously")
displacy.render(doc, style="dep")

In [162]:
# Print each token of the sample sentence with its coarse POS tag and
# dependency label, then one line for every syntactic child of that token.
sample_doc = nlp("The beauty of this song is marvelously clear")
for token in sample_doc:
    print(token.text, token.pos_, token.dep_)
    for child in token.children:
        print("token:", token.text, " child: ", child)

The DET det
beauty NOUN nsubj
token: beauty  child:  The
token: beauty  child:  of
of ADP prep
token: of  child:  song
this DET det
song NOUN pobj
token: song  child:  this
is AUX ROOT
token: is  child:  beauty
token: is  child:  marvelously
token: is  child:  clear
marvelously ADV advmod
clear ADJ acomp


In [163]:
# Same dump for example 4 of the phrase list: text, POS tag and dependency
# label per token, followed by "<token> <child>" for each of its children.
phrase_doc = nlp(adjective_phrases[4])
for token in phrase_doc:
    print(token.text, token.pos_, token.dep_)
    for child in token.children:
        print(token, child)

overly ADV advmod
sensitive ADJ ROOT
sensitive overly


From the results that we see above, we can see that, in terms of their POS tags, these adjective phrases appear in one of the following patterns:
- ADV + ADJ
- ADJ + ADP + NOUN (the adjective was marked as a verb in the examples above... need to check)
- ADV + ADJ + DET/SCONJ

So we can say that these adjective phrases have some rules of the form
- ADV/EPSILON + ADJ + ADP/EPSILON + DET/SCONJ/EPSILON + NOUN/PROPN/PRON/EPSILON

^^ however this would mean that a single adjective is an adjective phrase... need to think more

In terms of their dependencies,
- in the presence of an adverb, the adjective is the root and the adverb has an `advmod` dependency (since it modifies the adjective)
- since an adjective phrase is concerned with the noun insofar as it tells us more about the adjective, something like "sick cat" is not an adjective phrase
- thus, if a noun is included in an adjective phrase, there needs to be a preposition or a subordinating conjunction between the adjective and the noun
- in this case, the adjective will be the `ROOT` and the preposition has a `prep` dependency; the noun will have a dependency of `pobj` with regard to the preposition or the subordinating conjunction

In [164]:
# Rule-based extraction of adjective phrases from a spaCy dependency parse.
# This is still a rough version: the walk from a preposition down to the
# object that closes the phrase is a fixed-depth search (see
# `_pobj_end_indices`), not yet a fully recursive one.

# POS tags accepted for the object that ends a prepositional complement.
_NOMINAL_POS = ('NOUN', 'PRON', 'PROPN')


def _pobj_end_indices(preposition):
    """Yield the doc index of every object that can close the phrase.

    Starting from a preposition attached to the adjective, the following
    shapes are recognised:

    * prep -> pobj (NOUN/PRON/PROPN): the object itself ends the phrase;
    * prep -> pobj (DET) -> pobj: the determiner's object ends the phrase;
    * prep -> pobj (DET) -> prep/pcomp -> pobj: the object of the nested
      preposition ends the phrase (e.g. "sick of all of this heat").
    """
    for child in preposition.children:
        if child.dep_ != 'pobj':
            continue
        if child.pos_ in _NOMINAL_POS:
            yield child.i
        elif child.pos_ == 'DET':
            for subchild in child.children:
                if subchild.dep_ == 'pobj':
                    yield subchild.i
                elif subchild.dep_ in ('prep', 'pcomp'):
                    for subsubchild in subchild.children:
                        if subsubchild.dep_ == 'pobj':
                            yield subsubchild.i


def get_adjective_phrases(text):
    """Return the multi-word adjective phrases found in *text*.

    An adjective (POS ``ADJ``) heads a phrase when its dependency label is
    ``ROOT``, ``acomp`` or ``amod``. Two rules extend it:

    1. every ``ADV`` child with an ``advmod`` dependency is included;
    2. for every ``ADP``/``SCONJ`` child with a ``prep`` dependency, all
       tokens from the one after the adjective up to the closing object
       (see ``_pobj_end_indices``) are included.

    Tokens are collected by index and emitted in document order, so a word
    is never repeated when several rules or several prepositions cover the
    same span, and adverbs keep their original position relative to the
    adjective.

    Args:
        text: the raw text to parse with the module-level ``nlp`` pipeline.

    Returns:
        A list of phrases (space-joined token texts), one per qualifying
        adjective, in the order the adjectives appear. A lone adjective is
        not a phrase and is left out.
    """
    doc = nlp(text)
    phrases = []
    for token in doc:
        if token.pos_ != 'ADJ' or token.dep_ not in ('ROOT', 'acomp', 'amod'):
            continue

        # The adjective is the main constituent of the phrase.
        indices = {token.i}
        for subtoken in token.children:
            # first rule: an adverb that modifies the adjective
            if subtoken.pos_ == 'ADV' and subtoken.dep_ == 'advmod':
                indices.add(subtoken.i)
            # second rule: a preposition introduces an object that tells us
            # more about the adjective; take every token up to that object
            if subtoken.pos_ in ('ADP', 'SCONJ') and subtoken.dep_ == 'prep':
                for end in _pobj_end_indices(subtoken):
                    indices.update(range(token.i + 1, end + 1))

        # TODO: adverbs attached to the copula instead of the adjective
        # (e.g. "is marvelously clear") are not picked up yet; handling them
        # must not pull in unrelated adverbs and adjectives.

        phrase = ' '.join(doc[i].text for i in sorted(indices))
        # since it is a phrase, it needs to have more than one word
        # i.e. a lone adjective does not constitute an adjective phrase
        if len(phrase.split()) > 1:
            phrases.append(phrase)
    return phrases

# Pair Ranking

In order to do the pair ranking, we can look at the most frequent pairs. A potential downside is that uninformative adjective phrases will be generated, which do not actually tell us anything about the shop.

TF-IDF: this is commonly used for text summarization, but we can use it to rank too. IDF (inverse document frequency) measures how rare a particular phrase is across the entire corpus: the more common the phrase, the lower its IDF. Thus, a high TF-IDF would mean that the phrase is particular to this shop.

LDA: this is used for topic modelling and can generate topics; it may be tried to see whether an adjective phrase is particular to a shop.

## Clustering

Clustering may be an interesting thing that we can try, especially since I doubt the things we see above will give us good results.

To do clustering, we need a continuous representation of the phrase. This can be done by generating word embeddings (word2vec, GloVe, BERT) or any other methods

We can then do K-Means clustering (benchmark comparison to the discrete methods above)

We can also do other clustering algos e.g. Hierarchical


In [167]:
# Quick check of the extractor on a handful of hand-written sentences.
sentences = [
    "Today is an extremely sunny day",
    "He is sick of all of this heat",
    "I am tired of Claudia",
    "The beauty of this song is marvelously clear",
    "He is angry with the high prices"
]
# need to find a way to handle the fourth case
# One result list per sentence, in the same order as `sentences`.
adjective_phrases = list(map(get_adjective_phrases, sentences))
adjective_phrases

[['extremely sunny'],
 ['sick of all of this heat'],
 ['tired of Claudia'],
 [],
 ['angry with the high prices']]