In [1]:
# Import the libraries needed
from bs4 import BeautifulSoup
from collections import Counter
import itertools
import matplotlib as plt
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import random
import spacy
from spacy import displacy
from spacy.lang.en import English
from urllib.request import urlopen
import re
import requests

In [2]:
# Load the spaCy pipeline "en_core_web_sm" once; every later cell parses text through `nlp`.
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [3]:
# lines=True: the file is parsed as one JSON object per line.
reviews = pd.read_json(
    '../data/reviewSelected100.json',
    lines=True,
    encoding='ISO-8859-1',
)

# 3.3 Extraction of Indicative Adjective Phrases

# Looking at the Data

Instead of just looking at the data and blindly applying some ML techniques, the first thing we can do is consider three things, namely:

- Length of the review
- Grammar
- References, particularly inter and intra sentences

## How to do?

First, we need to come up with a pipeline. I think it should do the following:

1. Cleaning of data
2. Sentence segmentation
3. Fixing the references: we can either do de-referencing or do some coreference resolution based on the dependencies

In [23]:
# Hand-picked sample adjective phrases used to study spaCy's POS tags and
# dependency labels. Later cells address them by position, so the index of
# each entry is noted next to it -- keep the order stable.
adjective_phrases = [
    "some extremely sunny",        # 0
    "covered in mud",              # 1
    "beautifully clear",           # 2
    "full of wonder",              # 3
    "overly sensitive",            # 4
    "fairly intelligent",          # 5
    "sick of these tirades",       # 6
    "more talented than you",      # 7
    "better than you",             # 8
    "pleased with himself",        # 9
    "angry with the high prices",  # 10
]

In [13]:
# Render the dependency parse of every sample phrase in one pass instead of
# one copy-pasted cell per index. `doc` is deliberately rebound on each
# iteration so that, after the loop, it still holds the parse of the last
# phrase -- the same state the per-index cells left behind.
for phrase in adjective_phrases:
    doc = nlp(phrase)
    displacy.render(doc, style="dep")

In [28]:
# Dump POS tag, dependency label and direct children for each token of
# sample 10. The phrase is parsed here explicitly so the cell does not depend
# on whichever `doc` an earlier cell happened to leave in the kernel.
for token in nlp(adjective_phrases[10]):
    print(token.text, token.pos_, token.dep_)
    for child in token.children:
        print("token:", token.text, " child: ", child)

angry ADJ ROOT
token: angry  child:  with
with ADP prep
token: with  child:  prices
the DET det
high ADJ amod
prices NOUN pobj
token: prices  child:  the
token: prices  child:  high


In [8]:
# For sample 4, print each token's text, POS tag and dependency label,
# then one "<head> <child>" line per direct child of that token.
for head in nlp(adjective_phrases[4]):
    print(head.text, head.pos_, head.dep_)
    for dependent in head.children:
        print(head, dependent)

overly ADV advmod
sensitive ADJ ROOT
sensitive overly


From the results above, we can see that, in terms of their POS tags, these adjective phrases follow one of these patterns:
- ADV + ADJ
- ADJ + ADP + NOUN (the adjective was marked as a verb in the examples above... need to check)
- ADV + ADJ + DET/SCONJ/

So we can say that these adjective phrases have some rules of the form
- ADV/EPSILON + ADJ + ADP/EPSILON + DET/SCONJ/EPSILON + NOUN/PROPN/PRON/EPSILON

Note: however, this rule would mean that a single adjective on its own counts as an adjective phrase... need to think more

In terms of their dependencies,
- in the presence of an adverb, the adjective is the root and the adverb has an `advmod` dependency (since it modifies the adjective)
- since an adjective phrase is concerned with the noun insofar as it tells us more about the adjective, something like "sick cat" is not an adjective phrase
- thus, if a noun is included in an adjective phrase, there needs to be a preposition or a subordinating conjunction between the adjective and the noun
- in this case, the adjective is the `ROOT`, the preposition has a `prep` dependency, and the noun has a `pobj` dependency with regard to the preposition or the subordinating conjunction

In [20]:
def get_adjective_phrases(text, pipeline=None):
    """Extract adverb-modified adjective phrases from ``text``.

    A token qualifies as the head of a phrase when its POS tag is ``ADJ`` and
    its dependency label is ``ROOT`` or ``acomp``. For such a head, every
    direct child tagged ``ADV`` with an ``advmod`` dependency is collected.
    If at least one adverb is found, the adverbs and the adjective are joined
    with single spaces in their original token order, so an adverb that
    follows the adjective stays after it ("good enough", not "enough good").
    Adjectives without an adverbial modifier produce no phrase.

    Args:
        text: Raw text to analyse.
        pipeline: Optional callable that maps ``text`` to an iterable of
            tokens exposing ``text``, ``pos_``, ``dep_``, ``i`` and
            ``children``. When omitted, the notebook-level ``nlp`` pipeline
            is used.

    Returns:
        A list of phrase strings, one per qualifying adjective, in the order
        the adjectives are encountered.
    """
    parse = nlp if pipeline is None else pipeline
    phrases = []
    for token in parse(text):
        # first rule: adjective head modified by one or more adverbs
        if token.pos_ != 'ADJ' or token.dep_ not in ('ROOT', 'acomp'):
            continue
        adverbs = [
            child for child in token.children
            if child.pos_ == 'ADV' and child.dep_ == 'advmod'
        ]
        # TODO: incorporate the second rule -- this is more complicated
        if not adverbs:
            continue
        # Order by token index so the phrase reads as it does in the text.
        ordered = sorted(adverbs + [token], key=lambda member: member.i)
        phrases.append(' '.join(member.text for member in ordered))
    return phrases

# Pair Ranking

In order to do the pair ranking, we can look at the most frequent pairs. A potential downside is that useless adjective phrases will be generated, which do not actually tell us anything about the shop.

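TF-IDF: this is commonly used for text summarization, but we can use it to rank too. IDF measures how rare a particular phrase is across the entire corpus (the more common the phrase, the lower its IDF). Thus, a high TF-IDF would mean that the phrase is frequent for this shop but uncommon elsewhere, i.e. it is particular to this shop.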

LDA: used for topic modelling, it can generate topics; it may be worth trying to see whether an adjective phrase is particular to a shop.

## Clustering

Clustering may be an interesting thing that we can try, especially since I doubt the things we see above will give us good results.

To do clustering, we need a continuous representation of the phrase. This can be done by generating word embeddings (word2vec, GloVe, BERT) or any other methods

We can then do K-Means clustering (benchmark comparison to the discrete methods above)

We can also do other clustering algos e.g. Hierarchical


In [21]:
# Bind the result to its own name: reusing `adjective_phrases` would overwrite
# the sample list that the earlier cells index into.
extracted_phrases = get_adjective_phrases("he is overly sensitive")
extracted_phrases

['overly sensitive']

In [19]:
# Inspect the parse of the full sentence: text, POS tag and dependency label
# of each token, followed by one line per direct child of that token.
for head in nlp("he is overly sensitive"):
    print(head.text, head.pos_, head.dep_)
    for dependent in head.children:
        print("token:", head.text, " child: ", dependent)

he PRON nsubj
is AUX ROOT
token: is  child:  he
token: is  child:  sensitive
overly ADV advmod
sensitive ADJ acomp
token: sensitive  child:  overly
