In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import re

#### Google drive


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = '/content/drive/MyDrive/Colab Notebooks/ITC/Final Project/data.npy'
data = np.load(path, allow_pickle = True)

In [5]:
df = pd.DataFrame(data, columns = ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'])

#### Keep only 3 features

In [6]:
df = df.loc[:,['ProductId','Text','Score']]

In [7]:
df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)

#### Split the dataset into train val and test using 100k samples

In [8]:
from sklearn.model_selection import GroupShuffleSplit
splitter_temp = GroupShuffleSplit(test_size=.40, n_splits=1, random_state = 42)
split_temp = splitter_temp.split(df[:100000], groups=df[:100000]['ProductId'])
train_inds, temp_inds = next(split_temp)

train = df.iloc[train_inds]
temp = df.iloc[temp_inds]

In [9]:
splitter_val = GroupShuffleSplit(test_size=.50, n_splits=1, random_state = 42)
split_val = splitter_val.split(temp, groups=temp['ProductId'])
val_inds, test_inds = next(split_val)

val = temp.iloc[val_inds]
test = temp.iloc[test_inds]

In [10]:
X_train = train.drop(columns = 'Score')
y_train = train.Score

X_val = val.drop(columns = 'Score')
y_val = val.Score

X_test = test.drop(columns = 'Score')
y_test = test.Score

# Full Pipeline

In [11]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [12]:
# Define number of samples
nbr_sent = 1000

# Define subject-pronouns to remove
personal_pronouns = ['i', 'me', 'you', 'he', 'she', 'it', 'we', 'us', 'they', 'them']

*Wave 1*

In [13]:
# Finds root of sentence (usually verb) and returns phrases based on children
wave_1 = []
score_noun = []
indexes_noun = []
adv_verb_ind = [] # tackle the case 'delivery was REALLY NOT quick'
for i in tqdm(range(nbr_sent)):
  sente = nlp(X_train.Text.iloc[i])
  for token in sente:
    noun = ''
    adj = ''
    adverb = ''
    neg = ''
    adv_verb = ''
    if token.dep_ == 'ROOT':
      for child in token.children:
        if child.pos_ == 'NOUN':
          noun = child.text
        elif child.pos_ == 'ADJ':
          adj = child.text
          for other_child in child.children:
            if other_child.pos_ == 'ADV':
              adverb = other_child.text
        elif child.pos_ == 'ADV':
          adv_verb_ind.append((i,child.text))
          adv_verb = child.text
        elif child.pos_ == 'PART':
          neg = child.text


      if noun and adj:
        indexes_noun.append(i)
        score_noun.append(y_train.iloc[i])
        if adverb :
          wave_1.append((noun, token.text,adverb,adj))
        elif adv_verb and neg:
          wave_1.append((noun, token.text,adv_verb,neg,adj))
        elif neg:
          wave_1.append((noun, token.text,neg,adj))
        else:
          wave_1.append((noun, token.text,adj))

100%|██████████| 1000/1000 [00:21<00:00, 46.76it/s]


In [14]:
print(f"Percent of Phrases Captured:{(len(wave_1)/nbr_sent*100): .2f}")

Percent of Phrases Captured: 48.00


*Wave 2*

In [15]:
# Returns phrases using passive voice
wave_2 = []

for i in tqdm(range(nbr_sent)):
  sente = nlp(X_train.Text.iloc[i])
  for token in sente:
    # Check if the token is a verb
    if token.dep_ == 'ROOT':
        subject = ''
        dobj_text = ''
        adj_text = ''

        # Look for the subject of the verb
        for child in token.children:
            if child.dep_ in ['nsubj', 'nsubjpass'] and child.text.lower() not in personal_pronouns:  # nsubjpass for passive voice
                subject = child.text
            # Check if there is a direct object that is a noun
            elif child.dep_ == 'dobj' and child.pos_ == 'NOUN':
                dobj_text = child.text
                # Iterate over the children of the noun to find an adjective
                for grandchild in child.children:
                    if grandchild.dep_ == 'amod' and grandchild.pos_ == 'ADJ':
                        adj_text = grandchild.text

        if subject and dobj_text and adj_text:
          wave_2.append((subject, token.text, adj_text, dobj_text))
            # wave_2.append(f"{subject} {token.text} {adj_text} {dobj_text}")

100%|██████████| 1000/1000 [00:19<00:00, 50.24it/s]


In [16]:
final_phrases = wave_1 + wave_2
print(f"Percent of Phrases Captured:{(len(final_phrases)/nbr_sent*100): .2f}")

Percent of Phrases Captured: 58.10


In [18]:
# Top nouns ready for sentiment analysis
noun_counts = Counter(nlp(item[0])[0].lemma_ for item in final_phrases)

sorted_nouns = sorted(noun_counts.items(), key=lambda item: item[1], reverse=True)

for noun, count in sorted_nouns[:10]:
  print(f"{noun}: {count}")

chip: 62
flavor: 29
taste: 18
bag: 16
price: 15
product: 15
coffee: 11
tea: 10
potato: 10
food: 9


# Functions

In [19]:
def create_sub_phrases(X, nbr_sent):
  """Returns lists of tuple phrases"""
  wave_1 = []
  wave_2 = []

  for i in tqdm(range(nbr_sent)):
    sente = nlp(X.Text.iloc[i])
    for token in sente:
      noun = ''
      adj = ''
      adverb = ''
      neg = ''
      adv_verb = ''
      subject = ''
      dobj_text = ''
      adj_text = ''
      if token.dep_ == 'ROOT':
        for child in token.children:
          if child.pos_ == 'NOUN':
            noun = child.text
          elif child.pos_ == 'ADJ':
            adj = child.text
            for other_child in child.children:
              if other_child.pos_ == 'ADV':
                adverb = other_child.text
          elif child.pos_ == 'ADV':
            adv_verb_ind.append((i,child.text))
            adv_verb = child.text
          elif child.pos_ == 'PART':
            neg = child.text

          if child.dep_ in ['nsubj', 'nsubjpass'] and child.text.lower() not in personal_pronouns:  # nsubjpass for passive voice
            subject = child.text

          elif child.dep_ == 'dobj' and child.pos_ == 'NOUN':
            dobj_text = child.text
            for grandchild in child.children:
              if grandchild.dep_ == 'amod' and grandchild.pos_ == 'ADJ':
                  adj_text = grandchild.text

        if noun and adj:
          if adverb :
            wave_1.append((noun, token.text,adverb,adj))
          elif adv_verb and neg:
            wave_1.append((noun, token.text,adv_verb,neg,adj))
          elif neg:
            wave_1.append((noun, token.text,neg,adj))
          else:
            wave_1.append((noun, token.text,adj))

        if subject and dobj_text and adj_text:
          wave_2.append((subject, token.text, adj_text, dobj_text))

  return wave_1 + wave_2


def get_topics(phrases, num_topics):
  """Returns dictionary of topics with sorted by count"""
  noun_counts = Counter(nlp(item[0])[0].lemma_ for item in phrases)
  sorted_nouns = sorted(noun_counts.items(), key=lambda item: item[1], reverse=True)

  for noun, count in sorted_nouns[:num_topics]:
    print(f"{noun}: {count}")

def print_phrases(phrases, num_phrases):
  for phrase in phrases[:num_phrases]:
    print(phrase)

In [20]:
# Constructs sub phrases
sub_phrases = create_sub_phrases(X=X_train, nbr_sent=1000)

100%|██████████| 1000/1000 [00:19<00:00, 50.99it/s]


In [21]:
print(f"The data contains {len(sub_phrases)} phrases")

The data contains 581 phrases


In [53]:
# Constructs topics
topics = get_topics(phrases=sub_phrases, num_topics=20)

chip: 62
flavor: 29
taste: 18
bag: 16
price: 15
product: 15
coffee: 11
tea: 10
potato: 10
food: 9
stuff: 7
one: 7
size: 6
amount: 6
brand: 5
pack: 5
package: 5
texture: 5
son: 5
licorice: 4
