In [1]:
import pandas as pd
import numpy as np

#### Google drive


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/MyDrive/final_project_itc/data.npy'
data = np.load(path, allow_pickle = True)

In [4]:
df = pd.DataFrame(data, columns = ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'])

#### Keep only 3 features

In [5]:
df = df.loc[:,['ProductId','Text','Score']]

#### Remove html tags

In [6]:
df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)

#### Split the dataset into train val and test using 100k samples

In [7]:
from sklearn.model_selection import GroupShuffleSplit
splitter_temp = GroupShuffleSplit(test_size=.40, n_splits=1, random_state = 42)
split_temp = splitter_temp.split(df[:100000], groups=df[:100000]['ProductId'])
train_inds, temp_inds = next(split_temp)

train = df.iloc[train_inds]
temp = df.iloc[temp_inds]



In [8]:
splitter_val = GroupShuffleSplit(test_size=.50, n_splits=1, random_state = 42)
split_val = splitter_val.split(temp, groups=temp['ProductId'])
val_inds, test_inds = next(split_val)

val = temp.iloc[val_inds]
test = temp.iloc[test_inds]

In [9]:
X_train = train.drop(columns = 'Score')
y_train = train.Score

X_val = val.drop(columns = 'Score')
y_val = val.Score

X_test = test.drop(columns = 'Score')
y_test = test.Score

#### NER model

In [10]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [11]:
nbr_sent = 500

##### displacy plots

##### all dependencies

In [12]:
from tqdm import tqdm
acomps = []
advcl = [] # verbs : not interesting
advmod = []# adverb that modify the verb : very,so,much,only,better,just,...
amod = [] # adjective that modify the noun : good,great,best,little,hot
ccomp = [] # complement of a verb : recomment, buying,got,...
compound = [] # word formed by a combination of words : saltwater,supermarket
conj = [] # second verb : (he sings and dances -> dances) : tasted,smokey,...
dobj = [] # object after a verb : product, flavor, tea
intj = [] # interjections : Oh,well,NO
nmod = [] # not interesting
for i in tqdm(range(nbr_sent)):
  sente = nlp(X_train.Text.iloc[i])
  for token in sente:
    if token.dep_ == 'acomp':
      acomps.append(token.text)
    elif token.dep_ == 'advcl':
      advcl.append(token.text)
    elif token.dep_ == 'advmod':
      advmod.append(token.text)
    elif token.dep_ == 'amod':
      amod.append(token.text)
    elif token.dep_ == 'ccomp':
      ccomp.append(token.text)
    elif token.dep_ == 'compound':
      compound.append(token.text)
    elif token.dep_ == 'conj':
      conj.append(token.text)
    elif token.dep_ == 'dobj':
      dobj.append(token.text)
    elif token.dep_ == 'intj':
      intj.append(token.text)
    elif token.dep_ == 'nmod':
      nmod.append(token.text)

100%|██████████| 500/500 [00:13<00:00, 37.62it/s]


In [13]:
from collections import Counter
counter_intj = Counter(intj)
counter_intj.most_common(10)

[('Well', 8),
 ('Oh', 6),
 ('like', 6),
 ('Yes', 4),
 ('wow', 4),
 ('Please', 3),
 ('please', 3),
 ('well', 2),
 ('OK', 2),
 ('Hey', 1)]

In [14]:
counter_advcl = Counter(advmod)
counter_advcl.most_common(10)

[('very', 123),
 ('so', 121),
 ('just', 113),
 ('too', 98),
 ('when', 76),
 ('really', 71),
 ('also', 67),
 ('as', 61),
 ('much', 51),
 ('more', 49)]

In [15]:
counter_amod = Counter(amod)
counter_amod.most_common(20)

[('other', 98),
 ('great', 72),
 ('good', 71),
 ('best', 46),
 ('many', 32),
 ('local', 31),
 ('little', 30),
 ('only', 29),
 ('first', 28),
 ('more', 27),
 ('regular', 25),
 ('sweet', 25),
 ('natural', 23),
 ('big', 21),
 ('Great', 20),
 ('small', 20),
 ('favorite', 20),
 ('free', 19),
 ('whole', 19),
 ('better', 19)]

In [16]:
counter_ccomp = Counter(ccomp)
counter_ccomp.most_common(10)

[('is', 83),
 ('are', 62),
 ('was', 33),
 ('have', 22),
 ('were', 21),
 ('be', 18),
 ("'s", 17),
 ('get', 12),
 ('had', 11),
 ('has', 9)]

In [17]:
counter_dobj = Counter(dobj)
counter_dobj.most_common(10)

[('it', 202),
 ('them', 150),
 ('chips', 98),
 ('these', 74),
 ('flavor', 52),
 ('bag', 50),
 ('this', 45),
 ('taste', 37),
 ('product', 35),
 ('me', 31)]

##### example of a sentence and all spacy attributes

In [18]:
test_sent = nlp('This girl is beautiful')
for token in test_sent:
  print(token.text, token.dep_)
  #print(token.dep_)
  print([child.text for child in token.children])
  print([child.pos_ for child in token.children])
  print([child.dep_ for child in token.children])

This det
[]
[]
[]
girl nsubj
['This']
['DET']
['det']
is ROOT
['girl', 'beautiful']
['NOUN', 'ADJ']
['nsubj', 'acomp']
beautiful acomp
[]
[]
[]


In [19]:
test_sent = nlp('she is a very beautiful girl')
for token in test_sent:
  print(token.text, token.dep_)
  #print(token.dep_)
  print([child.text for child in token.children])
  print([child.pos_ for child in token.children])
  print([child.dep_ for child in token.children])

she nsubj
[]
[]
[]
is ROOT
['she', 'girl']
['PRON', 'NOUN']
['nsubj', 'attr']
a det
[]
[]
[]
very advmod
[]
[]
[]
beautiful amod
['very']
['ADV']
['advmod']
girl attr
['a', 'beautiful']
['DET', 'ADJ']
['det', 'amod']


##### noun + verb + adjective

In [12]:
from tqdm import tqdm
noun_ver_adj = []
score_noun = []
indexes_noun = []
adv_verb_ind = [] # tackle the case 'delivery was REALLY NOT quick'
for i in tqdm(range(1000)):
  sente = nlp(X_train.Text.iloc[i])
  for token in sente:
    noun = ''
    adj = ''
    adverb = ''
    neg = ''
    adv_verb = ''
    if token.dep_ == 'ROOT':
      for child in token.children:
        if child.pos_ == 'NOUN':
          noun = child.text
        elif child.pos_ == 'ADJ':
          adj = child.text
          for other_child in child.children:
            if other_child.pos_ == 'ADV':
              adverb = other_child.text
        elif child.pos_ == 'ADV':
          adv_verb_ind.append((i,child.text))
          adv_verb = child.text
        elif child.pos_ == 'PART':
          neg = child.text



      if noun and adj:
        indexes_noun.append(i)
        score_noun.append(y_train.iloc[i])
        if adverb :
          noun_ver_adj.append((noun, token.text,adverb,adj))
        elif adv_verb and neg:
          noun_ver_adj.append((noun, token.text,adv_verb,neg,adj))
        elif neg:
          noun_ver_adj.append((noun, token.text,neg,adj))
        else:
          noun_ver_adj.append((noun, token.text,adj))

100%|██████████| 1000/1000 [00:28<00:00, 35.27it/s]


In [13]:
len(indexes_noun)

480

In [14]:
from collections import Counter
element_counts_lemma_noun = Counter(nlp(item[0])[0].lemma_ for item in noun_ver_adj)

In [15]:
most_common_noun = list(map(lambda x : x[0],element_counts_lemma_noun.most_common(10)))

In [16]:
flavor_tuples = [' '.join(x) for x in noun_ver_adj if nlp(x[0])[0].lemma_ == most_common_noun[0]]

##### Sentiment analysis





In [17]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [18]:
most_common_noun

['chip',
 'flavor',
 'taste',
 'price',
 'bag',
 'product',
 'coffee',
 'potato',
 'tea',
 'food']

In [19]:
def get_insights(index):
  list_tuples = [' '.join(x) for x in noun_ver_adj if nlp(x[0])[0].lemma_ == most_common_noun[index]]
  results = sentiment_pipeline(list_tuples)
  pos = 0
  neg = 0
  pos_adj = []
  neg_adj = []
  for sentence,result in zip(list_tuples,results):
    if result['label'] == 'POSITIVE':
      pos +=1
      pos_adj.append(sentence.rsplit(None,1)[-1])
    else:
      neg +=1
      neg_adj.append(sentence.rsplit(None,1)[-1])
  return most_common_noun[index],round(pos/len(results)*100,2), round(neg/len(results)*100,2), pos_adj, neg_adj

In [20]:
flavor = get_insights(0)

In [29]:
flavor[0],flavor[1],flavor[2]

('chip', 74.07, 25.93)

In [21]:
pos_adj_flavor = flavor[3]
Counter(pos_adj_flavor).most_common(10)

[('unmistakable', 5),
 ('good', 4),
 ('excellent', 3),
 ('delicious', 3),
 ('great', 3),
 ('crunchy', 2),
 ('awesome', 2),
 ('tasty', 2),
 ('spicy', 2),
 ('best', 2)]

In [22]:
neg_adj_flavor = flavor[4]
Counter(neg_adj_flavor).most_common(10)

[('bad', 2),
 ('crunchy', 2),
 ('tangy', 2),
 ('thick', 1),
 ('overcooked', 1),
 ('hard', 1),
 ('nasty', 1),
 ('greasy', 1),
 ('spicy', 1),
 ('fresh', 1)]

In [23]:
taste = get_insights(1)

In [24]:
pos_adj_taste = taste[3]
Counter(pos_adj_taste).most_common(5)

[('amazing', 2),
 ('great', 2),
 ('medicinal', 1),
 ('Other', 1),
 ('fantastic', 1)]

In [25]:
neg_adj_taste = taste[4]
Counter(neg_adj_taste).most_common(5)

[('much', 1), ('terrible', 1), ('earthier', 1), ('dry', 1)]

In [26]:
price = get_insights(2)

In [27]:
pos_adj_price = price[3]
Counter(pos_adj_price).most_common(5)

[('great', 4), ('Fresh', 1), ('easy', 1), ('decent', 1), ('perfect', 1)]

In [28]:
neg_adj_price = price[4]
Counter(neg_adj_price).most_common(5)

[('complicated', 1),
 ('better', 1),
 ('Stale', 1),
 ('overwhelming', 1),
 ('harsh', 1)]

##### Subj ver adj

In [21]:
index_ok = np.unique(indexes_noun)
all_index = np.arange(500)
index_notok = np.setdiff1d(all_index,indexes_noun)

In [22]:
subj_ver_adj = []
score_subj = []
indexes_subj = []
for i in tqdm(index_notok):
  sente = nlp(X_train.Text.iloc[i])
  for token in sente:
    subj = ''
    adj = ''
    adverb = ''
    neg = ''
    if token.dep_ == 'ROOT':
      for child in token.children:
        if child.dep_ == 'nsubj':
          subj = child.text
        elif child.dep_ == 'neg':
          neg = child.text
        elif child.dep_ == 'acomp':
          adj = child.text
          for other_child in child.children:
            if other_child.pos_ == 'ADV':
              adverb = other_child.text

      if subj and adj:
        indexes_subj.append(i)
        score_subj.append(y_train.iloc[i])
        if adverb and neg:
          subj_ver_adj.append((subj, token.text,adverb,neg,adj,i))
        elif adverb:
          subj_ver_adj.append((subj, token.text,adverb,adj,i))
        elif neg:
          subj_ver_adj.append((subj, token.text,neg,adj,i))
        else:
          subj_ver_adj.append((subj, token.text,adj,i))

100%|██████████| 293/293 [00:09<00:00, 29.75it/s]


In [31]:
element_counts_lemma = Counter(nlp(item[0])[0].lemma_ for item in subj_ver_adj)

In [None]:
#element_counts = Counter(item[0] for item in sub_ver_adj)

In [32]:
element_counts_lemma.most_common(10)

[('it', 35),
 ('I', 35),
 ('they', 32),
 ('these', 12),
 ('this', 5),
 ('we', 4),
 ('chip', 2),
 ('Labrador', 1),
 ('delivery', 1),
 ('he', 1)]