In [1]:
import pandas as pd
import numpy as np

#### Google drive


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
path = '/content/drive/MyDrive/final_project_itc/data.npy'
data = np.load(path, allow_pickle = True)

In [4]:
df = pd.DataFrame(data, columns = ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'])

#### Keep only 3 features

In [5]:
df = df.loc[:,['ProductId','Text','Score']]

#### Split the dataset into train val and test using 100k samples

In [6]:
from sklearn.model_selection import GroupShuffleSplit
splitter_temp = GroupShuffleSplit(test_size=.40, n_splits=1, random_state = 42)
split_temp = splitter_temp.split(df[:100000], groups=df[:100000]['ProductId'])
train_inds, temp_inds = next(split_temp)

train = df.iloc[train_inds]
temp = df.iloc[temp_inds]



In [7]:
train.Score.value_counts(normalize = True)

5    0.630774
4    0.144436
1    0.091830
3    0.077940
2    0.055020
Name: Score, dtype: float64

In [8]:
splitter_val = GroupShuffleSplit(test_size=.50, n_splits=1, random_state = 42)
split_val = splitter_val.split(temp, groups=temp['ProductId'])
val_inds, test_inds = next(split_val)

val = temp.iloc[val_inds]
test = temp.iloc[test_inds]

In [9]:
val.Score.value_counts(normalize = True)

5    0.609850
4    0.151715
1    0.094591
3    0.086352
2    0.057491
Name: Score, dtype: float64

In [10]:
test.Score.value_counts(normalize = True)

5    0.619691
4    0.146604
1    0.095774
3    0.082265
2    0.055667
Name: Score, dtype: float64

In [11]:
train.shape,val.shape,test.shape

((59251, 3), (21725, 3), (19024, 3))

In [12]:
X_train = train.drop(columns = 'Score')
y_train = train.Score

X_val = val.drop(columns = 'Score')
y_val = val.Score

X_test = test.drop(columns = 'Score')
y_test = test.Score

#### NER model

In [13]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [14]:
nbr_sent = 500

##### displacy plots

In [15]:
from spacy import displacy
displacy.render(nlp(X_train.Text[0]), jupyter = True)

##### all dependencies

In [16]:
from tqdm import tqdm
acomps = []
advcl = [] # verbs : not interesting
advmod = []# adverb that modify the verb : very,so,much,only,better,just,...
amod = [] # adjective that modify the noun : good,great,best,little,hot
ccomp = [] # complement of a verb : recomment, buying,got,...
compound = [] # word formed by a combination of words : saltwater,supermarket
conj = [] # second verb : (he sings and dances -> dances) : tasted,smokey,...
dobj = [] # object after a verb : product, flavor, tea
intj = [] # interjections : Oh,well,NO
nmod = [] # not interesting
for i in tqdm(range(nbr_sent)):
  sente = nlp(X_train.Text.iloc[i])
  for token in sente:
    if token.dep_ == 'acomp':
      acomps.append(token.text)
    elif token.dep_ == 'advcl':
      advcl.append(token.text)
    elif token.dep_ == 'advmod':
      advmod.append(token.text)
    elif token.dep_ == 'amod':
      amod.append(token.text)
    elif token.dep_ == 'ccomp':
      ccomp.append(token.text)
    elif token.dep_ == 'compound':
      compound.append(token.text)
    elif token.dep_ == 'conj':
      conj.append(token.text)
    elif token.dep_ == 'dobj':
      dobj.append(token.text)
    elif token.dep_ == 'intj':
      intj.append(token.text)
    elif token.dep_ == 'nmod':
      nmod.append(token.text)

100%|██████████| 500/500 [00:15<00:00, 32.34it/s]


In [17]:
from collections import Counter
counter_intj = Counter(intj)
counter_intj.most_common(10)

[('Well', 8),
 ('Oh', 6),
 ('like', 6),
 ('/>We', 4),
 ('wow', 4),
 ('Yes', 3),
 ('Please', 3),
 ('please', 3),
 ('well', 2),
 ('/><br', 2)]

In [18]:
counter_advcl = Counter(advmod)
counter_advcl.most_common(10)

[('very', 123),
 ('so', 121),
 ('just', 113),
 ('too', 97),
 ('when', 76),
 ('really', 71),
 ('also', 67),
 ('as', 57),
 ('much', 52),
 ('more', 48)]

In [19]:
counter_amod = Counter(amod)
counter_amod.most_common(20)

[('other', 98),
 ('great', 72),
 ('good', 71),
 ('best', 45),
 ('many', 32),
 ('local', 31),
 ('little', 30),
 ('first', 28),
 ('more', 27),
 ('only', 26),
 ('regular', 25),
 ('sweet', 25),
 ('natural', 23),
 ('big', 21),
 ('small', 20),
 ('favorite', 20),
 ('free', 19),
 ('whole', 19),
 ('better', 19),
 ('spicy', 18)]

In [20]:
counter_ccomp = Counter(ccomp)
counter_ccomp.most_common(10)

[('is', 82),
 ('are', 61),
 ('was', 34),
 ('have', 24),
 ('were', 20),
 ('be', 19),
 ("'s", 18),
 ('had', 15),
 ('get', 12),
 ('like', 10)]

In [21]:
counter_dobj = Counter(dobj)
counter_dobj.most_common(10)

[('it', 200),
 ('them', 147),
 ('chips', 96),
 ('these', 72),
 ('bag', 51),
 ('flavor', 51),
 ('this', 46),
 ('product', 36),
 ('taste', 36),
 ('what', 31)]

##### example of a sentence and all spacy attributes

In [22]:
test_sent = nlp('This girl is beautiful')
for token in test_sent:
  print(token.text, token.dep_)
  #print(token.dep_)
  print([child.text for child in token.children])
  print([child.pos_ for child in token.children])
  print([child.dep_ for child in token.children])

This det
[]
[]
[]
girl nsubj
['This']
['DET']
['det']
is ROOT
['girl', 'beautiful']
['NOUN', 'ADJ']
['nsubj', 'acomp']
beautiful acomp
[]
[]
[]


In [23]:
test_sent = nlp('she is a very beautiful girl')
for token in test_sent:
  print(token.text, token.dep_)
  #print(token.dep_)
  print([child.text for child in token.children])
  print([child.pos_ for child in token.children])
  print([child.dep_ for child in token.children])

she nsubj
[]
[]
[]
is ROOT
['she', 'girl']
['PRON', 'NOUN']
['nsubj', 'attr']
a det
[]
[]
[]
very advmod
[]
[]
[]
beautiful amod
['very']
['ADV']
['advmod']
girl attr
['a', 'beautiful']
['DET', 'ADJ']
['det', 'amod']


##### noun + verb + adjective

In [24]:
sub_ver_adj = []
for i in tqdm(range(nbr_sent)):
  sente = nlp(X_train.Text.iloc[i])
  for token in sente:
    noun = ''
    adj = ''
    if token.dep_ == 'ROOT':
      for child in token.children:
        if child.pos_ == 'NOUN':
          noun = child.text
        elif child.pos_ == 'ADJ':
          adj = child.text
      if noun and adj:
        sub_ver_adj.append((noun, token.text,adj))

100%|██████████| 500/500 [00:12<00:00, 41.09it/s]


In [25]:
counter_sva = Counter(sub_ver_adj)
counter_sva.most_common(20)

[(('chips', 'are', 'unmistakable'), 5),
 (('chips', 'are', 'crunchy'), 4),
 (('price', 'is', 'good'), 3),
 (('chips', 'are', 'excellent'), 3),
 (('chips', 'are', 'delicious'), 3),
 (('chips', 'are', 'spicy'), 3),
 (('taste', 'is', 'great'), 2),
 (('chips', 'are', 'bad'), 2),
 (('chips', 'are', 'tasty'), 2),
 (('flavor', 'is', 'medicinal'), 1),
 (('taffy', 'was', 'enjoyable'), 1),
 (('taffy', 'is', 'good'), 1),
 (('flavors', 'are', 'amazing'), 1),
 (('cracks', 'are', 'distinct'), 1),
 (('aroma', 'is', 'strong'), 1),
 (('smell', 'is', 'wonderful'), 1),
 (('coffee', 'is', 'smooth'), 1),
 (('jam', 'was', 'good'), 1),
 (('guests', 'were', 'impressed'), 1),
 (('taste', 'was', 'great'), 1)]

##### noun + amod adj

In [26]:
adj_noun = []
for i in tqdm(range(nbr_sent)):
  sente = nlp(X_train.Text.iloc[i])
  for token in sente:
    if token.dep_ == 'attr':
      for child in token.children:
        if child.dep_ == 'amod': # adj that modify the noun
          adj_noun.append((token.text,child.text))


100%|██████████| 500/500 [00:12<00:00, 41.03it/s]


In [27]:
adj_noun

[('mouthful', 'tiny'),
 ('assortment', 'wide'),
 ('pieces', 'much'),
 ('pieces', 'black'),
 ('pieces', 'flavored'),
 ('treat', 'delightful'),
 ('food', 'healthy'),
 ('velvety', 'smooth'),
 ('packaging', 'insulating'),
 ('tart', 'sour'),
 ('flavor', 'excellent'),
 ('cat', 'posh'),
 ('food', 'good'),
 ('cat', 'big'),
 ('brands', 'other'),
 ('purchase', 'great'),
 ('food', 'excellent'),
 ('weight', 'loosing'),
 ('solution', 'best'),
 ('one', 'only'),
 ('one', 'other'),
 ('kid', 'little'),
 ('situation', 'amazing'),
 ('snack', 'GREAT'),
 ('deal.<br', 'good'),
 ('size', 'certain'),
 ('marinade', 'fabulous'),
 ('things', 'many'),
 ('pack', 'tasting'),
 ('basil', 'pleasant'),
 ('basil', 'much'),
 ('option', 'great'),
 ('products', 'tasting'),
 ('gem', 'hidden'),
 ('product', 'great'),
 ('choice', 'good'),
 ('choice', 'second'),
 ('bonus.<br', 'huge'),
 ('taste', 'slight'),
 ('ingredients', 'organic'),
 ('value', 'good'),
 ('value', 'bulk'),
 ('deal', 'great'),
 ('amount', 'right'),
 ('mint!<b

In [28]:
sub_ver_adv = []
for i in tqdm(range(nbr_sent)):
  sente = nlp(X_train.Text.iloc[i])
  attrib = ''
  subject = ''
  verb = ''
  adverb = ''
  adj = ''
  for token in sente:
    if token.dep_ == 'nsubj':
      subject = token.text
    elif token.dep_ == 'ROOT':
      verb = token.text
    elif token.dep_ == 'advmod':
      adverb = token.text
      adv_notext = token
    elif token.dep_ == 'amod':
      adj = token.text
    elif token.dep_ == 'attr':
      attrib = token.text
  if (subject and verb and adverb and adj and attrib):
    sub_ver_adv.append((subject,verb,adverb,adj, attrib))

100%|██████████| 500/500 [00:12<00:00, 41.03it/s]


In [29]:
sub_ver_adv

[('that', 'is', 'highly', 'yummy', 'treat'),
 ('this', 'is', 'very', 'yummy', 'deal'),
 ('it', 'was', 'only', 'delightful', 'treat'),
 ('dog', 'eats', 'Also', 'required', 'food'),
 ('They', 'Perfect', 'hot', 'nice', 'reviews'),
 ('taste', 'is', 'slightly', 'smooth', 'velvety'),
 ('guests', 'were', 'really', 'later', 'best'),
 ('I', 'order', 'again', 'few', 'grainy'),
 ('they', 'Order', 'very', 'cold', 'packaging'),
 ('it', 'worked', 'Also', 'own', 'problem'),
 ('it', "'s", 'just', 'sour', 'me'),
 ('them', 'loved', 'whenever', 'nice', 'slicing'),
 ('They', 'go', 'even', 'Amish', 'household'),
 ('this', 'Peterson', 'Now', 'good', 'idea'),
 ('they', 'ordered', 'back', 'female', 'cat'),
 ('I', 'recommend', 'highly', 'past', 'kitten'),
 ('she', 'is', 'only', 'old', 'food'),
 ('anyone', 'Had', 'so', 'sorry', 'lot'),
 ('it', "'s", 'well', 'best', 'solution'),
 ('that', 'is', 'very', 'finicky', 'food'),
 ('I', 'recommend', 'highly', 'most', 'definite'),
 ('food', 'reduces', 'also', 'tartar', '

##### nouns + adj & verb + adj

In [30]:
nouns_adj = []
verbs_adv = []
for i in tqdm(range(nbr_sent)):
  sente = nlp(X_train.Text.iloc[i])
  for token in sente:
    if token.pos_ == 'NOUN':
      for child in token.children:
        if child.pos_ == 'ADJ':
          nouns_adj.append((token.text,child.text))
    elif token.pos_ == 'VERB':
      for child in token.children:
        if child.pos_ == 'ADV':
          verbs_adv.append((token.text,child.text))




100%|██████████| 500/500 [00:11<00:00, 43.04it/s]


In [31]:
from collections import Counter
counter_na = Counter(nouns_adj)
counter_na.most_common(20)

[(('brands', 'other'), 16),
 (('chips', 'best'), 13),
 (('store', 'local'), 12),
 (('cream', 'sour'), 9),
 (('time', 'first'), 9),
 (('chips', 'other'), 8),
 (('thing', 'only'), 7),
 (('quality', 'good'), 6),
 (('stores', 'local'), 6),
 (('deal', 'great'), 6),
 (('amount', 'right'), 6),
 (('color', 'light'), 6),
 (('chips', 'regular'), 6),
 (('chips', 'favorite'), 6),
 (('price', 'great'), 5),
 (('food', 'dry'), 5),
 (('product', 'great'), 5),
 (('time', 'same'), 5),
 (('flavor', 'Great'), 5),
 (('fan', 'big'), 5)]

In [32]:
counter_va = Counter(verbs_adv)
counter_va.most_common(20)

[(('buy', 'again'), 21),
 (('recommend', 'highly'), 15),
 (('had', 'ever'), 14),
 (('tasted', 'ever'), 9),
 (('order', 'again'), 9),
 (('has', 'also'), 6),
 (('eaten', 'ever'), 6),
 (('recommend', 'also'), 6),
 (('like', 'really'), 5),
 (('buy', 'never'), 5),
 (('goes', 'nicely'), 5),
 (('subdue', 'Also'), 5),
 (('powering', 'over'), 5),
 (('fits', 'Classically'), 5),
 (('fits', 'perfectly'), 5),
 (('go', 'back'), 4),
 (('have', 'now'), 4),
 (('noted', 'recently'), 4),
 (('wrapped', 'individually'), 3),
 (('like', 'much'), 3)]