In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd '/content/drive/My Drive/SemEval/repo/ReCAM/notebooks'

/content/drive/My Drive/SemEval/repo/ReCAM/notebooks


In [3]:
%%capture
!pip install transformers

In [4]:
import sys
sys.path.append('..')
sys.path.append('../src')
import numpy as np
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import torch

In [5]:
# load the dataset

from datasets.concreteness_dataset import ConcretenessDataset

dataset = ConcretenessDataset('../data/imperceptibility/Concreteness Ratings/train/forty.csv',None,)

In [6]:
from scipy.stats import pearsonr
from scipy.stats import spearmanr

def pearson_spearman(lst_1, lst_2):
    pearson_score, _ = pearsonr(lst_1,lst_2) 
    spearman_score, _ = spearmanr(lst_1,lst_2)
    return pearson_score, spearman_score

# PAPER #1
http://184pc128.csie.ntnu.edu.tw/presentation/13-08-27/Estimating%20Content%20Concreteness%20for%20Finding%20Comprehensible%20Documents.pdf

### STAT \#1: Length of Word
In both Paper 1 and Paper 2

In [7]:
len_lst = []
reg_lst = []
for word, reg_score in dataset:
    len_lst.append(len(word))
    reg_lst.append(reg_score)

pearson_score, spearman_score = pearson_spearman(len_lst,reg_lst)

print(pearson_score, spearman_score)

# negatively correlated

-0.2926909583153332 -0.3134800217791011


### STAT \#2:  Number of Senses (Polysemy)
In both Paper 1 and Paper 2

In [23]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
senses_lst = []
reg_lst = []

for word, reg_score in dataset:
    if(len(word.split(' '))==1):
        senses_lst.append(len(wn.synsets(word)))
        reg_lst.append(reg_score)

print(len(senses_lst))
print(len(reg_lst))

pearson_score, spearman_score = pearson_spearman(senses_lst,reg_lst)
print(pearson_score, spearman_score)

# Hmmm, I was expecting a higher correlation here :/ Maybe synsets don't give
# an accurate number of senses of each word?
# why is the correlation positive though?

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
34734
34734
0.13881952987261648 0.18328963036368343


### STAT \#3: Number of Hyponyms

a) Number of hyponyms of the most common sense of the word

In [25]:
from itertools import chain
 
hyponyms_lst = []
reg_lst = []
for word, reg_score in dataset:
    if(len(word.split(' ')) == 1):
        if(len(wn.synsets(word))>0):
            j = wn.synsets(word)[0]
            no_of_hypos = len(list(chain(*[l.lemma_names() for l in j.hypernyms()])))
            hyponyms_lst.append(no_of_hypos)
            reg_lst.append(reg_score)
        else:
            hyponyms_lst.append(0)
            reg_lst.append(reg_score)

print(len(senses_lst))
print(len(reg_lst))

pearson_score, spearman_score = pearson_spearman(hyponyms_lst,reg_lst)
print(pearson_score, spearman_score)

# decent correlation, but why is it positive?

34734
29382
0.19267164306960288 0.29273847963204896


b) Average number of hyponyms (averaged over number of hyponyms of all senses of the word)

In [12]:
from itertools import chain

hyponyms_lst = []
reg_lst = []
for word, reg_score in dataset:
    if(len(word.split(' ')) == 1):
        i = 0
        for j in wn.synsets(word):
            no_of_hypos += len(list(chain(*[l.lemma_names() for l in j.hypernyms()])))
            i += 1
        if(i>0):
            hyponyms_lst.append(no_of_hypos/i)
            reg_lst.append(reg_score)
        else:
            depth_lst.append(0)
            reg_lst.append(reg_score)

print(len(senses_lst))
print(len(reg_lst))

pearson_score, spearman_score = pearson_spearman(hyponyms_lst,reg_lst)
print(pearson_score, spearman_score)

# decent correlation

29382
29382
-0.24514482384665304 -0.16616096332156224


### STAT \#4: Depth in Ontology Tree (later)

# PAPER #2
https://alsl.gsu.edu/files/2014/03/Simulating-human-ratings-on-word-concreteness.pdf

### STAT \#5: Depth of Hypernymy Tree

a) Depth of Hypernymy Tree of the most common sense of the word

In [20]:
depth_lst = []
reg_lst = []

for word, reg_score in dataset:
    if(len(word.split(' ')) == 1):
        avg_len_paths = 0
        if(len(wn.synsets(word))>0):
            j = wn.synsets(word)[0]
            paths_to_top = j.hypernym_paths()
            max_len_path = len(max(paths_to_top, key = lambda i: len(i)))
            depth_lst.append(max_len_path)
            reg_lst.append(reg_score)
        else:
            depth_lst.append(0)
            reg_lst.append(reg_score)

print(len(depth_lst))
print(len(reg_lst))

pearson_score, spearman_score = pearson_spearman(depth_lst,reg_lst)
print(pearson_score, spearman_score)
            
# WOOHOO!

34734
34734
0.5025069814672712 0.4757908968971325


b) Average of the depths of the hypernymy trees of all the sense of the word

In [19]:
depth_lst = []
reg_lst = []

for word, reg_score in dataset:
    if(len(word.split(' ')) == 1):
        i = 0
        avg_len_paths = 0
        
        for j in wn.synsets(word):
            paths_to_top = j.hypernym_paths()
            max_len_path = len(max(paths_to_top, key = lambda k: len(k)))
            avg_len_paths += max_len_path
            i += 1
        if(i>0):
            avg_len_paths = avg_len_paths/i
            depth_lst.append(avg_len_paths)
            reg_lst.append(reg_score)
        else:
            depth_lst.append(0)
            reg_lst.append(reg_score)




print(len(depth_lst))
print(len(reg_lst))

pearson_score, spearman_score = pearson_spearman(depth_lst,reg_lst)
print(pearson_score, spearman_score)
            
# WOOHOO!

34734
34734
0.49473206642405476 0.4604110548336813


### STAT \#6: SentiWordNet Analysis (based on positive/negative sentiment scores


a) positive sentiment score of the most common sense of the word

b) negative sentiment score of the most common sense of the word

c) objective sentiment score of the most common sense of the word

In [30]:
from nltk.corpus import sentiwordnet as swn
nltk.download('sentiwordnet')
pos_sent_lst = []
neg_sent_lst = []
obj_sent_lst = []
reg_lst = []

for word, reg_score in dataset:
    if(len(word.split(' ')) == 1):

        if(len(list(swn.senti_synsets(word)))>0):
            j = list(swn.senti_synsets(word))[0]

            pos_sent_lst.append(j.pos_score())
            neg_sent_lst.append(j.neg_score())
            obj_sent_lst.append(j.obj_score())
            reg_lst.append(reg_score)
        else:
            pos_sent_lst.append(0)
            neg_sent_lst.append(0)
            obj_sent_lst.append(0)
            reg_lst.append(reg_score)

print(len(pos_sent_lst))
print(len(neg_sent_lst))
print(len(obj_sent_lst))
print(len(reg_lst))

pearson_score, spearman_score = pearson_spearman(pos_sent_lst,reg_lst)
print(pearson_score, spearman_score)

pearson_score, spearman_score = pearson_spearman(neg_sent_lst,reg_lst)
print(pearson_score, spearman_score)

pearson_score, spearman_score = pearson_spearman(obj_sent_lst,reg_lst)
print(pearson_score, spearman_score)

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
34734
34734
34734
34734
-0.2818702607976766 -0.32426284702881686
-0.21378822120971336 -0.22595147032822288
0.34561509827226455 0.40768423496969813


d) avg positive sentiment score

e) avg negative sentiment score

f) avg objective sentiment score

In [35]:
from nltk.corpus import sentiwordnet as swn
nltk.download('sentiwordnet')
pos_sent_lst = []
neg_sent_lst = []
obj_sent_lst = []
reg_lst = []

for word, reg_score in dataset:
    if(len(word.split(' ')) == 1):
        ct = 0
        avg_pos_score = 0
        avg_neg_score = 0
        avg_obj_score = 0
        for j in list(swn.senti_synsets(word)):
            avg_pos_score += j.pos_score()
            avg_neg_score += j.neg_score()
            avg_obj_score += j.obj_score()
            ct += 1

        if(ct>0):
            pos_sent_lst.append(avg_pos_score/ct)
            neg_sent_lst.append(avg_neg_score/ct)
            obj_sent_lst.append(avg_obj_score/ct)
            reg_lst.append(reg_score)
        else:
            pos_sent_lst.append(0)
            neg_sent_lst.append(0)
            obj_sent_lst.append(0)
            reg_lst.append(reg_score)

print(len(pos_sent_lst))
print(len(neg_sent_lst))
print(len(obj_sent_lst))
print(len(reg_lst))

pearson_score, spearman_score = pearson_spearman(pos_sent_lst,reg_lst)
print(pearson_score, spearman_score)

pearson_score, spearman_score = pearson_spearman(neg_sent_lst,reg_lst)
print(pearson_score, spearman_score)

pearson_score, spearman_score = pearson_spearman(obj_sent_lst,reg_lst)
print(pearson_score, spearman_score)

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
34734
34734
34734
34734
-0.312493992750145 -0.30937108118894124
-0.23308119767345495 -0.20032193624273656
0.3578723304957985 0.4309309436988021


### STAT \#7: Word Frequency in a Corpus

Corpus Link: https://www.cs.upc.edu/~nlp/wikicorpus/

Other Corpora: https://nlpforhackers.io/corpora/

Ref: https://www.pythonprogramming.in/find-frequency-of-each-word-from-a-text-file-using-nltk.html

In [None]:
import glob
import nltk
from tqdm.auto import tqdm
nltk.download('punkt')
files = glob.glob('../data/corpus/raw.en/*')
print(len(files))
# consider first 10 files for now
files = files[:10]
fd = {}
for file in files:
    with open(file,encoding='latin-1') as f:
        f = f.read()
        for sent in tqdm(nltk.sent_tokenize(f.lower())):
            for word in nltk.word_tokenize(sent):
                try:
                    fd[word] = fd[word]+1
                except KeyError:
                    fd[word] = 1

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
164


HBox(children=(FloatProgress(value=0.0, max=192217.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=192348.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=196450.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=190403.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=194269.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=190892.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=196575.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=175674.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=187668.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188949.0), HTML(value='')))




In [None]:
freq_lst = []
reg_lst = []
for word, reg_score in dataset:
    if(len(word.split(' ')) == 1 and word in fd):
        freq_lst.append(fd[word])
        reg_lst.append(reg_score)

print(len(freq_lst))
pearson_score, spearman_score = pearson_spearman(freq_lst,reg_lst)

print(pearson_score, spearman_score)

# Hmmm?

28981
-0.02011273037890105 0.14911366780979496
