In [1]:
#import sys
#!conda install -c conda-forge gdown
import numpy as np
import scipy, requests, codecs, os, re, nltk, itertools, csv
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from scipy.stats import spearmanr
import pandas as pd
import functools as ft
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
#import gdown
import codecs

In [2]:
def load_embeddings_from_np(filename):
    print('loading ...')
    with codecs.open(filename + '.vocab', 'r', 'utf-8') as f_embed:
        vocab = [line.strip() for line in f_embed]
        
    w2i = {w: i for i, w in enumerate(vocab)}
    wv = np.load(filename + '.wv.npy')

    return vocab, wv, w2i

def load_dhdglove(path):
    print('loading ...')
    debiased_embeds = pickle.load(open(path, 'rb'))
    wv = []
    vocab = []
    for w in debiased_embeds:
        wv.append(np.array(debiased_embeds[w]))
        vocab.append(str(w))
        
    w2i = {w: i for i, w in enumerate(vocab)}
    wv = np.array(wv).astype(float)
    print(len(vocab), wv.shape, len(w2i))
        
    return vocab, wv, w2i 

def load_wo_normalize(space, filename, vocab, wv, w2i):
    if filename[-3:]=='txt':
        vocab_muse, wv_muse, w2i_muse = load_embeddings_from_np(filename)
    else:
        vocab_muse, wv_muse, w2i_muse = load_dhdglove(filename)
    vocab[space] = vocab_muse 
    wv[space] = wv_muse
    w2i[space] = w2i_muse
    print('done')

In [3]:
vocab = {}
wv = {}
w2i = {}

load_wo_normalize('bef', 'Gender-Biased Word Relation Task/data/embeddings/glove_wiki_vectors.txt', vocab, wv, w2i)
# load_wo_normalize('aft', 'Gender-Biased Word Relation Task/data/embeddings/hsrglove_wiki_vectors.txt', vocab, wv, w2i)
load_wo_normalize('aft', 'Gender-Biased Word Relation Task/hsr_ran_glove.txt', vocab, wv, w2i)

loading ...
done
loading ...
done


In [4]:
orig_glove = dict(zip(vocab['bef'], wv['bef']))
post_glove = dict(zip(vocab['aft'], wv['aft']))

In [5]:
resourceFile = 'data/' 
def load_sts_dataset(filename):
    # For a STS dataset, loads the relevant information: the sentences and their human rated similarity score.
    sent_pairs = []
    with tf.io.gfile.GFile(filename, "r") as f:
        for line in f:
            ts = line.strip().split("\t")
            if len(ts) == 7 or len(ts) == 9:
                sent_pairs.append((re.sub("[^0-9]", "", ts[2]) + '-' + ts[1] , ts[5], ts[6], float(ts[4])))
            elif len(ts) == 6 or len(ts) == 8:
                sent_pairs.append((re.sub("[^0-9]", "", ts[1]) + '-' + ts[0] , ts[4], ts[5], float(ts[3])))
            else:
                print('data format is wrong!!!')
    return pd.DataFrame(sent_pairs, columns=["year-task", "sent_1", "sent_2", "sim"])


def load_all_sts_dataset():
    # Loads all of the STS datasets 
    stsbenchmarkDir = resourceFile + 'stsbenchmark/'
    stscompanionDir = resourceFile + 'stsbenchmark/'
    sts_train = load_sts_dataset(os.path.join(stsbenchmarkDir, "sts-train.csv"))    
    sts_dev = load_sts_dataset(os.path.join(stsbenchmarkDir, "sts-dev.csv"))
    sts_test = load_sts_dataset(os.path.join(stsbenchmarkDir, "sts-test.csv"))
    sts_other = load_sts_dataset(os.path.join(stscompanionDir, "sts-other.csv"))
    sts_mt = load_sts_dataset(os.path.join(stscompanionDir, "sts-mt.csv"))
    
    sts_all = pd.concat([sts_train, sts_dev, sts_test, sts_other, sts_mt ])
    
    return sts_all

sts_all = load_all_sts_dataset()





def load_sts_by_year_task():
    # Divide STS datasets based on their year and tasks
    sts_by_year_task = {}
    
    for year_task in sts_all['year-task'].unique():
        indices = [i for i, x in enumerate(list(sts_all['year-task'])) if x == year_task]
        
        pairs = sts_all.iloc[indices]
        
        sts_by_year_task[year_task] = pairs
        
    return sts_by_year_task

sts_by_year_task = load_sts_by_year_task()




def load_sts_by_year():
    # Divide STS datasets ONLY based on their year (different tasks in that year are merged).

    sts_by_year = {}
    
    for year in ['2012', '2013', '2014', '2015', '2016', '2017']:
        indices = [i for i, x in enumerate(list(sts_all['year-task'])) if x.startswith(year)]
        
        pairs = sts_all.iloc[indices]
        pairs = pairs.copy()
        pairs['year-task'] = year
        sts_by_year[year] = pairs
        
    return sts_by_year

sts_by_year_task = load_sts_by_year_task()

sts_by_year = load_sts_by_year()


filename = resourceFile + '2015-answers-students.test.tsv'
sent_pairs = []
with tf.io.gfile.GFile(filename, "r") as f:
    for line in f:
        ts = line.strip().split("\t")
        if len(ts) == 3:
            sent_pairs.append((ts[1], ts[2], float(ts[0])))
answers_students_2015 =  pd.DataFrame(sent_pairs, columns=["sent_1", "sent_2", "sim"])


# show some sample sts data    
sts_all[:5]

Unnamed: 0,year-task,sent_1,sent_2,sim
0,2012-MSRvid,A plane is taking off.,An air plane is taking off.,5.0
1,2012-MSRvid,A man is playing a large flute.,A man is playing a flute.,3.8
2,2012-MSRvid,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8
3,2012-MSRvid,Three men are playing chess.,Two men are playing chess.,2.6
4,2012-MSRvid,A man is playing the cello.,A man seated is playing the cello.,4.25


In [6]:
def download_sick(f): 

    response = requests.get(f).text

    lines = response.split("\n")[1:]
    lines = [l.split("\t") for l in lines if len(l) > 0]
    lines = [l for l in lines if len(l) == 5]

    df = pd.DataFrame(lines, columns=["idx", "sent_1", "sent_2", "sim", "label"])
    df['sim'] = pd.to_numeric(df['sim'])
    return df
    
sick_all = download_sick("https://raw.githubusercontent.com/alvations/stasis/master/SICK-data/SICK_test_annotated.txt")

sick_all[:5]

Unnamed: 0,idx,sent_1,sent_2,sim,label
0,6,There is no boy playing outdoors and there is ...,A group of kids is playing in a yard and an ol...,3.3,NEUTRAL\r
1,7,A group of boys in a yard is playing and a man...,The young boys are playing outdoors and the ma...,3.7,NEUTRAL\r
2,8,A group of children is playing in the house an...,The young boys are playing outdoors and the ma...,3.0,NEUTRAL\r
3,10,A brown dog is attacking another animal in fro...,A brown dog is attacking another animal in fro...,4.9,ENTAILMENT\r
4,11,A brown dog is attacking another animal in fro...,A brown dog is helping another animal in front...,3.665,NEUTRAL\r


In [7]:
class Sentence:
    
    def __init__(self, sentence):
        self.raw = sentence
        normalized_sentence = sentence.replace("‘", "'").replace("’", "'")
        self.tokens = [t.lower() for t in nltk.word_tokenize(normalized_sentence)]
        
def run_benchmark(sentences1, sentences2, model_str): 
    
    model = eval(model_str)
    embeddings = []
    
    wv_len = 300
    
    if 'bert' in model_str:
        wv_len = 768
        
    for (sent1, sent2) in zip(sentences1, sentences2): 

        tokens1 =  sent1.tokens
        tokens2 =  sent2.tokens

        tokens1 = [token for token in tokens1 if token in model and token.islower()]
        tokens2 = [token for token in tokens2 if token in model and token.islower()]
        
        if tokens1 == [] and tokens2 != []:
            embedding1 = np.zeros(wv_len)
            embedding2 = np.average([model[token] for token in tokens2], axis=0)
        elif tokens2 == [] and tokens1 != []:
            embedding2 = np.zeros(wv_len)
            embedding1 = np.average([model[token] for token in tokens1], axis=0)
        elif tokens2 != [] and tokens1 != []:     
            embedding1 = np.average([model[token] for token in tokens1], axis=0)
            embedding2 = np.average([model[token] for token in tokens2], axis=0)
        else:
            embedding1 = np.zeros(wv_len)
            embedding2 = np.zeros(wv_len)

#         if isinstance(embedding1, float) or isinstance(embedding2, float):
#             embeddings.append(np.zeros(300))
#             embeddings.append(np.zeros(300))
#         else:
#             embeddings.append(embedding1)
#             embeddings.append(embedding2)
        embeddings.append(embedding1)
        embeddings.append(embedding2)


    sims = [cosine_similarity(embeddings[idx*2].reshape(1, -1), embeddings[idx*2+1].reshape(1, -1))[0][0] for idx in range(int(len(embeddings)/2))]
    return sims

def run_experiment(df, benchmarks): 
    
    sentences1 = [Sentence(s) for s in df['sent_1']]
    sentences2 = [Sentence(s) for s in df['sent_2']]
    
    pearson_cors, spearman_cors = [], []
    for label, method in benchmarks:
        sims = method(sentences1, sentences2)
        pearson_correlation = round(scipy.stats.pearsonr(sims, df['sim'])[0] * 100,2)
        #print(label, pearson_correlation)
        pearson_cors.append(pearson_correlation)
        
    return pearson_cors

In [8]:
import nltk
nltk.download('punkt')
benchmarks = [
     ("orig-glove", ft.partial(run_benchmark, model_str= 'orig_glove')),
    ("HSR-glove", ft.partial(run_benchmark, model_str= 'post_glove'))]

pearson_results_year_task = {}

for year_task in sts_all['year-task'].unique():
    print('STS-' + year_task)
    pearson_results_year_task['STS-' + year_task] = run_experiment(sts_by_year_task[year_task], benchmarks)  
    
pearson_results_year_task['SICK'] = run_experiment(sick_all, benchmarks) 
pearson_results_year_task['2015-answers_students'] = run_experiment(answers_students_2015, benchmarks)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\biagi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


STS-2012-MSRvid
STS-2014-images
STS-2015-images
STS-2014-deft-forum
STS-2012-MSRpar
STS-2014-deft-news
STS-2013-headlines
STS-2014-headlines
STS-2015-headlines
STS-2016-headlines
STS-2017-track5.en-en
STS-2015-answers-forums
STS-2016-answer-answer
STS-2012-surprise.OnWN
STS-2013-FNWN
STS-2013-OnWN
STS-2014-OnWN
STS-2014-tweet-news
STS-2015-belief
STS-2016-plagiarism
STS-2016-question-question
STS-2012-SMTeuroparl
STS-2012-surprise.SMTnews
STS-2016-postediting


In [9]:
pearson_results_year_task_df = pd.DataFrame(pearson_results_year_task)
pearson_results_year_task_df = pearson_results_year_task_df.transpose()
pearson_results_year_task_df = pearson_results_year_task_df.rename(columns={i:b[0] for i, b in enumerate(benchmarks)})

pearson_results_year_task_df=pearson_results_year_task_df.reindex(['STS-2012-MSRpar', 'STS-2012-MSRvid', 'STS-2012-surprise.OnWN', 'STS-2012-SMTeuroparl', 'STS-2012-surprise.SMTnews','STS-2013-FNWN', 'STS-2013-OnWN', 'STS-2013-headlines',  'STS-2014-OnWN', 'STS-2014-deft-forum','STS-2014-deft-news', 'STS-2014-headlines', 'STS-2014-tweet-news',  'STS-2014-images', 'STS-2015-answers-forums', '2015-answers_students', 'STS-2015-belief',  'STS-2015-headlines', 'STS-2015-images', 'SICK'])

In [10]:
pearson_results_year_task_df.to_csv('hsr_ran.csv')

In [10]:
# pearson_results_year_task_df.to_csv('hsr.csv')

In [19]:
# orig_2012 = np.mean(pearson_results_year_task_df.iloc[:5, 0])
# orig_2013 = np.mean(pearson_results_year_task_df.iloc[5:8, 0])
# orig_2014 = np.mean(pearson_results_year_task_df.iloc[8:14, 0])
# orig_2015 = np.mean(pearson_results_year_task_df.iloc[14:19, 0])
# orig_SICK = np.mean(pearson_results_year_task_df.iloc[19, 0])
# HSR_2012 = np.mean(pearson_results_year_task_df.iloc[:5, 1])
# HSR_2013 = np.mean(pearson_results_year_task_df.iloc[5:8, 1])
# HSR_2014 = np.mean(pearson_results_year_task_df.iloc[8:14, 1])
# HSR_2015 = np.mean(pearson_results_year_task_df.iloc[14:19, 1])
# HSR_SICK = np.mean(pearson_results_year_task_df.iloc[19, 1])

In [11]:
orig_2012 = np.mean(pearson_results_year_task_df.iloc[:5, 0])
orig_2013 = np.mean(pearson_results_year_task_df.iloc[5:8, 0])
orig_2014 = np.mean(pearson_results_year_task_df.iloc[8:14, 0])
orig_2015 = np.mean(pearson_results_year_task_df.iloc[14:19, 0])
orig_SICK = np.mean(pearson_results_year_task_df.iloc[19, 0])
HSRRAN_2012 = np.mean(pearson_results_year_task_df.iloc[:5, 1])
HSRRAN_2013 = np.mean(pearson_results_year_task_df.iloc[5:8, 1])
HSRRAN_2014 = np.mean(pearson_results_year_task_df.iloc[8:14, 1])
HSRRAN_2015 = np.mean(pearson_results_year_task_df.iloc[14:19, 1])
HSRRAN_SICK = np.mean(pearson_results_year_task_df.iloc[19, 1])

In [14]:
# results_STS = {'orig_glove':[orig_2012, orig_2013, orig_2014, orig_2015, orig_SICK], 'HSR_glove':[HSR_2012, HSR_2013, HSR_2014, HSR_2015, HSR_SICK]}
results_STS = {'orig_glove':[orig_2012, orig_2013, orig_2014, orig_2015, orig_SICK], 'HSR-RAN_glove':[HSRRAN_2012, HSRRAN_2013, HSRRAN_2014, HSRRAN_2015, HSRRAN_SICK]}
results_STS = pd.DataFrame(results_STS)
results_STS.index = ['2012', '2013', '2014', '2015', 'SICK']
results_STS

Unnamed: 0,orig_glove,HSR-RAN_glove
2012,48.918,49.96
2013,46.896667,49.673333
2014,51.023333,53.871667
2015,51.35,53.418
SICK,62.11,61.98
