In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download("punkt")
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
import re

[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def on_bad_line(values):
    return values[:7]

columns_mapping = {
    0: 'genre',
    1: 'filename',
    2: 'year',
    3: 'index',
    4: 'score',
    5: 'sentence1',
    6: 'sentence2'
}

# quotings 3 = csv.QUOTE_NONE
df = pd.read_csv('Dataset/sts-train.csv', sep="\t", on_bad_lines=on_bad_line, engine='python', header=None, encoding='utf-8', quoting=3).rename(columns=columns_mapping)
print(f'shape of the Dataframe {df.shape}')
df.head(3)

shape of the Dataframe (5749, 7)


Unnamed: 0,genre,filename,year,index,score,sentence1,sentence2
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...


In [3]:
sentences1 = df["sentence1"]
sentences2 = df["sentence2"]
raw_corpus = np.concatenate((sentences1, sentences2))

stop_words = stopwords.words('english')

In [4]:
def preprocess(sentence):
    result = re.sub(f'[{punctuation}]','',sentence).lower()
    result = re.sub('\W', ' ', result).split()
    return [w for w in result if w not in stop_words]

In [5]:
from gensim.models import Word2Vec

tokenized_corpus = [preprocess(sentence) for sentence in raw_corpus]
model = Word2Vec(tokenized_corpus, vector_size=60, min_count=10, window=15, negative=10)
print(len(model.wv))

1534


### Represent sentences as vector (of fixed length)

In [6]:
print(model.wv.most_similar('cat'))

[('looking', 0.9973329901695251), ('large', 0.9960376024246216), ('small', 0.9959952235221863), ('camera', 0.9958369135856628), ('runs', 0.9957623481750488), ('beach', 0.9955052733421326), ('grassy', 0.995479166507721), ('blue', 0.9951849579811096), ('water', 0.994867205619812), ('cow', 0.9948633313179016)]


In [7]:
import gensim.downloader as api

model_wiki = api.load("glove-wiki-gigaword-50")

In [8]:
print(model_wiki.most_similar('paris'))

[('prohertrib', 0.8611263036727905), ('france', 0.8025329113006592), ('brussels', 0.7796469926834106), ('amsterdam', 0.7769756317138672), ('french', 0.7736119031906128), ('vienna', 0.7394115328788757), ('london', 0.7294350266456604), ('berlin', 0.7261149287223816), ('rome', 0.7099411487579346), ('strasbourg', 0.7078796029090881)]


In [9]:
from unidecode import unidecode

def vectorize(docs, embedding_model, useSum=False):
    """
    docs are the tweets

    """

    # crea una matrice vuota con righe pari al numero di tweets e colonne pari alla dimensione dell'embedding
    vectors = np.zeros((len(docs),50))

    # cicla sui tweet
    for i in range(len(docs)):
        tokens = re.sub(f'[{punctuation}]', '', docs[i].lower()).split() # tokenizza
        tokens = [unidecode(t) for t in tokens if t not in stop_words]
        embeddings = [embedding_model.get_vector(token) for token in tokens if token in embedding_model] # embedding per ogni token
        if (len(embeddings) > 0): # unisce tutti gli embedding in base a useSum
            if (useSum): 
                vectors[i] = sum(embeddings)
            else:
                vectors[i] = np.mean(embeddings, axis=0)
    return vectors


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import minmax_scale

vectorized_corpus_sum = vectorize(docs=raw_corpus, embedding_model=model_wiki, useSum=True)
vectorized_corpus_mean = vectorize(docs=raw_corpus, embedding_model=model_wiki, useSum=False)
similarity_matrix_sum = cosine_similarity(vectorized_corpus_sum)
similarity_matrix_mean = cosine_similarity(vectorized_corpus_mean)
print(similarity_matrix_sum.shape)

# mappatura dei valori
similarity_matrix_sum = minmax_scale(similarity_matrix_sum, feature_range=(0, 5))
similarity_matrix_mean = minmax_scale(similarity_matrix_mean, feature_range=(0, 5))

(11498, 11498)


In [11]:
def most_similar(sentence_index):
    all_sentence_combinations = []
    for j in range(0, len(similarity_matrix_sum)):
            all_sentence_combinations.append([similarity_matrix_sum[sentence_index][j], sentence_index, j])
    all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)
    print("Sentence: " + raw_corpus[sentence_index])
    print("Top-30 most similar pairs:")
    for score, sentence_index, j in all_sentence_combinations[0:30]:
        print("{} \t {:.4f}".format(raw_corpus[j], similarity_matrix_sum[sentence_index][j]))

most_similar(120)

Sentence: A woman is dancing.
Top-30 most similar pairs:
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing. 	 1.0000
A woman is dancing and singing with other women. 	 0.9674
A woman is dancing with other women. 	 0.9630
A man and woman dance. 	 0.9613
A man is dancing. 	 0.9583
A man is dancing. 	 0.9583
A  man is dancing. 	 0.9583
A man is dancing. 	 0.9583
A man is dancing. 	 0.9583
The man is dancing. 	 0.9583
The man is dancing. 	 0.9583
A man is dancing. 	 0.9583
A man is dancing. 	 0.9583
A man is dancing. 	 0.9583
A man is dancing. 	 0.9583
A man is dancing. 	 0.9583
A woman is dancing on a stage. 	 0.9573
A woman is dancing in a cage. 	 0.9563
The woman is singing. 	 0.9539
A woman is holding a dancing baby up. 	 0.9418


In [17]:
def check_similarity(i: int, j: int):
    sim_sum = similarity_matrix_sum[i][j]
    sim_mean = similarity_matrix_mean[i][j]
    print(f'{i}: {raw_corpus[i]}')
    print(f'{j}: {raw_corpus[j]}')
    print(f'similarity sum: {sim_sum}; mean: {sim_mean}')


i = 666
offset = int(len(raw_corpus) / 2)
j = i + offset

check_similarity(i, j)

scores_true = np.ravel(df['score'].values)
print(f'true score is {scores_true[i]}')

666: The lady put the baby int a trashcan.
6415: A woman puts a baby in a trash can.
similarity sum: 3.9717728932007503; mean: 3.9717728961387406
true score is 5.0


In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer.tokenize("The unemployment rate is predicted to have ticked up a percentage point to 6.1%"))

  from .autonotebook import tqdm as notebook_tqdm


['The', 'Ġunemployment', 'Ġrate', 'Ġis', 'Ġpredicted', 'Ġto', 'Ġhave', 'Ġtick', 'ed', 'Ġup', 'Ġa', 'Ġpercentage', 'Ġpoint', 'Ġto', 'Ġ6', '.', '1', '%']
