In [12]:
!pip install numpy pandas matplotlib nltk fasttext-wheel gensim unidecode



In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download("punkt")
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def on_bad_line(values):
    return values[:7]

columns_mapping = {
    0: 'genre',
    1: 'filename',
    2: 'year',
    3: 'index',
    4: 'score',
    5: 'sentence1',
    6: 'sentence2'
}

# quotings 3 = csv.QUOTE_NONE
df = pd.read_csv('Dataset/sts-train.csv', sep="\t", on_bad_lines=on_bad_line, engine='python', header=None, encoding='utf-8', quoting=3).rename(columns=columns_mapping)
print(f'shape of the Dataframe {df.shape}')
df.head(3)

shape of the Dataframe (5749, 7)


Unnamed: 0,genre,filename,year,index,score,sentence1,sentence2
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...


In [15]:
sentences1 = df["sentence1"]
sentences2 = df["sentence2"]
raw_corpus = np.concatenate((sentences1, sentences2))

stop_words = stopwords.words('english')

In [16]:
def preprocess(sentence):
    result = re.sub(f'[{punctuation}]','',sentence).lower()
    result = re.sub('\W', ' ', result).split()
    return [w for w in result if w not in stop_words]

In [17]:
from gensim.models import Word2Vec

tokenized_corpus = [preprocess(sentence) for sentence in raw_corpus]
model = Word2Vec(tokenized_corpus, vector_size=60, min_count=10, window=15, negative=10)
print(len(model.wv))

1534


### Represent sentences as vector (of fixed length)

In [18]:
print(model.wv.most_similar('cat'))

[('runs', 0.9956508874893188), ('small', 0.9955808520317078), ('water', 0.9954851269721985), ('cow', 0.9954419732093811), ('looking', 0.9950761198997498), ('walking', 0.9946889281272888), ('grassy', 0.9937186241149902), ('blue', 0.993364155292511), ('camera', 0.9933255314826965), ('mouth', 0.9930509328842163)]


In [19]:
import gensim.downloader as api

model_wiki = api.load("glove-wiki-gigaword-300")

In [20]:
print(model_wiki.most_similar('paris'))

[('prohertrib', 0.8032476902008057), ('france', 0.6580671668052673), ('french', 0.5986923575401306), ('brussels', 0.5912898778915405), ('london', 0.5776094198226929), ('strasbourg', 0.528137743473053), ('parisian', 0.5212979912757874), ('rome', 0.5149272084236145), ('berlin', 0.5125114917755127), ('frankfurt', 0.5043601393699646)]


In [24]:
from unidecode import unidecode

def vectorize(docs, embedding_model, useSum=False, vector_size=300):
    """
    docs are the tweets

    """

    # crea una matrice vuota con righe pari al numero di tweets e colonne pari alla dimensione dell'embedding
    vectors = np.zeros((len(docs),vector_size))

    # cicla sui tweet
    for i in range(len(docs)):
        tokens = re.sub(f'[{punctuation}]', '', docs[i].lower()).split() # tokenizza
        tokens = [unidecode(t) for t in tokens if t not in stop_words]
        embeddings = [embedding_model.get_vector(token) for token in tokens if token in embedding_model] # embedding per ogni token
        if (len(embeddings) > 0): # unisce tutti gli embedding in base a useSum
            if (useSum): 
                vectors[i] = sum(embeddings)
            else:
                vectors[i] = np.mean(embeddings, axis=0)
    return vectors


In [40]:
def most_similar(sentence_index):
    all_sentence_combinations = []
    for j in range(0, len(similarity_matrix_sum)):
            all_sentence_combinations.append([similarity_matrix_sum[sentence_index][j], sentence_index, j])
    all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)
    print("Sentence: " + set_raw_corpus[sentence_index])
    print("Top-30 most similar pairs:")
    for score, sentence_index, j in all_sentence_combinations[:100]:
        print("{} \t {:.4f}".format(set_raw_corpus[j], similarity_matrix_sum[sentence_index][j]))

most_similar(100)

Sentence: A girl in a pink dress rides her bike along a white picket fence.
Top-30 most similar pairs:
A man is standing in front of a raised statue,. 	 3.3179
A young boy is twisting a sparkler around in the air. 	 3.3170
The young boy is jumping into the pool on his belly. 	 3.3165
A man is watching TV on the foot of his bed. 	 3.3158
Her body was found several weeks later in the Green River. 	 3.3146
A man shows off a small monkey. 	 3.3143
A small monkey walks through water. 	 3.3142
A man sits on a rock next to a folding deck chair and a fishing pole. 	 3.3141
A dog jumping over an obstacle course fence. 	 3.3133
Two cows cross a paved street. 	 3.3123


In [34]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import minmax_scale

set_raw_corpus = list(set(raw_corpus))

vectorized_corpus_sum = vectorize(docs=set_raw_corpus, embedding_model=model_wiki, useSum=True)
vectorized_corpus_mean = vectorize(docs=set_raw_corpus, embedding_model=model_wiki, useSum=False)
similarity_matrix_sum = cosine_similarity(vectorized_corpus_sum)
similarity_matrix_mean = cosine_similarity(vectorized_corpus_mean)
print(similarity_matrix_sum.shape)

# mappatura dei valori
similarity_matrix_sum = minmax_scale(similarity_matrix_sum, feature_range=(0, 5))
similarity_matrix_mean = minmax_scale(similarity_matrix_mean, feature_range=(0, 5))

(10566, 10566)


In [14]:
def check_similarity(i: int, j: int):
    sim_sum = similarity_matrix_sum[i][j]
    sim_mean = similarity_matrix_mean[i][j]
    print(f'{i}: {raw_corpus[i]}')
    print(f'{j}: {raw_corpus[j]}')
    print(f'similarity sum: {sim_sum}; mean: {sim_mean}')


i = 666
offset = int(len(raw_corpus) / 2)
j = i + offset

check_similarity(i, j)

scores_true = np.ravel(df['score'].values)
print(f'true score is {scores_true[i]}')

666: The lady put the baby int a trashcan.
6415: A woman puts a baby in a trash can.
similarity sum: 3.9717728932007503; mean: 3.9717728961387406
true score is 5.0


### FastText - sub-word embeddings

In [15]:
no_punct_lower_corpus = []
for sentence in raw_corpus:
    sentence = re.sub(f'[{punctuation}]','',sentence).lower()
    sentence = re.sub(f'[{stop_words}]','',sentence)
    no_stopwords_list = []
    for word in sentence.split(' '):
        if word not in stop_words:
            no_stopwords_list.append(word)
    sentence = ' '.join(no_stopwords_list)
    no_punct_lower_corpus.append(sentence)
no_punct_lower_corpus

  sentence = re.sub(f'[{stop_words}]','',sentence)


['plane taking',
 'man playing large flute',
 'man spreading shreded cheese pizza',
 'three men playing chess',
 'man playing cello',
 'men fighting',
 'man smoking',
 'man playing piano',
 'man playing guitar singing',
 'person throwing cat ceiling',
 'man hit man stick',
 'woman picks holds baby kangaroo',
 'man playing flute',
 'person folding piece paper',
 'man running road',
 'dog trying get bacon back',
 'polar bear sliding snow',
 'woman writing',
 'cat rubbing babys face',
 'man riding horse',
 'man pours oil pot',
 'man playing guitar',
 'panda sliding slide',
 'woman eating something',
 'woman peels potato',
 'boy fell bike',
 'woman playing flute',
 'rabbit running eagle',
 'woman frying breaded pork chop',
 'girl flying kite',
 'man riding mechanical bull',
 'man playing guitar',
 'woman dancing singing women',
 'man slicing bun',
 'man pouring oil pan',
 'lion playing people',
 'dog rides skateboard',
 'someone carving statue',
 'woman slicing onion',
 'woman peels shrimp

In [16]:
import fasttext


with open('./data.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(set(no_punct_lower_corpus)))

ft_skip_model = fasttext.train_unsupervised('data.txt', model='skipgram', dim=30, minCount=5,minn=3, maxn=10, ws=10)
ft_model = fasttext.load_model('./cc.en.300.bin')



In [6]:
from gensim.models import FastText 
from gensim.models.fasttext import load_facebook_model

model = load_facebook_model('./cc.en.300.bin')

AttributeError: 'FastText' object has no attribute 'most_similar'

In [18]:
print(model.wv.most_similar('chicen'))
# Output = [('headteacher', 0.8075869083404541), ('schoolteacher', 0.7955552339553833), ('teachers', 0.733420729637146), ('teaches', 0.6839243173599243), ('meacher', 0.6825737357139587), ('teach', 0.6285147070884705), ('taught', 0.6244685649871826), ('teaching', 0.6199781894683838), ('schoolmaster', 0.6037642955780029), ('lessons', 0.5812176465988159)]

print(model.wv.similarity('teacher', 'teaches'))
# Output = 0.683924396754

[('chicenvelopements', 0.4541935622692108), ('QQFZAAEACwAAAAAGQASAAAIjgAJCBQIoGDBgQgTKiwooGHDgwshDgTgsOLDhAAGaAQwUYBBhx85EtS4cWLGjR5JSjxZkgDFkwwLohTJUqTLlANiwvQ4seVNjwwfBoVokKjFo0Jlksz506NFiklZtoQKFSjIoktLVv1YsahSn1WP0vzq02VYoAjJMsVYVKHZrDbdupW6Vq5cunHtRjQoMCAAIfkECRQABAAsCQADAAQABAAACAsABQgkILCgwYEBAQAh', 0.4225360155105591), ('crescendosexibloguerobateyabsorbersexiindesignabledinerolatifundiosexibrezarcularsutesexirapoplinbrezarcorrentosoVd.lazadareflejoreglafeministabrezarchuzasexiouttiqueblogueroin', 0.4138853847980499), ('CrônicasEsdrasNeemiasEsterJóSalmosProvérbiosEclesiastesCânticosIsaíasJeremiasLamentaçõesEzequielDanielOséiasJoelAmósObadiasJonasMiquéiasNaumHabacuqueSofoniasAgeuZacariasMalaquiasNovo', 0.405648797750473), ('PyshmaRevdaBeryozovskySredneuralsk', 0.4026032090187073), ('deblogueroreflejoantecedentesexitlacuachebateysuteindesignableabsorbersexilatifundiosexibrezarsutemultiétnicosexiplinrapobrezarcorrentosoVd.lazadafisiochillidomabrezarsico-chuzaoutcolodrablogueroin', 