In [7]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from nltk.tokenize import word_tokenize

from string import punctuation
from unidecode import unidecode
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import spacy
spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2023-05-27 11:14:35.301871: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-27 11:14:54.385205: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [8]:
def on_bad_line(values):
    return values[:7]

columns_mapping = {
    0: 'genre',
    1: 'filename',
    2: 'year',
    3: 'index',
    4: 'score',
    5: 'sentence1',
    6: 'sentence2'
}

# quotings 3 = csv.QUOTE_NONE
train_df = pd.read_csv('Dataset/sts-train.csv', sep="\t", on_bad_lines=on_bad_line, engine='python', header=None, encoding='utf-8', quoting=3).rename(columns=columns_mapping)
print(f'shape of the Dataframe {train_df.shape}')
train_df.head(10)

shape of the Dataframe (5749, 7)


Unnamed: 0,genre,filename,year,index,score,sentence1,sentence2
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.
5,main-captions,MSRvid,2012test,11,4.25,Some men are fighting.,Two men are fighting.
6,main-captions,MSRvid,2012test,12,0.5,A man is smoking.,A man is skating.
7,main-captions,MSRvid,2012test,13,1.6,The man is playing the piano.,The man is playing the guitar.
8,main-captions,MSRvid,2012test,14,2.2,A man is playing on a guitar and singing.,A woman is playing an acoustic guitar and sing...
9,main-captions,MSRvid,2012test,16,5.0,A person is throwing a cat on to the ceiling.,A person throws a cat on the ceiling.


In [9]:
test_df = pd.read_csv('Dataset/sts-test.csv', sep="\t", on_bad_lines=on_bad_line, engine='python', header=None, encoding='utf-8', quoting=3).rename(columns=columns_mapping)
print(f'shape of the Dataframe {train_df.shape}')

shape of the Dataframe (5749, 7)


In [10]:
def vectorize(sentences, vocab, vectorizer, vector_size=100, tokenizer=lambda x: x.split()):
    # create a matrix with rows equal to the number of tweets and columns equal to the embedding size
    vectors = np.zeros((len(sentences), vector_size))

    # for each sentence in the dataset compute the embedding 
    for i, sentence in enumerate(sentences):
        tokens = tokenizer(sentence)
        embeddings = [vectorizer(token) for token in tokens if token in vocab] # embedding per ogni token
        if (len(embeddings) > 0): # unisce tutti gli embedding in base a useSum
            vectors[i] = np.mean(embeddings, axis=0)
    return vectors

In [11]:
stop_words = stopwords.words('english')

def preprocess_and_tokenize(sentence: str):
    temp = ' '.join([word.lemma_.lower() for word in spacy_nlp(sentence) if word.lemma_.lower() not in stop_words])
    temp = re.sub(f'[{punctuation}]', '', unidecode(temp))
    #temp = re.sub(r'\d+(\.\d+)?', 'number', temp)
    return word_tokenize(temp)

In [12]:
sentences1 = train_df["sentence1"]
sentences2 = train_df["sentence2"]
sentences_list = np.unique(np.concatenate((np.ravel(sentences1.values),np.ravel(sentences2.values))))

print(f"Total number of sentences: {len(sentences_list)}")
tokenized_sentences = [preprocess_and_tokenize(w) for w in sentences_list]

Total number of sentences: 10566


In [13]:

model = Word2Vec(tokenized_sentences, vector_size=50, min_count=4, window=5, negative=10, workers=4, sg=0, cbow_mean=1, epochs=150, seed=42)
print("vocabular size:", len(model.wv.key_to_index))

#print most similar words to
for item in ['car', 'boy', 'man', 'girl', 'woman', 'obama', 'florida', 'cat', 'dog', 'blue', 'kill', 'bomb', 'piano', 'pizza', 'italy']:
    print(item, '-->', [w for w, s in model.wv.most_similar(item)])

vocabular size: 3159
car --> ['motorcycle', 'bicycle', 'truck', '18', 'silver', 'desert', 'sidewalk', 'bus', 'vehicle', 'explode']
boy --> ['kid', 'girl', 'child', 'woman', 'man', 'shirt', 'surfer', 'ball', 'dog', 'person']
man --> ['woman', 'girl', 'person', 'suit', 'guy', 'boy', 'child', 'kid', 'dog', 'animal']
girl --> ['woman', 'man', 'boy', 'couple', 'kid', 'pant', 'blue', 'child', 'jacket', 'white']
woman --> ['man', 'girl', 'boy', 'person', 'lady', 'someone', 'suit', 'guy', 'short', 'baby']
obama --> ['rebuild', 'defend', 'let', 'decision', 'want', 'ann', 'cameron', 'kerry', 'vow', 'haiti']
florida --> ['speaker', 'convention', 'outfielder', 'bob', 'hospital', 'voting', 'premier', 'cuba', 'kennedy', 'passenger']
cat --> ['kitten', 'blanket', 'dog', 'leather', 'animal', 'couch', 'suit', 'panda', 'grey', 'laptop']
dog --> ['animal', 'puppy', 'mouth', 'ball', 'kitten', 'cat', 'panda', 'snow', 'cow', 'man']
blue --> ['pink', 'green', 'orange', 'yellow', 'gray', 'red', 'black', 'girl

In [14]:
temp = []
for a, b in zip(train_df["sentence1"], train_df["sentence2"]):
    temp.append(np.ravel(vectorize([a, b], model.wv.key_to_index, model.wv.get_vector, tokenizer=preprocess_and_tokenize, vector_size=50)))
    temp.append(np.ravel(vectorize([b, a], model.wv.key_to_index, model.wv.get_vector, tokenizer=preprocess_and_tokenize, vector_size=50)))
X_train = np.array(temp)
y_train = np.repeat(train_df["score"].values / 5, 2)


temp = []
for a, b in zip(test_df["sentence1"], test_df["sentence2"]):
    temp.append(np.ravel(vectorize([a, b], model.wv.key_to_index, model.wv.get_vector, tokenizer=preprocess_and_tokenize, vector_size=50)))
    temp.append(np.ravel(vectorize([b, a], model.wv.key_to_index, model.wv.get_vector, tokenizer=preprocess_and_tokenize, vector_size=50)))
X_test = np.array(temp)
y_test = np.repeat(test_df["score"].values / 5, 2)

In [15]:
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (11498, 100)
y_train shape: (11498,)
X_test shape: (2758, 100)
y_test shape: (2758,)


In [16]:
def train_and_test(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
    print(f'Pearson: {pearsonr(y_test, y_pred)}')
    print(f'Spearman: {spearmanr(y_test, y_pred)}')
    print('**********')

In [17]:
print('RandomForestRegressor')
train_and_test(RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))

RandomForestRegressor
MSE: 0.07412351184580035
MAE: 0.22826399473988368
Pearson: PearsonRResult(statistic=0.484974381597237, pvalue=1.0577702560060894e-162)
Spearman: SignificanceResult(statistic=0.44503930574870004, pvalue=2.7557831515175414e-134)
**********


In [18]:
import gensim.downloader as api

model_wiki = api.load("glove-wiki-gigaword-50")

In [19]:
temp = []
for a, b in zip(train_df["sentence1"], train_df["sentence2"]):
    temp.append(np.ravel(vectorize([a, b], model_wiki.key_to_index, model_wiki.get_vector, tokenizer=preprocess_and_tokenize, vector_size=50)))
    temp.append(np.ravel(vectorize([b, a], model_wiki.key_to_index, model_wiki.get_vector, tokenizer=preprocess_and_tokenize, vector_size=50)))
X_train = np.array(temp)
y_train = np.repeat(train_df["score"].values / 5, 2)


temp = []
for a, b in zip(test_df["sentence1"], test_df["sentence2"]):
    temp.append(np.ravel(vectorize([a, b], model_wiki.key_to_index, model_wiki.get_vector, tokenizer=preprocess_and_tokenize, vector_size=50)))
    temp.append(np.ravel(vectorize([b, a], model_wiki.key_to_index, model_wiki.get_vector, tokenizer=preprocess_and_tokenize, vector_size=50)))
X_test = np.array(temp)
y_test = np.repeat(test_df["score"].values / 5, 2)

In [20]:
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (11498, 100)
y_train shape: (11498,)
X_test shape: (2758, 100)
y_test shape: (2758,)


In [21]:
def train_and_test(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
    print(f'Pearson: {pearsonr(y_test, y_pred)}')
    print(f'Spearman: {spearmanr(y_test, y_pred)}')
    print('**********')

In [22]:
print('RandomForestRegressor')
train_and_test(RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))

RandomForestRegressor
MSE: 0.07175080290579616
MAE: 0.22539558337626156
Pearson: PearsonRResult(statistic=0.5247575093596497, pvalue=5.026872589242806e-195)
Spearman: SignificanceResult(statistic=0.4938608129421847, pvalue=1.4826716327701897e-169)
**********
