In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download("punkt")
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
import re

[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def on_bad_line(values):
    return values[:7]

columns_mapping = {
    0: 'genre',
    1: 'filename',
    2: 'year',
    3: 'index',
    4: 'score',
    5: 'sentence1',
    6: 'sentence2'
}

# quotings 3 = csv.QUOTE_NONE
df = pd.read_csv('Dataset/sts-train.csv', sep="\t", on_bad_lines=on_bad_line, engine='python', header=None, encoding='utf-8', quoting=3).rename(columns=columns_mapping)
print(f'shape of the Dataframe {df.shape}')
df.head(10)

shape of the Dataframe (5749, 7)


Unnamed: 0,genre,filename,year,index,score,sentence1,sentence2
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.
5,main-captions,MSRvid,2012test,11,4.25,Some men are fighting.,Two men are fighting.
6,main-captions,MSRvid,2012test,12,0.5,A man is smoking.,A man is skating.
7,main-captions,MSRvid,2012test,13,1.6,The man is playing the piano.,The man is playing the guitar.
8,main-captions,MSRvid,2012test,14,2.2,A man is playing on a guitar and singing.,A woman is playing an acoustic guitar and sing...
9,main-captions,MSRvid,2012test,16,5.0,A person is throwing a cat on to the ceiling.,A person throws a cat on the ceiling.


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.cluster.hierarchy import dendrogram, linkage
from tqdm import tqdm
import scipy.sparse as scs

In [4]:
sentences1 = df["sentence1"]
sentences2 = df["sentence2"]
raw_corpus = np.concatenate((sentences1, sentences2))

stop_words = stopwords.words('english')

regex = f'[{punctuation}]'
corpus = np.array([re.sub(regex,'',string).lower() for string in raw_corpus])

# creazione della matrice di similarità
tfidf = TfidfVectorizer(max_df=0.8, min_df=5, stop_words='english', strip_accents='ascii')
tfidf_matrix = tfidf.fit_transform(corpus)
similarity_matrix = cosine_similarity(tfidf_matrix)
vocab = tfidf.get_feature_names_out()

print(tfidf_matrix.shape)

(11498, 2877)


In [5]:
# Get the terms (columns) of the matrix
terms = tfidf.get_feature_names_out()
print(terms)

word = 'dog'
word_id = tfidf.transform([word]).nonzero()[1]
filter = tfidf_matrix.getcol(word_id).nonzero()[0]
print(f'the word {word} is present in {len(filter)} documents. Here\'s the first 5')
print(corpus[filter][:5])

['04' '07' '10' ... 'zimbabwe' 'zimmerman' 'zone']
the word dog is present in 433 documents. Here's the first 5
['a dog is trying to get bacon off his back' 'a dog rides a skateboard'
 'a dog licks a baby' 'a dog is eating water melon'
 'a small dog is chasing a yoga ball']


In [17]:
docs = ['the black cat is on the plane, near the white cat', 'another cat is waiting for its food', 'càt orange plane']
dt_matrix = tfidf.transform(docs)
print(dt_matrix)

  (0, 2805)	0.2952740316253059
  (0, 1902)	0.405078758431841
  (0, 1695)	0.3874740562788338
  (0, 467)	0.7129072478984504
  (0, 345)	0.30058535779332396
  (1, 2759)	0.6709395536042083
  (1, 1026)	0.5748842386096671
  (1, 467)	0.46834626891603315
  (2, 1902)	0.5759350456688125
  (2, 1794)	0.6414452697963486
  (2, 467)	0.50680054165949


In [18]:
k = 30
kmeans = KMeans(n_clusters=k, max_iter=100, n_init=2, verbose=True)
cluster_matrix = kmeans.fit_transform(tfidf_matrix)


Initialization complete
Iteration 0, inertia 19312.619734942713.
Iteration 1, inertia 10543.630351754517.
Iteration 2, inertia 10491.200139302586.
Iteration 3, inertia 10459.384151765547.
Iteration 4, inertia 10451.401064134441.
Iteration 5, inertia 10444.212212742817.
Iteration 6, inertia 10437.963965781455.
Iteration 7, inertia 10434.200905290123.
Iteration 8, inertia 10429.810610515426.
Iteration 9, inertia 10427.145002937279.
Iteration 10, inertia 10423.162686221849.
Iteration 11, inertia 10417.501258545039.
Iteration 12, inertia 10389.402654884878.
Iteration 13, inertia 10384.753471356591.
Iteration 14, inertia 10384.109599476968.
Iteration 15, inertia 10383.487712229798.
Iteration 16, inertia 10383.0420604167.
Iteration 17, inertia 10382.811282508505.
Iteration 18, inertia 10382.681314184527.
Iteration 19, inertia 10382.62037156886.
Iteration 20, inertia 10382.603029655944.
Iteration 21, inertia 10382.595329231895.
Iteration 22, inertia 10382.58875625161.
Converged at iteration 2

In [19]:
print("Top terms per cluster:")
vocab = tfidf.get_feature_names_out()

for i in range(kmeans.n_clusters):
    centroid = kmeans.cluster_centers_[i]    
    sorted_terms = centroid.argsort()[::-1]
    print(f"Cluster {i}:\t{[vocab[j] for j in sorted_terms[:5]]}")

Top terms per cluster:
Cluster 0:	['women', 'sitting', 'smiling', 'standing', 'sofa']
Cluster 1:	['sand', 'dog', 'running', 'brown', 'dogs']
Cluster 2:	['running', 'dog', 'grass', 'dogs', 'beach']
Cluster 3:	['people', 'group', 'sitting', 'table', 'restaurant']
Cluster 4:	['korea', 'north', 'south', 'nuclear', 'missiles']
Cluster 5:	['girl', 'new', 'boy', 'police', 'dead']
Cluster 6:	['dancing', 'woman', 'man', 'rain', 'group']
Cluster 7:	['percent', 'shares', 'points', 'nasdaq', 'index']
Cluster 8:	['iran', 'nuclear', 'talks', 'peace', 'deal']
Cluster 9:	['snow', 'dog', 'dogs', 'black', 'running']
Cluster 10:	['playing', 'guitar', 'man', 'flute', 'piano']
Cluster 11:	['man', 'slicing', 'cutting', 'walking', 'rope']
Cluster 12:	['drug', 'capital', 'trafficking', 'police', 'laws']
Cluster 13:	['riding', 'horse', 'man', 'woman', 'motorcycle']
Cluster 14:	['china', 'usd', 'yuan', 'stocks', 'strengthens']
Cluster 15:	['white', 'black', 'cat', 'cow', 'dog']
Cluster 16:	['dont', 'exist', 'fi

In [9]:
from gensim.models import Word2Vec

analyzer = tfidf.build_analyzer()
tokenized_corpus = [analyzer(w) for w in corpus]
model = Word2Vec(tokenized_corpus, vector_size=30, min_count=5, window=10)
print(len(model.wv))

2892


In [10]:
# todo fare con operazioni vettoriali
def mix(vectorizer: TfidfVectorizer, embeddings: Word2Vec, document) -> np.array:
    vectorized = vectorizer.transform([document])
    cols = vectorized.nonzero()[1]
    result = np.zeros(30)
    weights = 0
    terms = tfidf.get_feature_names_out()

    for x in cols:
        word = terms[x]
        try:
            embed = embeddings.wv[word]
        except:
            continue
        weight = vectorized[0, x]
        result += weight * embed
        weights += weight
        
    result = result / weights
    if (np.isnan(result).any() or weights == 0):
        return np.zeros(30)
    return result

In [25]:
mixed_corpus = np.array([mix(tfidf, model, w) for w in corpus])

  result = result / weights


(11498, 30)
Initialization complete
Iteration 0, inertia 298.8018460750988.
Iteration 1, inertia 219.55690562598215.
Iteration 2, inertia 214.6848562331368.
Iteration 3, inertia 212.857355852252.
Iteration 4, inertia 212.23368635999134.
Iteration 5, inertia 211.92467648426236.
Iteration 6, inertia 211.6976908654495.
Iteration 7, inertia 211.47532825632214.
Iteration 8, inertia 211.1146978506815.
Iteration 9, inertia 210.45744775429492.
Iteration 10, inertia 209.69076390074764.
Iteration 11, inertia 208.97420058796783.
Iteration 12, inertia 208.90743118556685.
Iteration 13, inertia 208.83627551403782.
Iteration 14, inertia 208.81854240977623.
Iteration 15, inertia 208.79941843379348.
Iteration 16, inertia 208.7858940110073.
Iteration 17, inertia 208.77601484051328.
Iteration 18, inertia 208.77324308712764.
Converged at iteration 18: center shift 1.1176360309289543e-06 within tolerance 1.626664907705768e-06.
Initialization complete
Iteration 0, inertia 300.90241853391024.
Iteration 1, in

In [45]:
k = 15
kmeans = KMeans(n_clusters=k, max_iter=100, n_init=2, verbose=True)
cluster_matrix = kmeans.fit_transform(mixed_corpus)

Initialization complete
Iteration 0, inertia 540.7065096318718.
Iteration 1, inertia 415.4677448122001.
Iteration 2, inertia 410.50759191062025.
Iteration 3, inertia 407.1811913459393.
Iteration 4, inertia 405.4516020070937.
Iteration 5, inertia 404.376890767678.
Iteration 6, inertia 403.62590788286354.
Iteration 7, inertia 403.05747903241394.
Iteration 8, inertia 402.33969978692943.
Iteration 9, inertia 401.26514013216115.
Iteration 10, inertia 400.4569337750206.
Iteration 11, inertia 400.14070382054194.
Iteration 12, inertia 399.9643201921365.
Iteration 13, inertia 399.86032076568824.
Iteration 14, inertia 399.81950869380347.
Iteration 15, inertia 399.7941575168503.
Iteration 16, inertia 399.75739545827275.
Iteration 17, inertia 399.72824703792094.
Iteration 18, inertia 399.70391499328633.
Iteration 19, inertia 399.6838494806509.
Iteration 20, inertia 399.67250939027207.
Iteration 21, inertia 399.6685164150547.
Iteration 22, inertia 399.6585553670636.
Iteration 23, inertia 399.654577

In [49]:
labels = np.argmax(cluster_matrix, axis=-1)

In [50]:
print("Top term per cluster:")

for i in range(kmeans.n_clusters):
    centroid = kmeans.cluster_centers_[i]
    print(model.wv.most_similar(centroid, topn=10))

Top term per cluster:
[('said', 0.9998720288276672), ('billion', 0.9996214509010315), ('tuesday', 0.9996185302734375), ('million', 0.9996104836463928), ('state', 0.9995715618133545), ('government', 0.9995687007904053), ('president', 0.9995319247245789), ('police', 0.9995316863059998), ('years', 0.9995228052139282), ('military', 0.9994965195655823)]
[('said', 0.9998518228530884), ('million', 0.9997186064720154), ('government', 0.9996440410614014), ('billion', 0.9996383786201477), ('tuesday', 0.9996023178100586), ('police', 0.9995832443237305), ('yesterday', 0.9995828866958618), ('state', 0.9995742440223694), ('military', 0.999573826789856), ('president', 0.9995699524879456)]
[('indian', 0.999377429485321), ('said', 0.999259352684021), ('dies', 0.9992164969444275), ('told', 0.9991896152496338), ('air', 0.9991471171379089), ('court', 0.9991379976272583), ('arrested', 0.9990209937095642), ('workers', 0.9990055561065674), ('tuesday', 0.9990054368972778), ('billion', 0.9989449977874756)]
[('

In [35]:
similarity_matrix2 = cosine_similarity(mixed_corpus)
print(similarity_matrix2.shape)

(11498, 11498)


In [42]:
print(np.max(similarity_matrix)) 

1.0000000000000007
