In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download("punkt")
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
import re

[nltk_data] Downloading package punkt to /home/fabio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/fabio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def on_bad_line(values):
    return values[:7]

columns_mapping = {
    0: 'genre',
    1: 'filename',
    2: 'year',
    3: 'index',
    4: 'score',
    5: 'sentence1',
    6: 'sentence2'
}

# quotings 3 = csv.QUOTE_NONE
df = pd.read_csv('Dataset/sts-train.csv', sep="\t", on_bad_lines=on_bad_line, engine='python', header=None, encoding='utf-8', quoting=3).rename(columns=columns_mapping)
print(f'shape of the Dataframe {df.shape}')
df.head(10)

shape of the Dataframe (5749, 7)


Unnamed: 0,genre,filename,year,index,score,sentence1,sentence2
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.
5,main-captions,MSRvid,2012test,11,4.25,Some men are fighting.,Two men are fighting.
6,main-captions,MSRvid,2012test,12,0.5,A man is smoking.,A man is skating.
7,main-captions,MSRvid,2012test,13,1.6,The man is playing the piano.,The man is playing the guitar.
8,main-captions,MSRvid,2012test,14,2.2,A man is playing on a guitar and singing.,A woman is playing an acoustic guitar and sing...
9,main-captions,MSRvid,2012test,16,5.0,A person is throwing a cat on to the ceiling.,A person throws a cat on the ceiling.


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.cluster.hierarchy import dendrogram, linkage
from tqdm import tqdm
import scipy.sparse as scs

In [21]:
sentences1 = df["sentence1"]
sentences2 = df["sentence2"]
sentences_list = np.concatenate((sentences1, sentences2))

stop_words = stopwords.words('english')

regex = f'[{punctuation}]'
clean_list = np.array([re.sub(regex,'',string).lower() for string in sentences_list])

# creazione della matrice di similarità
tfidf = TfidfVectorizer(max_df=0.8, min_df=5, stop_words='english')
tfidf_matrix = tfidf.fit_transform(clean_list)
similarity_matrix = cosine_similarity(tfidf_matrix)
vocab = tfidf.get_feature_names_out()

print(tfidf_matrix.shape)

(11498, 2878)


In [22]:
# Get the terms (columns) of the matrix
terms = tfidf.get_feature_names_out()
print(terms)

print(terms[466])

word_id = 466
filter = tfidf_matrix.getcol(word_id).nonzero()[0]
print(clean_list[filter])

['04' '07' '10' ... 'äô' 'äôs' 'äď']
cat
['a person is throwing a cat on to the ceiling'
 'a cat is rubbing against babys face' 'a cat is playing a key board'
 'a cat is playing a piano' 'a cat is playing keyboards'
 'a cat is licking a sucker' 'a cat is watching a tv'
 'the cat is drinking milk' 'a cat is jumping into a box'
 'a cat is sliding upside down under a couch'
 'a cat is trying to touch a dog' 'a cat rapidly swats a kid in the head'
 'a cat is playing in a box' 'a cat is drinking some water'
 'a cat is licking himself' 'a cat opens a drawer and climbs inside'
 'a cat is drinking milk' 'the cat played with a watermelon'
 'a cat stares into the camera' 'a cat swings on a ceiling fan'
 'a cat is playing the piano' 'a cat is playing'
 'a woman tosses a cat in the air and the cat hangs off a rod on the ceiling'
 'a white cat perched on a small wooden cabinet is looking outside the window'
 'a cat is playing with an antenna'
 'a cat opens a drawer and climbs inside' 'the cat shimm

In [23]:
docs = ['the black cat is on the plane, near the white cat', 'another cat is waiting for its food']
dt_matrix = tfidf.transform(docs)
print(dt_matrix)

  (0, 2803)	0.2952740316253059
  (0, 1900)	0.405078758431841
  (0, 1694)	0.3874740562788338
  (0, 466)	0.7129072478984504
  (0, 344)	0.30058535779332396
  (1, 2757)	0.6709395536042083
  (1, 1025)	0.5748842386096671
  (1, 466)	0.46834626891603315


In [24]:
few_sentences = sentences_list[::10]
vectorized = tfidf.transform(few_sentences)

k = 30
kmeans = KMeans(n_clusters=k, max_iter=100, n_init=2, verbose=True)
cluster_matrix = kmeans.fit_transform(vectorized)


Initialization complete
Iteration 0, inertia 1104.0041733560945.
Iteration 1, inertia 1081.8519635982448.
Iteration 2, inertia 1073.5176285529774.
Iteration 3, inertia 1069.377701068799.
Iteration 4, inertia 1068.2324757253043.
Iteration 5, inertia 1067.9297808855472.
Iteration 6, inertia 1067.900658299694.
Converged at iteration 6: strict convergence.
Initialization complete
Iteration 0, inertia 1904.076533842193.
Iteration 1, inertia 1031.951116357368.
Iteration 2, inertia 1026.7276519108816.
Iteration 3, inertia 1024.503855277684.
Iteration 4, inertia 1022.9320112379447.
Iteration 5, inertia 1022.2681572893601.
Iteration 6, inertia 1022.040947247824.
Converged at iteration 6: strict convergence.


In [25]:
print("Top terms per cluster:")
vocab = tfidf.get_feature_names_out()

for i in range(kmeans.n_clusters):
    centroid = kmeans.cluster_centers_[i]    
    sorted_terms = centroid.argsort()[::-1]
    print(f"Cluster {i}:\t{[vocab[j] for j in sorted_terms[:10]]}")

Top terms per cluster:
Cluster 0:	['playing', 'piano', 'man', 'flute', 'drums', 'violin', 'woman', 'plays', 'singing', 'person']
Cluster 1:	['obama', 'syria', 'opposition', 'hold', 'action', 'documents', 'talks', 'winning', 'cut', 'fly']
Cluster 2:	['afghanistan', 'soldier', 'killed', 'afghan', 'kills', 'soldiers', 'nato', 'suicide', 'blast', 'civilians']
Cluster 3:	['walking', 'dog', 'runs', 'grass', 'brown', 'man', 'white', 'water', 'carrying', 'mouth']
Cluster 4:	['cat', 'looking', 'camera', 'white', 'black', 'bed', 'window', 'grey', 'laying', 'sofa']
Cluster 5:	['running', 'dog', 'black', 'mouth', 'horse', 'snow', 'tan', 'brown', 'grass', 'beach']
Cluster 6:	['young', 'girl', 'boy', 'sitting', 'slide', 'couch', 'wearing', 'jeans', 'jumping', 'women']
Cluster 7:	['woman', 'cutting', 'percent', 'slicing', 'baby', 'parked', 'dancing', 'year', 'new', 'lady']
Cluster 8:	['state', 'arrives', 'islamic', 'visit', 'nuclear', 'moves', 'massacre', 'site', 'fight', 'hostage']
Cluster 9:	['weap