In [1]:
!pip3 install -q transformers datasets

In [2]:
!pip install ktrain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip3 uninstall flask -y
!pip3 install -q eli5

[0m

In [4]:
import torch
import numpy as np
import pandas as pd

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

In [6]:
from sklearn.metrics import classification_report, confusion_matrix

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    raise SystemError('GPU device not found')

In [8]:
def on_bad_line(values):
    return values[:7]

columns_mapping = {
    0: 'genre',
    1: 'filename',
    2: 'year',
    3: 'index',
    4: 'score',
    5: 'sentence1',
    6: 'sentence2'
}

# quotings 3 = csv.QUOTE_NONE
df = pd.read_csv('sts-train.csv', sep="\t", on_bad_lines=on_bad_line, engine='python', header=None, encoding='utf-8', quoting=3).rename(columns=columns_mapping)
print(f'shape of the Dataframe {df.shape}')
df.head(3)

shape of the Dataframe (5749, 7)


Unnamed: 0,genre,filename,year,index,score,sentence1,sentence2
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...


In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download("punkt")
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
sentences1 = df["sentence1"]
sentences2 = df["sentence2"]
raw_corpus = np.concatenate((sentences1, sentences2))

stop_words = stopwords.words('english')

In [11]:
def preprocess(sentence):
    result = re.sub(f'[{punctuation}]','',sentence).lower()
    result = re.sub('\W', ' ', result).split()
    return [w for w in result if w not in stop_words]

In [12]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
unique_corpus = list(set(raw_corpus[:5000]))
# Tokenize sentences
encoded_input = tokenizer(unique_corpus, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[-0.0181, -0.0986, -0.0317,  ...,  0.0554,  0.0609, -0.0057],
        [ 0.0057, -0.0019,  0.0170,  ...,  0.0595,  0.0832,  0.0189],
        [ 0.0140,  0.0213, -0.0523,  ...,  0.0343,  0.0553, -0.0274],
        ...,
        [ 0.0520, -0.0173,  0.0426,  ...,  0.0072,  0.0264, -0.0105],
        [ 0.0170, -0.0710, -0.0445,  ..., -0.0774, -0.0491, -0.0099],
        [-0.0658,  0.0452,  0.0083,  ..., -0.0447,  0.0587,  0.0640]])


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import minmax_scale
similarity_matrix = cosine_similarity(sentence_embeddings)

In [28]:
def most_similar(sentence_index):
    all_sentence_combinations = []
    for j in range(0, len(similarity_matrix)):
            all_sentence_combinations.append([similarity_matrix[sentence_index][j], sentence_index, j])
    all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)
    print("Sentence: " + unique_corpus[sentence_index])
    print("Top-30 most similar pairs:")
    for score, sentence_index, j in all_sentence_combinations[:10]:
        print("{} \t {:.4f}".format(unique_corpus[j], similarity_matrix[sentence_index][j]))

most_similar(2)
most_similar(21)
most_similar(4000)

Sentence: Doctors say one or both boys may die, and that some brain damage is possible if they survive.
Top-30 most similar pairs:
Doctors say one or both boys may die, and that some brain damage is possible if they survive. 	 1.0000
That would make you partially responsible for their death. 	 0.4026
2 dead, 2 injured in Nevada middle school shooting 	 0.3806
After 26 hours of surgery and a year of anticipation, the boys were separated Sunday at Children's Medical Center of Dallas. 	 0.3802
One dead in Ohio school shooting 	 0.3706
Teenager and father killed in crash 	 0.3618
Two boys are driving. 	 0.3580
13 children die in fire Myanmar mosque fire 	 0.3508
7 killed, 3 injured in south China road accident 	 0.3495
Two boys, one wearing a hat, in a playground. 	 0.3367
Sentence: alstom is in competition with japanese and german countries for the contract. 
Top-30 most similar pairs:
alstom is in competition with japanese and german countries for the contract.  	 1.0000
french energy an