In [1]:
import pandas as pd

from sentence_transformers import SentenceTransformer, models

In [2]:
import torch

torch.cuda.empty_cache()

In [3]:
use_cuda = torch.cuda.is_available()
print (use_cuda)

True


In [4]:
if use_cuda:
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)

__CUDNN VERSION: 8200
__Number CUDA Devices: 1
__CUDA Device Name: NVIDIA GeForce RTX 3070 Ti
__CUDA Device Total Memory [GB]: 8.589410304


In [5]:
###### CREATE MODEL ######
max_seq_length = 64
train_batch_size = 32

# Load teacher model
print("Load teacher model")
teacher_model = SentenceTransformer('stsb-roberta-base-v2')

# Create student model
print("Create student model")
word_embedding_model = models.Transformer("xlm-roberta-base")

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device='cuda')

Load teacher model
Create student model


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
#### Read Datasets ####

df1 = pd.read_csv('C:/Users/USER/Desktop/Knowledge Distillaation/1 Datasets/Compiled Datasets/EN_TL_compiled_400k.csv')
df2 = pd.read_csv('C:/Users/USER/Desktop/Knowledge Distillaation/1 Datasets/Compiled Datasets/STS_TL_400k.csv')
#df3 = pd.read_csv('datasets/translate_s2.txt', delimiter = '\t', encoding = 'unicode_escape')


In [7]:
df1

Unnamed: 0,tl,en
0,"Ayon naman kay Bayan Muna Rep. Carlos Zarate, ...","According to Bayan Muna Rep. Carlos Zarate, Ch..."
1,Ayon naman sa Philippine Embassy sa Washington...,According to the Philippine Embassy in Washing...
2,Paano maipagtatanggol ng mga ordinaryong mamam...,How do ordinary citizens defend themselves fro...
3,SM by the bay,SM by the bay
4,"MANILA, Philippines - Inirekomenda ng Departme...","Manila, Philippines - The Department of Interi..."
...,...,...
399995,"""The President will discuss with Japanese Prim...","""The President Will Discuss With Japanese Prim..."
399996,Matatandaang idiniskwalipika ng Commission on ...,The Commission on Elections (COMELEC) (COMELEC...
399997,"Dahil sa huling score, inilarawan ng Heritage ...","Due to the last score, the Heritage Foundation..."
399998,"MANILA, Philippines -- Wala umanong karapatan ...","Manila, Philippines - The Government of Puerto..."


In [8]:
df2

Unnamed: 0,s1,s2,label
0,"""Hindi ko ugali ang mamulitika; mas gusto kong...","Ito ang dineklara ni Atty. Romulo Macalintal, ...",1
1,"Ayon naman kay Bayan Muna Rep. Carlos Zarate, ...",Dating itinutulak ni Duterte ang pagbabago ng ...,0
2,Ayon naman sa Philippine Embassy sa Washington...,"Ayon sa NBI, hindi umano siyento por siyentong...",1
3,Paano maipagtatanggol ng mga ordinaryong mamam...,"""Nakasaad sa R.A. 9009 na dapat mayroong land ...",1
4,SM by the bay,Kabilang sa mga sumali sa programang ito ang m...,1
...,...,...,...
419995,2p.m. Arellano University vs Hog's Breath Cafe,Inirekomenda sa Pangulo ni Albay Rep. Joey Sal...,1
419996,Jason Davee: We rather want the senate head to...,"Sabi naman ni Eberl, gumagana pa rin naman uma...",1
419997,Ipinatupad ang liquid ban sa mga istasyon ng M...,Gumastos naman ng P11.587 milyon ang mga contr...,1
419998,Team Standings: zArellano (13-4); zSan Beda (1...,z - Final Four twice-to-beat,0


In [9]:
from sentence_transformers.datasets import ParallelSentencesDataset
from torch.utils.data import DataLoader
from sentence_transformers import SentencesDataset, losses, evaluation, readers


###### Load train sets ######

train_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
train_reader.add_dataset(df1.values.tolist())
#load_data('translate_s1.txt', encoding= 'unicode_escape')
train_dataloader = DataLoader(train_reader, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

In [10]:
###### Load test sets Mean Squared Error (MSE) measures ######

evaluators = []

test_mse = evaluation.MSEEvaluator(df1['en'].values.tolist(),df1['tl'].values.tolist(), teacher_model=teacher_model, batch_size=train_batch_size, show_progress_bar=True, write_csv = True)
evaluators.append(test_mse)

Batches:   0%|          | 0/12500 [00:00<?, ?it/s]

In [11]:
###### Load dev sets for Semantic Textual Similarity (STS) data ######

evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(df2['s1'], df2['s2'], df2['label'], batch_size=train_batch_size, show_progress_bar=True, write_csv=True)
evaluators.append(evaluator_sts)

In [12]:
evaluator_trans = evaluation.TranslationEvaluator(df1['en'].values.tolist(),df1['tl'].values.tolist(), show_progress_bar = True, batch_size = train_batch_size, write_csv=True)
evaluators.append(evaluator_trans)

In [13]:
###### Train model ######
#torch.cuda.empty_cache()
import datetime

output_path = "output/model-" + datetime.date.today().strftime("%Y-%m-%d")
model.fit(train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1]),
        epochs=10,
        evaluation_steps=1000,
        warmup_steps=10000,
        scheduler='warmupconstant',
        output_path=output_path,
        save_best_model=True,
        optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
        )



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6087 [00:00<?, ?it/s]

  labels = torch.tensor(labels).to(self._target_device)


RuntimeError: CUDA out of memory. Tried to allocate 734.00 MiB (GPU 0; 8.00 GiB total capacity; 4.64 GiB already allocated; 0 bytes free; 5.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [15]:
 torch.cuda.memory_summary(device=None, abbreviated=False)



In [14]:
torch.cuda.max_memory_allocated(device=None)

5961180160

In [None]:
evaluators

In [None]:
import scipy.spatial

#Corpus with example sentences
corpusEN = df3['EN'][:20].values.tolist()
corpusTL = df3['TL'][:20].values.tolist()

corpusEN_embeddings = model.encode(corpusEN)
corpusTL_embeddings = model.encode(corpusTL)

In [None]:
queries = df1['EN'][88:89].values.tolist()
query_embeddings = model.encode(queries)

In [None]:
closest_n = 5

for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], corpusEN_embeddings, "cosine")[0]
    
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    
    print("\n=======\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:\n")

    for idx, distance in results[0:closest_n]:
        print(corpusEN[idx].strip(), "(Score: %.4f)" % (1-distance))

In [None]:
queriesTL = df1['TL'][88:89].values.tolist()
#queriesTL = ['gobyerno.']
queryTL_embeddings = model.encode(queriesTL)

In [None]:
closest_n = 5
for query, queryTL_embedding in zip(queriesTL, queryTL_embeddings):
    distances = scipy.spatial.distance.cdist([queryTL_embedding], corpusTL_embeddings, "cosine")[0]
    
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    
    print("\n=======\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:\n")

    for idx, distance in results[0:closest_n]:
        print(corpusTL[idx].strip(), "(Score: %.4f)" % (1-distance))

### after 100 epocs of training: Query Search Result

In [None]:
import scipy.spatial

#Corpus with example sentences
corpusEN = df3['EN'][:20].values.tolist()
corpusTL = df3['TL'][:20].values.tolist()

with strategy.scope():
    corpusEN_embeddings = model.encode(corpusEN)
    corpusTL_embeddings = model.encode(corpusTL)

In [None]:
queries = df1['EN'][88:89].values.tolist()
query_embeddings = model.encode(queries)

In [None]:
closest_n = 5

with strategy.scope():
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpusEN_embeddings, "cosine")[0]
    
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
    
        print("\n=======\n")
        print("Query:", query)
        print("\nTop 5 most similar sentences in corpus:\n")

        for idx, distance in results[0:closest_n]:
            print(corpusEN[idx].strip(), "(Score: %.4f)" % (1-distance))

In [None]:
queriesTL = df1['TL'][88:89].values.tolist()
#queriesTL = ['Balita tungkol sa transportasyon, gobyerno ng Pilipinas at iba pa.']
queryTL_embeddings = model.encode(queriesTL)

In [None]:
closest_n = 5
for query, queryTL_embedding in zip(queriesTL, queryTL_embeddings):
    distances = scipy.spatial.distance.cdist([queryTL_embedding], corpusTL_embeddings, "cosine")[0]
    
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    
    print("\n=======\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:\n")

    for idx, distance in results[0:closest_n]:
        print(corpusTL[idx].strip(), "(Score: %.4f)" % (1-distance))