<a href="https://colab.research.google.com/github/farnaz-frd/Closest_sentences-from-different-languages-with-laser-embedding/blob/main/top_K_closest_sentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LASER embeddings with cosine similarity

### Installing required libraries

In [3]:
# Install the LASER sentence-embedding package into the kernel's own environment.
# The explicit `%pip` magic does not depend on automagic being enabled, and the
# version is pinned to the release recorded in this cell's output so that a
# fresh "Restart & Run All" resolves the same dependency set.
%pip install laserembeddings==1.1.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting laserembeddings
  Using cached laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting transliterate==1.10.2
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses==0.0.35
  Downloading sacremoses-0.0.35.tar.gz (859 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m859.8/859.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting subword-nmt<0.4.0,>=0.3.6
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting torch<2.0.0,>=1.0.1.post2
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collect

In [4]:
# Download the pretrained LASER model files (codes, vocabulary and BiLSTM encoder checkpoint) that Laser() loads later.
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.10/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [5]:
# Sanity check: the two input corpora (de.unpar.txt, pt.unpar.txt) must be in the working directory.
# The explicit `%` keeps the cell working when automagic is off or a variable named `ls` exists.
%ls

de.unpar.txt  pt.unpar.txt  [0m[01;34msample_data[0m/


### Main part

In [6]:
import torch
from laserembeddings import Laser

# One LASER encoder instance is used for both languages.
laser = Laser()

# Load the sentences from the files (one sentence per line).
# The encoding is stated explicitly so the non-ASCII German and Portuguese
# characters are decoded the same way regardless of the platform's locale.
# NOTE(review): assumes the corpora are UTF-8 encoded - confirm.
with open('de.unpar.txt', 'r', encoding='utf-8') as f:
    de_sentences = f.read().splitlines()
with open('pt.unpar.txt', 'r', encoding='utf-8') as f:
    pt_sentences = f.read().splitlines()

# Compute the embeddings (one row per sentence)
de_embeddings = laser.embed_sentences(de_sentences, lang='de')
pt_embeddings = laser.embed_sentences(pt_sentences, lang='pt')

# Compute the cosine similarity of every (German, Portuguese) pair.
# Scaling each row to unit length and taking a matrix product yields the same
# cosine values as broadcasting torch.cosine_similarity over (N, 1, D) and
# (1, M, D) inputs, but without the elementwise product whose size grows with
# N * M * D; only the (N, M) result is materialised.
de_unit = torch.nn.functional.normalize(torch.tensor(de_embeddings), dim=1)
pt_unit = torch.nn.functional.normalize(torch.tensor(pt_embeddings), dim=1)
similarity = de_unit @ pt_unit.T

# Get the top K similar sentence pairs.
# torch.topk raises if asked for more elements than exist, so the request is
# capped at the number of available pairs.
K = 100
num_pairs = similarity.numel()
top_K_indices = torch.topk(similarity.flatten(), min(K, num_pairs)).indices.tolist()
# Convert flat positions back to (German row, Portuguese column) coordinates.
top_K_indices = [divmod(flat_idx, similarity.shape[1]) for flat_idx in top_K_indices]

# Write the results to a file: one tab-separated pair per line, best pair first.
with open('output.txt', 'w', encoding='utf-8') as f:
    for i, j in top_K_indices:
        f.write(f'{de_sentences[i]}\t{pt_sentences[j]}\n')

# LASER embeddings with FAISS nearest-neighbor search (exact inner-product index)

### Installing required libraries

In [8]:
# Install the CPU build of FAISS into the kernel's own environment.
# `%pip` targets the running kernel's interpreter (a `!pip` shell call may hit a
# different one), and the version is pinned to the release recorded in this
# cell's output so re-runs are reproducible.
%pip install faiss-cpu==1.7.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


### Main part


In [9]:
import faiss
import numpy as np
from laserembeddings import Laser

# One LASER encoder instance is used for both languages.
laser = Laser()

# Load the sentences from the files (one sentence per line).
# The encoding is stated explicitly so the non-ASCII German and Portuguese
# characters are decoded the same way regardless of the platform's locale.
# NOTE(review): assumes the corpora are UTF-8 encoded - confirm.
with open('de.unpar.txt', 'r', encoding='utf-8') as f:
    de_sentences = f.read().splitlines()
with open('pt.unpar.txt', 'r', encoding='utf-8') as f:
    pt_sentences = f.read().splitlines()

# Compute the embeddings (one row per sentence)
de_embeddings = laser.embed_sentences(de_sentences, lang='de')
pt_embeddings = laser.embed_sentences(pt_sentences, lang='pt')

# Normalize the embeddings to unit length so that the inner product computed
# by the index below equals the cosine similarity.
de_embeddings = de_embeddings / np.linalg.norm(de_embeddings, axis=1, keepdims=True)
pt_embeddings = pt_embeddings / np.linalg.norm(pt_embeddings, axis=1, keepdims=True)

# Build a faiss index over the Portuguese sentences.
# IndexFlatIP is a flat inner-product index, i.e. the search is exact.
d = de_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(pt_embeddings.astype('float32'))

# Find the top K most similar pairs of sentences.
# Each German sentence is one query; row i of D / I holds the scores / Portuguese
# row ids of its best matches in descending score order. Any pair that belongs
# to the global top K is necessarily among its own row's top K, so K neighbours
# per query are enough. The count is capped at the index size because faiss
# pads missing results with id -1.
K = 100
neighbours_per_query = min(K, len(pt_sentences))
D, I = index.search(de_embeddings.astype('float32'), neighbours_per_query)

# Rank every (query, neighbour) candidate by score and keep the K best overall.
best_flat = np.argsort(-D, axis=None, kind='stable')[:K]
de_rows, neighbour_ranks = np.unravel_index(best_flat, D.shape)

# Write the results to a file: one tab-separated pair per line, best pair first.
# The German side is the query row itself; only the Portuguese side comes from I.
with open('output2.txt', 'w', encoding='utf-8') as f:
    for de_idx, rank in zip(de_rows, neighbour_ranks):
        f.write(f'{de_sentences[de_idx]}\t{pt_sentences[I[de_idx, rank]]}\n')