<a href="https://colab.research.google.com/github/cstorm125/align_use/blob/master/sandbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Universal Sentence Encoder for Sentence Alignment

In [0]:
# #get stuff ready for colab
# !pip install tensorflow_text
# !wget https://www.dropbox.com/s/nr5h3xuqhlvj94m/europarl-v7.fr-en.en
# !wget https://www.dropbox.com/s/z730widwr5kw0zm/europarl-v7.fr-en.fr

In [2]:
!ls

europarl-v7.fr-en.en	europarl-v7.fr-en.fr	sample_data
europarl-v7.fr-en.en.1	europarl-v7.fr-en.fr.1


In [0]:
import pandas as pd
import numpy as np
import tensorflow_hub as hub
import tensorflow_text
import tensorflow as tf #tensorflow 2.1.0

_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3')

class LSHasher:
    def __init__(self,hash_size = 10, input_dim = 512):
        self.hash_size = hash_size
        self.input_dim = input_dim
        self.projections = tf.random.normal((self.input_dim, self.hash_size))
    def get_hash(self,inp):
        res = inp@self.projections
        res = tf.cast(res > 0,tf.int32).numpy().astype(str)
        res = [''.join(i) for i in res]
        return res

def get_alignment(all_emb,input_idx):
    ref_idx = tf.constant(input_idx)[:,None]
    sub_emb = tf.gather_nd(params=all_emb, indices=ref_idx)
    x_ = tf.nn.l2_normalize(sub_emb,axis=1)
    res = x_@tf.transpose(x_)
    res = tf.linalg.set_diag(res, tf.constant(-2, dtype=float, shape=len(input_idx)))
    max_idx = tf.argmax(res,0).numpy()
    max_values = tf.reduce_max(res,0).numpy()
    max_input_idx = [input_idx[i] for i in max_idx]
    return max_input_idx, max_values

## Small Baseline with `europarl fr-en`

In [6]:
#read files
with open('europarl-v7.fr-en.fr','r') as f: fr = f.readlines()
with open('europarl-v7.fr-en.en','r') as f: en = f.readlines()
len(fr), len(en)

(2007723, 2007723)

In [15]:
#create dataframe
df = pd.DataFrame({'fr':[i[:-1] for i in fr],'en':[i[:-1] for i in en]})
#deduplicate based on fr
df['rnk'] = df.groupby('fr').cumcount()
df = df[df.rnk==0].drop('rnk',1).reset_index(drop=True)
df.tail()

Unnamed: 0,fr,en
1954617,Je donne la parole à M. Manders pour une motio...,Mr Manders has the floor for a procedural motion.
1954618,"Monsieur le Président, je voudrais profiter de...","Mr President, I would like to take this opport..."
1954619,"Mesdames et Messieurs les Députés, avant votre...","Ladies and gentlemen, before you leave me alon..."
1954620,"Je me permettrai même, bien qu'ils soient abse...","I would also like, although they are absent, t..."
1954621,Je ne rouvrirai pas le débat sur le millénaire...,I am not going to re-open the 'Millennium or n...


In [24]:
#randomize a subset of 1000 deduplicated sentences
rand_idx = np.random.choice(df.index, size=1000)

#get use
%time fr_emb = _model(df['fr'][rand_idx])
%time en_emb = _model(df['en'][rand_idx])
all_emb = tf.concat([fr_emb,en_emb],0)
all_emb.shape

CPU times: user 155 ms, sys: 2.88 ms, total: 158 ms
Wall time: 140 ms
CPU times: user 136 ms, sys: 2.9 ms, total: 139 ms
Wall time: 115 ms


TensorShape([2000, 512])

In [25]:
#test with first 1000 rows of deduped fr-en europarl
input_idx = [i for i in range(2000)]
max_input_idx, max_values = get_alignment(all_emb,input_idx)
res_df = pd.DataFrame({'src_idx':input_idx, 'targ_idx':max_input_idx,'score':max_values})
res_df

Unnamed: 0,src_idx,targ_idx,score
0,0,1000,0.720966
1,1,1001,0.724499
2,2,1002,0.712397
3,3,1003,0.770342
4,4,1004,0.749140
...,...,...,...
1995,1995,995,0.845170
1996,1996,996,0.860101
1997,1997,997,0.875563
1998,1998,998,0.831155


In [26]:
#fr to en
(res_df.iloc[:1000,:].src_idx==res_df.iloc[:1000,:].targ_idx-1000).mean()

0.981

In [27]:
#en to fr
(res_df.iloc[1000:,:].src_idx==res_df.iloc[1000:,:].targ_idx+1000).mean()

0.982

## Dealing with Large Datasets

In [0]:
#get use
%time fr_emb = _model(df['fr'])
%time en_emb = _model(df['en'])
all_emb = tf.concat([fr_emb,en_emb],0)
all_emb.shape

In [29]:
#hash everything
lsh = LSHasher(5,512)
%time all_lsh = lsh.get_hash(all_emb)
len(all_lsh)

CPU times: user 11.6 ms, sys: 989 µs, total: 12.6 ms
Wall time: 18.9 ms


2000

In [15]:
#create hash dataframe
d = pd.DataFrame({'lsh':all_lsh,'texts':all_texts})
d

Unnamed: 0,lsh,texts
0,1100101100,Reprise de la session
1,1001000101,Je déclare reprise la session du Parlement eur...
2,1011000110,"Comme vous avez pu le constater, le grand ""bog..."
3,1100101100,Vous avez souhaité un débat à ce sujet dans le...
4,0011001100,"En attendant, je souhaiterais, comme un certai..."
...,...,...
19995,0101000000,"The present CAP, however, is worse than its pr..."
19996,0111100110,The selection criteria and the activities that...
19997,0011100100,"In no case, however, are they programmes for t..."
19998,0001100110,The actual appropriations available for LEADER...


In [17]:
i = 0
sub_d = d[d.lsh==d.loc[i,'lsh']]

Unnamed: 0,lsh,texts
0,1100101100,Reprise de la session
3,1100101100,Vous avez souhaité un débat à ce sujet dans le...
94,1100101100,"Toujours au sujet de la journée du mercredi, j..."
1049,1100101100,Je me demande pourtant si nous ne courons pas ...
1139,1100101100,Il a déclaré que nous ne devions pas toujours ...
...,...,...
16654,1100101100,In the event we had to have several meetings b...
17299,1100101100,Some businesses have become hi-tech businesses...
18017,1100101100,Last Thursday speaking times were extremely ti...
19568,1100101100,Is this really how we will answer the fundamen...


In [22]:
sub_idx = d.
emb_one = all_emb[i,:][None,:]

emb_one.shape

TensorShape([1, 512])

In [105]:
new_df = pd.DataFrame({
    'text':list(df['fr'][:10000])+list(df['en'][:10000]), 
    'lsh':fr_lsh+en_lsh})
new_df

Unnamed: 0,text,lsh
0,Reprise de la session,1000000011
1,Je déclare reprise la session du Parlement eur...,1111101010
2,"Comme vous avez pu le constater, le grand ""bog...",1111011011
3,Vous avez souhaité un débat à ce sujet dans le...,1010110111
4,"En attendant, je souhaiterais, comme un certai...",0111101001
...,...,...
19995,"The present CAP, however, is worse than its pr...",1111111110
19996,The selection criteria and the activities that...,1111001010
19997,"In no case, however, are they programmes for t...",1010100010
19998,The actual appropriations available for LEADER...,0110111000


In [111]:
new_df[new_df.lsh=='0000000101'].text[7481]

"Une meilleure analyse des obstacles que rencontrent les femmes permettra d' élaborer une stratégie pour les éliminer."

In [113]:
df[df.fr=="Une meilleure analyse des obstacles que rencontrent les femmes permettra d' élaborer une stratégie pour les éliminer."]

Unnamed: 0,fr,en
7481,Une meilleure analyse des obstacles que rencon...,Improved analysis of the obstacles facing wome...


In [59]:
nb_tables = 5
lshs = [LSHasher(10,512) for i in range(nb_tables)]
fake_emb = tf.random.normal([1000000,512])
%time hashes = [lshs[i].get_hash(fake_emb) for i in range(5)]
len(hashes)

CPU times: user 45.5 s, sys: 386 ms, total: 45.9 s
Wall time: 45.9 s


5

In [0]:
df = pd.DataFrame({'h0':l,'c':1})

In [89]:
df.groupby('h0').count().sort_values('c', ascending=False)

Unnamed: 0_level_0,c
h0,Unnamed: 1_level_1
0001110001,58
1000100001,56
0000100001,45
1001100001,43
0100101011,40
...,...
0110000111,1
1101111111,1
1110011110,1
1110011100,1
