# DNS embeddings

## 1 - Load the embeddings

In [1]:
# Load a previously saved FastText model; every later cell queries it
# through the `dns_embeddings` name.
# NOTE(review): `gensim.models.wrappers` was removed in gensim 4.0, so this
# import presumably requires gensim < 4 — confirm the pinned version.
from gensim.models.wrappers.fasttext import FastText as ft
# The path is relative to the notebook's working directory. The file name
# suggests 21 epochs and character n-grams of length 11..17 (minn/maxn)
# — TODO confirm against the training run.
dns_embeddings = ft.load('models/ft/21epoc_minn11_maxn17')

## 2 - Get predictions

In [2]:
# Ask the model for the 15 domain names ranked most similar to the query;
# the result is a list of (domain, similarity score) pairs.
dns_embeddings.most_similar(
    'subrayado.com.uy',
    topn=15,
)

[('subrayado.com', 0.9160100221633911),
 ('diariolarepublica.net', 0.8355216979980469),
 ('eldiario.com.uy', 0.807044267654419),
 ('lr21.com.uy', 0.7994014024734497),
 ('teledoce.com', 0.7916869521141052),
 ('elecodigital.com.uy', 0.7739754915237427),
 ('causaabierta.com.uy', 0.7702589631080627),
 ('unoticias.com.uy', 0.7664780616760254),
 ('radiouruguay.com.uy', 0.7660830020904541),
 ('uypress.net', 0.742232084274292),
 ('sangregoriodepolancodigital.com.uy', 0.7303887605667114),
 ('vivomontevideo.com', 0.710254430770874),
 ('elnacional.com.uy', 0.7085186243057251),
 ('carnavaldeluruguay.com', 0.7016206979751587),
 ('informarte.com.uy', 0.7008339166641235)]

In [3]:
# Same kind of query for a second domain: the 15 entries ranked most
# similar to 'autoblog.com.uy', each paired with its similarity score.
dns_embeddings.most_similar(
    'autoblog.com.uy',
    topn=15,
)

[('gonzalorodriguez.org', 0.8535780906677246),
 ('autoanuario.com.uy', 0.8447151184082031),
 ('mundoautomotor.com.ar', 0.822021484375),
 ('cochesyconcesionarios.com', 0.8198953866958618),
 ('area75.com.ar', 0.8057596683502197),
 ('autos-chinos.com', 0.80348140001297),
 ('suzuki.com.uy', 0.7814224362373352),
 ('peugeot.com.uy', 0.7785277366638184),
 ('gonzaloruiz.com.uy', 0.774579644203186),
 ('masautos.com.uy', 0.7742289304733276),
 ('autosenuruguay.com', 0.7687051892280579),
 ('mundoautomotor.com', 0.7672389149665833),
 ('rcristofano.com', 0.7585819959640503),
 ('autoschinos.com.uy', 0.7531734108924866),
 ('chana.com.uy', 0.7527319192886353)]

In [4]:
# Third similarity query: the 15 entries ranked most similar to the
# given domain, each paired with its similarity score.
dns_embeddings.most_similar(
    'pornhub.com',
    topn=15,
)

[('youporn.com', 0.8787915110588074),
 ('phncdn.com', 0.8403894305229187),
 ('tube8.com', 0.7953190803527832),
 ('youporn.com.es', 0.7584182620048523),
 ('videospornhub.com', 0.7084945440292358),
 ('xxxcupid.com', 0.6958621740341187),
 ('german-youporn.com', 0.6955263614654541),
 ('pornhubpremium.com', 0.6934264302253723),
 ('genericlink.com', 0.6869221925735474),
 ('youporngay.com', 0.6804925203323364),
 ('videosxxxgratis.info', 0.6801930665969849),
 ('doublepimp.com', 0.6562666893005371),
 ('conejox.com', 0.6490557193756104),
 ('redirect.mysponsor.tv', 0.648033082485199),
 ('cerdas.com', 0.6374837160110474)]

## 3 - Analogical reasoning with DNS vectors

In [5]:
# v1 = atlantida.com.uy (site related to Atlantida, the main resort in Canelones city)
# v2 = maldonado.gub.uy (site for the Maldonado city government)
# v3 = canelones.gub.uy (site for the Canelones city government)

# v1 + v2 - v3 ~= puntaweb.com (site related to Punta del Este, the main resort in Maldonado city)
# v1 + v2 - v3 ~= puntadeleste.com (site related to Punta del Este, the main resort in Maldonado city)

# positive=[v1, v2] and negative=[v3] encode v1 + v2 - v3; keep the 3 best matches.
dns_embeddings.most_similar(positive=['atlantida.com.uy', 'maldonado.gub.uy'], negative=['canelones.gub.uy'], topn=3)

[('destinoatlantida.com', 0.7901754379272461),
 ('puntaweb.com', 0.7796562910079956),
 ('puntadeleste.com', 0.7146644592285156)]

In [6]:
# v1 = puntashopping.com.uy (site for a shopping center in Maldonado city)
# v2 = montevideo.gub.uy (site for the Montevideo city government)
# v3 = maldonado.gub.uy (site for the Maldonado city government)

# v1 + v2 - v3 ~= tiendasmontevideo.com (site for shopping center in Montevideo city)
# v1 + v2 - v3 ~= montevideoshopping.com.uy (site for shopping center in Montevideo city)

In [7]:
# Analogy query v1 + v2 - v3: the shopping-center site and the Montevideo
# government site are the positive terms, the Maldonado government site is
# the negative term; only the 3 best matches are requested.
dns_embeddings.most_similar(
    positive=['puntashopping.com.uy', 'montevideo.gub.uy'],
    negative=['maldonado.gub.uy'],
    topn=3,
)

[('tiendasmontevideo.com', 0.6983073949813843),
 ('pedidosya.com.uy', 0.6858967542648315),
 ('montevideoshopping.com.uy', 0.6752519607543945)]

## 4 - Support for names that are out of the vocabulary

In [8]:
# The query string is apparently misspelled on purpose ('samtander...',
# '.con.uy') to demonstrate a lookup for a name outside the vocabulary;
# the 15 most similar known names are requested.
dns_embeddings.most_similar(
    'samtanderuniversidades.con.uy',
    topn=15,
)

[('santanderuniversidades.com.uy', 0.9953925013542175),
 ('bancamovilsantander.com.uy', 0.9528511762619019),
 ('santander.com.uy', 0.9180302619934082),
 ('multidiscount.net', 0.8109256029129028),
 ('bcu.gub.uy', 0.8083128333091736),
 ('discbank.com.uy', 0.8015027046203613),
 ('browserforthebetter.com', 0.7846260070800781),
 ('brou.com.uy', 0.7511576414108276),
 ('nbc.com.uy', 0.7493535280227661),
 ('intermedia.com.uy', 0.7440301179885864),
 ('bbvabanco.com.uy', 0.7431423664093018),
 ('bandes.com.uy', 0.7383270263671875),
 ('santander.com', 0.7364956140518188),
 ('pb-santander.com', 0.7321802377700806),
 ('prestamoshipotecarios.com.uy', 0.7187671661376953)]

## 5 - Other operations with the embeddings

In [9]:
# Pairwise similarity score between two specific domain names.
dns_embeddings.similarity(
    'elpais.com.uy',
    'lr21.com.uy',
)

0.5719542606249806

In [10]:
# Pairwise similarity score for a second pair, using the same first
# domain as the previous cell so the two scores can be compared.
dns_embeddings.similarity(
    'elpais.com.uy',
    'observador.com.uy',
)

0.6919086171657831

In [11]:
# Index the model by domain name to get that name's raw embedding vector.
# `embedding` is kept as a notebook-level name because the next cell reads it.
embedding = dns_embeddings['montevideo.gub.uy']
# Print the vector's components as plain text.
print(embedding)

[ 2.61009    -4.1921687   2.9463909   1.1478273   1.1328418  -0.12903813
  2.251682   -0.4214612  -1.1774279  -0.86958855  1.7774059  -0.39260012
 -0.16472146 -0.15398341  2.9564018   1.9138947   0.9554471  -0.09910203
 -1.7601196  -3.1151419   0.5680816   3.3717134   1.3116078   0.32379758
 -0.41205385  3.0059516   0.28583077  1.9796828  -1.0896446  -2.1190948
  1.8653215  -2.7476063  -1.5081612  -0.41755986 -0.16186    -0.94643617
  1.6081628   0.9016567   0.8466317  -0.48339814 -1.3045565  -0.06088967
 -0.460761    0.53961706  1.9750918   1.5561733   1.1335715  -0.83484936
  0.3765009  -0.58358943  2.3282328  -0.05932061 -1.8372589  -2.7496917
 -3.9053473   0.04385379 -0.27176103 -0.70694995 -0.14748901 -1.2158818
  1.0473628   1.6690867  -0.7001261  -3.4548273 ]


In [12]:
# Show the dimensionality of the vector stored in `embedding`.
embedding.shape

(64,)

## 6 - References

[1] W. Lopez, <i>"Vector representation of Internet domain names using word embedding techniques,"</i> 
M.S. thesis, Instituto de Computación, Facultad de Ingeniería, Universidad de la República, Montevideo, Uruguay, 2019.

[2] W. Lopez, J. Merlino and P. Rodriguez-Bocca, <i>"Extracting semantic information from Internet Domain Names using word embeddings"</i>,
submitted to Engineering Applications of Artificial Intelligence (ELSEVIER), 2019.

[3] W. Lopez, J. Merlino and P. Rodriguez-Bocca, <i>"Vector representation of internet domain names using a word embedding technique,"</i> 
2017 XLIII Latin American Computer Conference (CLEI), Cordoba, 2017, pp. 1-8.
