# Generate Embeddings

Let's show that embeddings give us a way to extract semantic relationships between articles. We introduce the concept by first working only with the article titles.

In [14]:
import os

import numpy as np
import pandas as pd


In [15]:
# Read the article dataframe from its feather file on disk
articles = pd.read_feather(path='Data/dataframes/article_dataframe.feather')

In [16]:
# Select the 'article' column, which holds the name of every article,
# and display it as the cell output
article_names = articles.loc[:, 'article']
article_names

0           Áedán_mac_Gabráin
1                       Åland
2               Édouard_Manet
3                        Éire
4       Óengus_I_of_the_Picts
                ...          
4599                  Zionism
4600                Zirconium
4601                Zoroaster
4602             Zuid-Gelders
4603                     Zulu
Name: article, Length: 4604, dtype: object

In [None]:
# https://huggingface.co/blog/getting-started-with-embeddings
# Identifier of the sentence-embedding model to use.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
# Read the access token from the HF_TOKEN environment variable so the real
# credential never has to be pasted into (and saved with) the notebook.
# The original placeholder is kept as the fallback when the variable is unset.
hf_token = os.environ.get("HF_TOKEN", "insert token here")


In [18]:
import requests

# Bearer-token authorization header sent with every request
headers = {"Authorization": f"Bearer {hf_token}"}
# Feature-extraction pipeline endpoint for the selected model
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"


In [19]:
def query(texts, timeout=None):
    """Request embeddings for `texts` from the feature-extraction endpoint.

    Sends a single POST request to the module-level ``api_url`` with the
    module-level ``headers``. The ``wait_for_model`` option is set so the
    service is asked to wait for the model instead of answering right away
    while it is still unavailable.

    Args:
        texts: The input(s) to embed; forwarded unchanged as the
            ``"inputs"`` field of the JSON payload.
        timeout: Optional timeout (seconds) passed to ``requests.post``.
            ``None`` (the default) keeps the original behaviour of
            waiting indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service responds with a 4xx/5xx status.
    """
    payload = {"inputs": texts, "options": {"wait_for_model": True}}
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning the error JSON, which
    # the caller would otherwise store as if it were the embeddings.
    response.raise_for_status()
    return response.json()


In [20]:
# Convert the Series of article names into a plain Python list
article_names_list = article_names.to_list()

In [21]:
# Turn every underscore in a title into a space
article_names_list = [" ".join(title.split("_")) for title in article_names_list]

In [22]:
# Display the titles after underscores were replaced with spaces
article_names_list

['Áedán mac Gabráin',
 'Åland',
 'Édouard Manet',
 'Éire',
 'Óengus I of the Picts',
 '€2 commemorative coins',
 '10th century',
 '11th century',
 '12th century',
 '13th century',
 '14th century',
 '15th Marine Expeditionary Unit',
 '15th century',
 '16 Cygni',
 '16 Cygni Bb',
 '16th century',
 '1755 Lisbon earthquake',
 '17th century',
 '1896 Summer Olympics',
 '18th century',
 '1928 Okeechobee Hurricane',
 '1973 oil crisis',
 '1980 eruption of Mount St. Helens',
 '1997 Pacific hurricane season',
 '19th century',
 '1 Ceres',
 '1st century',
 '1st century BC',
 '2-6-0',
 '2-8-0',
 '2003 Atlantic hurricane season',
 '2004 Atlantic hurricane season',
 '2004 Indian Ocean earthquake',
 '2005 Atlantic hurricane season',
 '2005 Hertfordshire Oil Storage Terminal fire',
 '2005 Kashmir earthquake',
 '2005 Lake Tanganyika earthquake',
 '2005 Sumatra earthquake',
 '20th century',
 '21st century',
 '2nd century',
 '3 Juno',
 '3rd century',
 '4-2-0',
 '4-4-0',
 '4-6-0',
 '47 Ursae Majoris',
 '47 U

In [23]:
# Send all cleaned titles to the embedding endpoint in one call
output = query(texts=article_names_list)

In [24]:
# Store the query result as a new 'embeddings' column on the article table.
# NOTE(review): assumes `output` holds one entry per row of `articles`, in the
# same order as the titles that were sent — confirm before relying on it.
articles['embeddings'] = output

In [25]:
# Display the dataframe, now including the 'embeddings' column
articles

Unnamed: 0,article,article_unrendered_unicode,category,linkSource,linkTarget,distances,plain_text,embeddings
0,Áedán_mac_Gabráin,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,[subject.History.British_History.British_Histo...,Áedán_mac_Gabráin,"[Bede, Columba, Dál_Riata, Great_Britain, Irel...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nÁedán mac Gabráin\n\n2007 Sch...,"[-0.12923911213874817, 0.02362193539738655, -0..."
1,Åland,%C3%85land,"[subject.Countries, subject.Geography.European...",Åland,"[20th_century, Baltic_Sea, Crimean_War, Curren...","{'10th_century': 2.0, '11th_century': 2.0, '12...",#copyright\n\nÅland\n\n2007 Schools Wikiped...,"[-0.05364985018968582, -0.018478475511074066, ..."
2,Édouard_Manet,%C3%89douard_Manet,[subject.People.Artists],Édouard_Manet,"[Absinthe, Beer, Claude_Monet, Diego_Velázquez...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nÉdouard Manet\n\n2007 Schools...,"[-0.01709255389869213, 0.08353389799594879, -0..."
3,Éire,%C3%89ire,"[subject.Countries, subject.Geography.European...",Éire,"[Canada, English_language, George_VI_of_the_Un...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nÉire\n\n2007 Schools Wikipedi...,"[0.04833950847387314, 0.046594519168138504, 0...."
4,Óengus_I_of_the_Picts,%C3%93engus_I_of_the_Picts,[subject.History.British_History.British_Histo...,Óengus_I_of_the_Picts,"[Dál_Riata, Durham, England, Great_Britain, Ir...","{'10th_century': 2.0, '11th_century': 2.0, '12...",#copyright\n\nÓengus I of the Picts\n\n2007...,"[-0.07649108022451401, 0.10481206327676773, -0..."
...,...,...,...,...,...,...,...,...
4599,Zionism,Zionism,"[subject.People.Political_People, subject.Reli...",Zionism,"[18th_century, 19th_century, Adolf_Hitler, Alb...","{'10th_century': 2.0, '11th_century': 2.0, '12...",#copyright\n\nZionism\n\n2007 Schools Wikip...,"[-0.016020476818084717, 0.0894060954451561, -0..."
4600,Zirconium,Zirconium,[subject.Science.Chemistry.Chemical_elements],Zirconium,"[Aluminium, Arabic_language, Australia, Bicycl...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nZirconium\n\n2007 Schools Wik...,"[-0.10549122095108032, 0.023488083854317665, -..."
4601,Zoroaster,Zoroaster,[subject.People.Religious_figures_and_leaders],Zoroaster,"[18th_century, 9th_century, Afghanistan, Age_o...","{'10th_century': 2.0, '11th_century': 2.0, '12...",#copyright\n\nZoroaster\n\n2007 Schools Wik...,"[-0.06657274067401886, 0.12227798998355865, -0..."
4602,Zuid-Gelders,Zuid-Gelders,"[subject.Geography.European_Geography, subject...",Zuid-Gelders,"[Brabantian, Dutch_language, East_Flemish, Hol...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nZuid-Gelders\n\n2007 Schools ...,"[-0.173138827085495, 0.09522440284490585, 0.00..."


In [26]:
# Length of the embedding stored for the row labelled 0
len(articles.loc[0, 'embeddings'])

384

In [None]:
# Persist the dataframe, including the new 'embeddings' column, to a feather file
articles.to_feather(path='Data/dataframes/article_dataframe_with_embeddings.feather')