# Title Embeddings

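Before using this notebook, run `create_title_embeddings.ipynb` first. This notebook loads the article dataframe, including its `embeddings` column, from `Data/dataframes/article_dataframe.feather`.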

In [1]:
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

In [2]:
# Load the article table (one row per article, including an `embeddings`
# column) from the feather file on disk.
article_feather_path = 'Data/dataframes/article_dataframe.feather'
articles = pd.read_feather(article_feather_path)

In [3]:
# Display the loaded frame as the cell output; the notebook renders a
# truncated view showing the first and last rows.
articles

Unnamed: 0,article,article_unrendered_unicode,category,linkSource,linkTarget,distances,plain_text,embeddings
0,Áedán_mac_Gabráin,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,[subject.History.British_History.British_Histo...,Áedán_mac_Gabráin,"[Bede, Columba, Dál_Riata, Great_Britain, Irel...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nÁedán mac Gabráin\n\n2007 Sch...,"[-0.12923911213874817, 0.02362193539738655, -0..."
1,Åland,%C3%85land,"[subject.Countries, subject.Geography.European...",Åland,"[20th_century, Baltic_Sea, Crimean_War, Curren...","{'10th_century': 2.0, '11th_century': 2.0, '12...",#copyright\n\nÅland\n\n2007 Schools Wikiped...,"[-0.05364985018968582, -0.018478475511074066, ..."
2,Édouard_Manet,%C3%89douard_Manet,[subject.People.Artists],Édouard_Manet,"[Absinthe, Beer, Claude_Monet, Diego_Velázquez...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nÉdouard Manet\n\n2007 Schools...,"[-0.01709255389869213, 0.08353389799594879, -0..."
3,Éire,%C3%89ire,"[subject.Countries, subject.Geography.European...",Éire,"[Canada, English_language, George_VI_of_the_Un...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nÉire\n\n2007 Schools Wikipedi...,"[0.04833950847387314, 0.046594519168138504, 0...."
4,Óengus_I_of_the_Picts,%C3%93engus_I_of_the_Picts,[subject.History.British_History.British_Histo...,Óengus_I_of_the_Picts,"[Dál_Riata, Durham, England, Great_Britain, Ir...","{'10th_century': 2.0, '11th_century': 2.0, '12...",#copyright\n\nÓengus I of the Picts\n\n2007...,"[-0.07649108022451401, 0.10481206327676773, -0..."
...,...,...,...,...,...,...,...,...
4599,Zionism,Zionism,"[subject.People.Political_People, subject.Reli...",Zionism,"[18th_century, 19th_century, Adolf_Hitler, Alb...","{'10th_century': 2.0, '11th_century': 2.0, '12...",#copyright\n\nZionism\n\n2007 Schools Wikip...,"[-0.016020476818084717, 0.0894060954451561, -0..."
4600,Zirconium,Zirconium,[subject.Science.Chemistry.Chemical_elements],Zirconium,"[Aluminium, Arabic_language, Australia, Bicycl...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nZirconium\n\n2007 Schools Wik...,"[-0.10549122095108032, 0.023488083854317665, -..."
4601,Zoroaster,Zoroaster,[subject.People.Religious_figures_and_leaders],Zoroaster,"[18th_century, 9th_century, Afghanistan, Age_o...","{'10th_century': 2.0, '11th_century': 2.0, '12...",#copyright\n\nZoroaster\n\n2007 Schools Wik...,"[-0.06657274067401886, 0.12227798998355865, -0..."
4602,Zuid-Gelders,Zuid-Gelders,"[subject.Geography.European_Geography, subject...",Zuid-Gelders,"[Brabantian, Dutch_language, East_Flemish, Hol...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nZuid-Gelders\n\n2007 Schools ...,"[-0.173138827085495, 0.09522440284490585, 0.00..."


# Embedding Projector

The Embedding Projector is a tool from TensorFlow for exploring embedding spaces interactively. Let's save our embeddings and article titles as TSV files so they can be loaded there.

https://projector.tensorflow.org/

Note that the embedding projector can also be used locally.

In [4]:
# Export the embedding vectors as a tab-separated file: one vector per row,
# with no header row and no index column.
embeddings_path = Path('Data/embeddings/embeddings.tsv')

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it already does).
embeddings_path.parent.mkdir(parents=True, exist_ok=True)

# Separate embeddings as a new DataFrame (one column per vector component)
embeddings_df = pd.DataFrame(articles['embeddings'].tolist())

# Save embeddings to TSV
embeddings_df.to_csv(embeddings_path, sep='\t', header=False, index=False)

In [6]:
# Write the article titles as the label file that accompanies the vectors:
# a single column, one title per line, no header row and no index column.
# NOTE(review): this lands in the current working directory, whereas the
# vectors are written under Data/embeddings/ -- confirm that is intended.
metadata_df = articles.loc[:, ['article']]
metadata_df.to_csv('metadata.tsv', sep='\t', index=False, header=False)

In [10]:
# Show the row(s) whose title is exactly 'Code_of_Hammurabi'.
is_code_of_hammurabi = articles['article'].eq('Code_of_Hammurabi')
articles.loc[is_code_of_hammurabi]

Unnamed: 0,article,article_unrendered_unicode,category,linkSource,linkTarget,distances,plain_text,embeddings
958,Code_of_Hammurabi,Code_of_Hammurabi,[subject.History.Ancient_History_Classical_His...,Code_of_Hammurabi,"[Agriculture, France, God, Hammurabi, Iran, La...","{'10th_century': 2.0, '11th_century': 3.0, '12...",#copyright\n\nCode of Hammurabi\n\n2007 Sch...,"[-0.04269905760884285, 0.1044374480843544, -0...."


In [None]:


# Project the title embeddings down to two dimensions with t-SNE and draw
# them as a scatter plot.

# Stack the per-article embedding vectors into a single 2-D array
# (one row per article), which is the input layout fit_transform expects.
matrix = np.array(articles['embeddings'].tolist())

# Create a t-SNE model and transform the data.
# random_state is fixed so repeated runs produce the same layout.
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(matrix)

# n_components=2, so each row of vis_dims is one (x, y) point.
x = vis_dims[:, 0]
y = vis_dims[:, 1]

# No per-point values are supplied via `c`, so no colormap is passed:
# matplotlib ignores `cmap` (with a warning) when `c` is absent. Points are
# drawn in the default colour; alpha keeps dense regions readable.
fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.3)
ax.set_title("Article Names visualized using t-SNE")
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2")
plt.show()