## Imports

In [2]:
import os
import re
import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
# for nbviewer: 
# import plotly.io as pio
# pio.renderers.default = "notebook_connected"

# RIP Warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, message="elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison")
warnings.filterwarnings("ignore", category=FutureWarning, message="'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.")

## Functions

In [3]:
# Grab Corpus from GitHub
def create_corpus_from_github_api(url):
  # URL on the Github where the csv files are stored
  github_url = url
  response = requests.get(github_url)

  corpus = []
  # Check if the request was successful
  if response.status_code == 200:
    files = response.json()
    for file in files:
      if file["download_url"][-3:] == "csv":
        corpus.append(pd.read_csv(file["download_url"], encoding="utf-8", index_col="Unnamed: 0").fillna(""))
  else:
    print('Failed to retrieve files:', response.status_code)

  return corpus

# Grab Metadata from Github
def get_metadata_from_raw_github(url):
  metadata = pd.read_csv(url, encoding="utf-8", index_col="Unnamed: 0").fillna("")
  return metadata

# Rinap01 + Rinap05 als df
corpus = create_corpus_from_github_api('https://api.github.com/repos/DigitalPasts/ALP-course/contents/course_notebooks/data/rinap01')
corpus.extend(create_corpus_from_github_api('https://api.github.com/repos/DigitalPasts/ALP-course/contents/course_notebooks/data/rinap05'))

# print first rows table:
corpus[0].head()

Unnamed: 0,ref,inst,frag,norm,cf,sense,pos,unicode,unicode_word,reading,break,break_perc,mask,lang,text,line,word
9515,Q003414.2.1,perʾu[bud//offspring]N$perʾi,NUNUZ,perʾi,perʾu,offspring,N,['𒉭'],𒉭,NUNUZ,['complete'],0.0,,,Q003414,2,1
9516,Q003414.2.2,Baltil[Aššur]QN,bal-til{ki},Baltil,Baltil,Aššur,QN,"['𒁄', '𒌀', '𒆠']",𒁄𒌀𒆠,bal-til{KI},"['complete', 'complete', 'complete']",0.0,,,Q003414,2,2
9517,Q003414.2.3,šūquru[very valuable]AJ,šu-⸢qu-ru⸣,šūquru,šūquru,very valuable,AJ,"['𒋗', '𒄣', '𒊒']",𒋗𒄣𒊒,šu-⸢qu-ru⸣,"['complete', 'damaged', 'damaged']",0.33,,,Q003414,2,3
9518,Q003414.2.4,narām[loved one]N,na-ram,narām,narāmu,loved one,N,"['𒈾', '𒉘']",𒈾𒉘,na-ram,"['complete', 'complete']",0.0,,,Q003414,2,4
9519,Q003414.2.5,u,{d}[(...),,,,u,"['𒀭', 'x']",𒀭x,{d}[x],"['complete', 'missing']",0.5,,,Q003414,2,5


In [4]:
# extract text_ids (list of unique ids)
text_ids = []
for text in corpus:
  text_ids.append(text["text"].iloc[0])
# .iloc[0] to select the first row

# print(text_ids)

In [5]:
# Akkadian metadata
metadata = get_metadata_from_raw_github("https://raw.githubusercontent.com/DigitalPasts/ALP-course/master/course_notebooks/data/rinap1_5_metadata.csv")

for id in text_ids:
  if id not in metadata.index:
    print(f"Text {id} missing from metadata")

# Only grab metadata for texts with IDs
metadata = metadata[metadata.index.isin(text_ids)]
#metadata

In [6]:
def split_df_by_column_value(df, column):

    dfs = []
    column_values = df[column].unique()
    for value in column_values:
        split_df = df[df[column]==value]
        dfs.append(split_df)
    return dfs

split_df_by_column_value(corpus[0].head(), "line")

[              ref                          inst         frag    norm      cf  \
 9515  Q003414.2.1  perʾu[bud//offspring]N$perʾi        NUNUZ   perʾi   perʾu   
 9516  Q003414.2.2               Baltil[Aššur]QN  bal-til{ki}  Baltil  Baltil   
 9517  Q003414.2.3       šūquru[very valuable]AJ   šu-⸢qu-ru⸣  šūquru  šūquru   
 9518  Q003414.2.4             narām[loved one]N       na-ram   narām  narāmu   
 9519  Q003414.2.5                             u    {d}[(...)                   
 
               sense pos          unicode unicode_word      reading  \
 9515      offspring   N            ['𒉭']            𒉭        NUNUZ   
 9516          Aššur  QN  ['𒁄', '𒌀', '𒆠']          𒁄𒌀𒆠  bal-til{KI}   
 9517  very valuable  AJ  ['𒋗', '𒄣', '𒊒']          𒋗𒄣𒊒   šu-⸢qu-ru⸣   
 9518      loved one   N       ['𒈾', '𒉘']           𒈾𒉘       na-ram   
 9519                  u       ['𒀭', 'x']           𒀭x       {d}[x]   
 
                                      break  break_perc mask lang     text  \
 9515 

In [7]:
def df2str(df, column, break_perc=1, mask=True, segmentation=True):

    # check if column exists in dataframe. If not, return empty text.
    if column not in df.columns:
        return ("", 0, 0)
    else:
        # remove rows that include duplicate values for compound words
        if column not in ["norm", "cf", "sense", "pos"]:
            df = df.drop_duplicates("ref").copy()
        # if column entry is empty string, replace with UNK (can happen with normalization or lemmatization)
        mask_empty = df[column]==""
        df[column] = df[column].where(~mask_empty, other="UNK")
        # mask proper nouns
        if mask and "pos" in df.columns:
            mask_bool = df["pos"].isin(["PN", "RN", "DN", "GN", "MN", "SN", "n"])
            df[column] = df[column].where(~mask_bool, other=df["pos"])
        # change number masking from `n` to `NUM`
        # !comment out for Egyptian
        if mask:
            mask_num = df[column]=="n"
            df[column] = df[column].where(~mask_num, other="NUM")
        # remove rows without break_perc (happens with non-Akkadian words)
        if "" in df["break_perc"].unique():
            df = df[df["break_perc"]!=""].copy()
        # filter according to break_perc
        mask_break = df["break_perc"] <= break_perc
        df[column] = df[column].where(mask_break, other="X")
        # calculate text length with and without UNK and x tokens
        text_length_full = df.shape[0]
        mask_partial = df[column].isin(["UNK", "X", "x"])
        text_length_partial = text_length_full - sum(mask_partial)
        # create text lines
        text = ""
        df_lines = split_df_by_column_value(df, "line")
        for line in df_lines:
            word_list = list(filter(None, line[column].to_list()))
            if word_list != []:
                text += " ".join(map(str, word_list)).replace("x", "X").strip() + "\n"

        # Why is this here?
        if segmentation is False:
            # remove all white spaces (word segmentation and line breaks)
            text = re.sub(r"[\s\u00A0]+", "", text)

        return (text, text_length_full, text_length_partial)

In [8]:
df2str(corpus[0], "cf")

('perʾu Baltil šūquru narāmu UNK DN UNK UNK UNK UNK\npitqu DN ša ana bēlūtu mātu UNK UNK UNK UNK UNK UNK\nrabû ana šarrūtu šakkanakku UNK UNK UNK UNK UNK UNK\nmuṣṣibu šagigurrû ana UNK UNK UNK UNK UNK UNK UNK UNK šurīnu\nzikaru dannu nūru kiššatu nišu etellu UNK kalû malku UNK UNK UNK\ndāʾipu gērû eṭlu qardu sāpinu UNK nakru ša huršānu\netguru kīma qû salātu UNK UNK UNK UNK UNK UNK\n',
 75,
 39)

In [9]:
# Adding Named Entities
def get_lemmatized_texts(corpus, break_perc=0.6, mask=False):

    texts_dict = {}
    for df in corpus:
        # get the text number from the dataframe "text" column
        key = df["text"].iloc[0]
        text, text_length_full, text_length_partial = df2str(df, "cf", break_perc, mask)
        texts_dict[key] = (text, text_length_full, text_length_partial)
    return texts_dict

In [10]:
get_lemmatized_texts((split_df_by_column_value(corpus[0], "text")))

{'Q003414': ('perʾu Baltil šūquru narāmu UNK DN UNK UNK UNK UNK\npitqu DN ša ana bēlūtu mātu X UNK UNK UNK UNK UNK\nrabû ana šarrūtu šakkanakku UNK X UNK UNK UNK UNK\nmuṣṣibu šagigurrû ana UNK UNK UNK UNK UNK X UNK X šurīnu\nzikaru dannu nūru kiššatu nišu etellu X X malku UNK UNK UNK\ndāʾipu gērû eṭlu qardu sāpinu X nakru ša huršānu\netguru kīma qû salātu X UNK UNK UNK UNK UNK\n',
  75,
  38)}

In [11]:
def get_normalized_texts(corpus, break_perc=1, mask=True):

    texts_dict = {}
    for df in corpus:
        # get the text number from the dataframe "text" column
        key = df["text"].iloc[0]
        text, text_length_full, text_length_partial = df2str(df, "norm", break_perc, mask)
        texts_dict[key] = (text, text_length_full, text_length_partial)
    return texts_dict

In [12]:
get_normalized_texts((split_df_by_column_value(corpus[0], "text")))

{'Q003414': ('perʾi Baltil šūquru narām UNK DN UNK UNK UNK UNK\npitiq DN ša ana bēlūt mātāti UNK UNK UNK UNK UNK UNK\nirbû ana šarrūti šakkanakku UNK UNK UNK UNK UNK UNK\nmuṣṣib šagigurê ana UNK UNK UNK UNK UNK UNK UNK UNK šurinnī\nzikaru dannu nūr kiššat nišīšu etel UNK kal malkī UNK UNK UNK\ndāʾipu gārêšu eṭlu qardu sāpinu UNK nakiri ša hursānī\netgurūti kīma qê usallituma UNK UNK UNK UNK UNK UNK\n',
  75,
  39)}

**Function to convert the dataframes into strings of segmented unicode texts**.
* param corpus: a list of dataframes
* param break_perc: a parameter which dictates whether to include broken words depending on the percentage of how broken they are.
                       Compares this value to the `break_perc` column in the dataframe.
                       Parameter is set to 1 (i.e. all words, whether broken or not, are included); can be any float between 0 and 1.
* param mask: boolean whether to mask named entities or not; set to True.
* return: a dictionary where the keys are the text IDs and the values are the segmented unicode texts

In [13]:
def get_segmented_unicode_texts(corpus, break_perc=1, mask=True):

    texts_dict = {}
    for df in corpus:
        # get the text number from the dataframe "text" column
        key = df["text"].iloc[0]
        text, text_length_full, text_length_partial = df2str(df, "unicode_word", break_perc, mask)
        texts_dict[key] = (text, text_length_full, text_length_partial)
    return texts_dict

get_segmented_unicode_texts((split_df_by_column_value(corpus[0], "text")))

{'Q003414': ('𒉭 𒁄𒌀𒆠 𒋗𒄣𒊒 𒈾𒉘 𒀭X DN 𒀊 𒁀 X X\n𒉿𒄘 DN 𒃻 𒀀𒈾 𒁁𒂁 𒆳𒆳 X 𒀭 𒋾 𒀠 𒋃 X\n𒅕𒁍𒌑 𒀀𒈾 𒈗𒌑𒋾 𒄊𒀴 X X X X X 𒈠\n𒈬𒍦 𒊮𒅆𒃸𒎌 𒀀𒈾 X X X X X X X X 𒋗𒊑𒅔𒉌\n𒍣𒅗𒊒 𒆗𒉡 𒉡𒌨 𒆧𒆳 𒌦𒎌𒋗 𒂊𒌀 X 𒆗 𒂷𒆠 X X 𒋾\n𒁕𒄿𒁍 𒂵𒊑𒂊𒋗 𒄨 𒃼𒁺 𒊓𒉿𒉡 X 𒈾𒆠𒊑 𒃻 𒄯𒊓𒀀𒉌\n𒀉𒄖𒊒𒋾 𒆠𒈠 𒆠𒂊 𒌑𒊩𒇷𒌅𒈠 𒌑X 𒌑 X X X X\n',
  75,
  50)}

In [14]:
def vectorize(corpus, analyzer="word", ngram_range=(1,1), max_df=1.0, min_df=1, max_features=None, stop_words=["UNK", "X"]):

    vectorizer = TfidfVectorizer(
         input="content", lowercase=False, analyzer=analyzer,
         # RegEx for Akkadian
         token_pattern=r"(?u)\b\w+\b",
         ngram_range=ngram_range,
         max_df=max_df,
         min_df=min_df,
         max_features=max_features,
         stop_words=stop_words)

    counts = vectorizer.fit_transform(corpus["text"].tolist()).toarray()
    #stop_words = vectorizer.stop_words_
    stop_words = getattr(vectorizer, 'stop_words_', None)

    # saving the vocab used for vectorization, and switching the dictionary so that the feature index is the key
    vocab = vectorizer.vocabulary_
    switched_vocab = {value: key for key, value in vocab.items()}
    # adding the vocab words to the counts dataframe for easier viewing.
    column_names = []
    x = 0
    while x < len(switched_vocab):
        column_names.append(switched_vocab[x])
        x += 1

    counts_df = pd.DataFrame(counts, index=corpus.index, columns=column_names)

    return (counts, counts_df, stop_words)


In [15]:
vectorize(corpus[0], analyzer="word")

(array([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
      

#### calculating distances between vectorized documents

In [16]:
def distance_calculator(counts, metric, text_ids):

    return pd.DataFrame(squareform(pdist(counts, metric=metric)), index=text_ids, columns=text_ids)

In [17]:
def reduce_dimensions_pca(df, metadata, col1, col2):

    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(df)
    reduced_df = pd.DataFrame(data=reduced_data, index=df.index, columns=[col1, col2 ])
    reduced_df_metadata = metadata.join(reduced_df)
    return reduced_df_metadata

In [18]:
def reduce_dimensions_tsne(df, perplexity, n_iter, metric, metadata):

    tsne = TSNE(n_components=2, perplexity=perplexity, n_iter=n_iter, metric=metric, init="pca")
    reduced_data = tsne.fit_transform(df)
    reduced_df = pd.DataFrame(data=reduced_data, index=df.index, columns=["component 1", "component 2"])
    reduced_df_metadata = metadata.join(reduced_df)
    return reduced_df_metadata

In [19]:
# Function to combine processed texts with metadata

def get_corpus_metadata(texts_dict, metadata):
  texts_df = pd.DataFrame(texts_dict, index=["text", "full_length", "partial_length"]).transpose()
  df = metadata.join(texts_df)
  return df

In [20]:
## vectorize lemma forms - das ist die Ur-Einstellung

corpus_dict = get_lemmatized_texts(corpus, break_perc=0)
## vectorize normalized forms
corpus_dict2 = get_normalized_texts(corpus, break_perc=0)
## vectorize Unicode cuneiform
corpus_dict3 = get_segmented_unicode_texts(corpus, break_perc=0)

corpus_metadata = get_corpus_metadata(corpus_dict, metadata)
corpus_metadata2 = get_corpus_metadata(corpus_dict2, metadata)
corpus_metadata3 = get_corpus_metadata(corpus_dict3, metadata)

## For Akkadian
## remove texts which have less than n words excluding UNK and X
# n = 10
n = 50
print(f"Lemmatized: Number of texts before filtering: {corpus_metadata.shape[0]}")
print(f"Normalized: Number of texts before filtering: {corpus_metadata2.shape[0]}")
print(f"Unicode: Number of texts before filtering: {corpus_metadata3.shape[0]}")

corpus_metadata = corpus_metadata[corpus_metadata["partial_length"]>=n]
corpus_metadata2 = corpus_metadata2[corpus_metadata2["partial_length"]>=n]
corpus_metadata3 = corpus_metadata3[corpus_metadata3["partial_length"]>=n]
print(f"Number of texts after filtering: {corpus_metadata.shape[0]}")
print(f"Number of texts after filtering: {corpus_metadata2.shape[0]}")
print(f"Number of texts after filtering: {corpus_metadata3.shape[0]}")


# For Egyptian use this instead, resetting the index
#n = 10
#print(f"Number of texts before filtering: {corpus_metadata.shape[0]}")
#corpus_metadata = corpus_metadata[corpus_metadata["partial_length"]>=n].set_index("popular_name")
#print(f"Number of texts after filtering: {corpus_metadata.shape[0]}")

Lemmatized: Number of texts before filtering: 432
Normalized: Number of texts before filtering: 432
Unicode: Number of texts before filtering: 432
Number of texts after filtering: 135
Number of texts after filtering: 134
Number of texts after filtering: 140


In [21]:
# vectorize corpus
counts, counts_df, stop_words = vectorize(corpus_metadata, max_features=200)
counts2, counts_df2, stop_words2 = vectorize(corpus_metadata2, max_features=200)
counts3, counts_df3, stop_words3 = vectorize(corpus_metadata3, max_features=200)

In [22]:
from plotly.subplots import make_subplots

def make_fig_3hm(fig1, fig2, fig3, title):
    fig_hm = make_subplots (rows=1, cols=3, subplot_titles=("Lemma", "Normalized", "Unicode"))
    for trace in fig1.data:
        fig_hm.add_trace(trace, row=1, col=1)
    
    for trace in fig2.data:
        fig_hm.add_trace(trace, row=1, col=2)
    
    for trace in fig3.data:
        fig_hm.add_trace(trace, row=1, col=3)
    
    fig_hm.update_layout(
        autosize=False,
        width=1500,
        height=500,
        title_text=title,
    )

    fig_hm.update_xaxes(showticklabels=False)
    fig_hm.update_yaxes(showticklabels=False)
    return fig_hm

In [31]:
# Euclidean Heatmaps:
fig1 = px.imshow(distance_calculator(counts, "euclidean", corpus_metadata.index))
fig2 = px.imshow(distance_calculator(counts2, "euclidean", corpus_metadata2.index))
fig3 = px.imshow(distance_calculator(counts3, "euclidean", corpus_metadata3.index))

# Cosine Heatmaps:
fig_lemm_co = px.imshow(distance_calculator(counts, "cosine", corpus_metadata.index))
fig_norm_co = px.imshow(distance_calculator(counts2, "cosine", corpus_metadata2.index))
fig_unic_co = px.imshow(distance_calculator(counts3, "cosine", corpus_metadata3.index))

# Cityblock Heatmaps: 
fig_lemm_ci = px.imshow(distance_calculator(counts, "cityblock", corpus_metadata.index))
fig_norm_ci = px.imshow(distance_calculator(counts2, "cityblock", corpus_metadata2.index))
fig_unic_ci = px.imshow(distance_calculator(counts3, "cityblock", corpus_metadata3.index))


# Building the Comparisons: 
fig_eu_hm=make_fig_3hm(fig1, fig2, fig3, title="Comparing Similarity Matrix Heatmaps Rinap1+5: Euclidean")
fig_eu_hm.show()

fig_co_hm=make_fig_3hm(fig_lemm_co, fig_norm_co, fig_unic_co, title="Comparing Similarity Matrix Heatmaps Rinap1+5: Cosine")
fig_co_hm.show()

fig_ci_hm=make_fig_3hm(fig_lemm_ci, fig_norm_ci, fig_unic_ci, title="Comparing Similarity Matrix Heatmaps Rinap1+5: Cityblock")
fig_ci_hm.show()

In [28]:
# From the endless archives of 'This should have been a function':

# reduce matrix dimensions euclidean:
def reduce_tsne (counts, metadata, metric1, metric2,):
   matrix = distance_calculator(counts, metric2, metadata.index)
   result = reduce_dimensions_tsne(matrix, perplexity=matrix.shape[0]**0.5, n_iter=5000, metric=metric1, metadata=metadata)
   return result

reduced_tsne_lem = reduce_tsne(counts, corpus_metadata, "euclidean", "euclidean")
reduced_tsne_norm = reduce_tsne(counts2, corpus_metadata2, "euclidean", "euclidean")
reduced_tsne_unic = reduce_tsne(counts3, corpus_metadata3, "euclidean", "euclidean")


# reduced_tsne_lem = reduce_dimensions_tsne(distance_calculator(counts, "euclidean", corpus_metadata.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="euclidean", metadata=corpus_metadata)
# reduced_tsne_norm = reduce_dimensions_tsne(distance_calculator(counts2, "euclidean", corpus_metadata2.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="euclidean", metadata=corpus_metadata2)
# reduced_tsne_unic = reduce_dimensions_tsne(distance_calculator(counts3, "euclidean", corpus_metadata3.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="euclidean", metadata=corpus_metadata3)



# reduce matrix dimensions euclidean:
reduced_tsne_lem_co = reduce_tsne (counts, corpus_metadata, "cosine", "cosine")
reduced_tsne_norm_co = reduce_tsne (counts2, corpus_metadata2, "cosine", "cosine")
reduced_tsne_unic_co = reduce_tsne (counts3, corpus_metadata3, "cosine", "cosine")

# reduced_tsne_lem_co = reduce_dimensions_tsne(distance_calculator(counts, "cosine", corpus_metadata.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="cosine", metadata=corpus_metadata)
# reduced_tsne_norm_co = reduce_dimensions_tsne(distance_calculator(counts2, "cosine", corpus_metadata2.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="cosine", metadata=corpus_metadata2)
# reduced_tsne_unic_co = reduce_dimensions_tsne(distance_calculator(counts3, "cosine", corpus_metadata3.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="cosine", metadata=corpus_metadata3)

# reduce matrix dimensions cityblock:
reduced_tsne_lem_ci = reduce_tsne (counts, corpus_metadata, "cityblock", "cityblock")
reduced_tsne_norm_ci = reduce_tsne (counts2, corpus_metadata2, "cityblock", "cityblock")
reduced_tsne_unic_ci = reduce_tsne (counts3, corpus_metadata3, "cityblock", "cityblock")

# reduced_tsne_lem_ci = reduce_dimensions_tsne(distance_calculator(counts, "cityblock", corpus_metadata.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="cityblock", metadata=corpus_metadata)
# reduced_tsne_norm_ci = reduce_dimensions_tsne(distance_calculator(counts2, "cityblock", corpus_metadata2.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="cityblock", metadata=corpus_metadata2)
# reduced_tsne_unic_ci = reduce_dimensions_tsne(distance_calculator(counts3, "cityblock", corpus_metadata3.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="cityblock", metadata=corpus_metadata3)

# TSNE euclidean + distance cosine
reduced_tsne_lem_mi = reduce_tsne (counts, corpus_metadata, "euclidean", "cosine")
reduced_tsne_norm_mi = reduce_tsne (counts2, corpus_metadata2, "euclidean", "cosine")
reduced_tsne_unic_mi = reduce_tsne (counts3, corpus_metadata3, "euclidean", "cosine")

# reduced_tsne_lem_mi = reduce_dimensions_tsne(distance_calculator(counts, "cosine", corpus_metadata.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="euclidean", metadata=corpus_metadata)
# reduced_tsne_norm_mi = reduce_dimensions_tsne(distance_calculator(counts2, "cosine", corpus_metadata2.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="euclidean", metadata=corpus_metadata2)
# reduced_tsne_unic_mi = reduce_dimensions_tsne(distance_calculator(counts3, "cosine", corpus_metadata3.index), perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="euclidean", metadata=corpus_metadata3)

# Reducing Dimensions with PCA df, metadata, col1 , col2 
red_pca_lem = reduce_dimensions_pca(distance_calculator(counts, "euclidean", corpus_metadata.index), corpus_metadata, "component 1", "component 2")
red_pca_norm = reduce_dimensions_pca(distance_calculator(counts2, "euclidean", corpus_metadata2.index), corpus_metadata2, "component 1", "component 2")
red_pca_unic = reduce_dimensions_pca(distance_calculator(counts3, "euclidean", corpus_metadata3.index), corpus_metadata3, "component 1", "component 2")


In [30]:
def make_fig_3px(df1, df2, df3, title):

    size_min = 4
    size_max = 100
    size1 = (df1["partial_length"] / df1["partial_length"].max() * (size_max - size_min) + size_min).tolist()
    size2 = (df2["partial_length"] / df2["partial_length"].max() * (size_max - size_min) + size_min).tolist()
    size3 = (df3["partial_length"] / df3["partial_length"].max() * (size_max - size_min) + size_min).tolist()

    fig1 = px.scatter(df1, x="component 1",y="component 2", size=size1, color="project", symbol="script", hover_data=["partial_length", "full_length", df1.index])
    fig1.update_traces(marker=dict(line=dict(width=1, color='black')))
    fig2 = px.scatter(df2, x="component 1",y="component 2", size=size2, color="project", symbol="script", hover_data=["partial_length", "full_length", df2.index])
    fig2.update_traces(marker=dict(line=dict(width=1, color='black')))
    fig3 = px.scatter(df3, x="component 1",y="component 2", size=size3, color="project", symbol="script", hover_data=["partial_length", "full_length", df3.index])
    fig3.update_traces(marker=dict(line=dict(width=1, color='black')))

    fig_px = make_subplots (rows=3, cols=1 , subplot_titles=("Lemma", "Normalized", "Unicode"))

    for trace in fig1.data:
        fig_px.add_trace(trace, row=1, col=1)
    for trace in fig2.data:
        fig_px.add_trace(trace, row=2, col=1)
    for trace in fig3.data:
        fig_px.add_trace(trace, row=3, col=1)
    
    fig_px.update_layout(
        autosize=False,
        width=1500,
        height=1500,
        title_text=title,
    )
    return fig_px

fig_comp_red_tsne_euclid = make_fig_3px(reduced_tsne_lem, reduced_tsne_norm, reduced_tsne_unic, title="TSNE euclidean + distance_calculator euclidean")
fig_comp_red_tsne_euclid.show()

fig_com_red_tsne_cosin = make_fig_3px(reduced_tsne_lem_co, reduced_tsne_norm_co, reduced_tsne_unic_co, title="TSNE cosine + distance_calculator cosine")
fig_com_red_tsne_cosin.show()

fig_com_red_tsne_city = make_fig_3px(reduced_tsne_lem_ci, reduced_tsne_norm_ci, reduced_tsne_unic_ci, title="TSNE cityblock + distance_calculator cityblock")
fig_com_red_tsne_city.show()

fig_com_red_tsne_mix = make_fig_3px(reduced_tsne_lem_mi, reduced_tsne_norm_mi, reduced_tsne_unic_mi, title="TSNE euclidean + distance_calculator cosine (as in the example)")
fig_com_red_tsne_mix.show()

fig_com_red_pca = make_fig_3px(red_pca_lem, red_pca_norm, red_pca_unic, title="PCA + distance_calculator euclidean")
fig_com_red_pca.show()

# Question: 
In the `distance_calculator` function we pass a `metric` argument. In the example it is *cosine*. In the `reduce_dimensions_tsne` we also pass a `metric` argument. In the example notebook it is *euclidean*. As we are using the *matrix* resulting from `distance_calculator`, does it matter which metric we are using with TSNE? How do these two relate?


From the example: 
```python
# calculate distance between vectorized texts
matrix = distance_calculator(counts, "cosine", corpus_metadata.index)
```    
and:
```python
# reduce matrix dimensions
reduced_tsne = reduce_dimensions_tsne(matrix, perplexity=matrix.shape[0]**0.5, n_iter=5000, metric="euclidean", metadata=corpus_metadata)
```