## Analysis of the Gospels

We will analyze parallelism in two of the Gospels, this time using [Chroma](https://www.trychroma.com/).

See [this](http://datanalytics.com/2023/10/31/paralelismo-embeddings/) for additional details.

In [None]:
import os
import chromadb
import torch
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from langchain.document_loaders import UnstructuredMarkdownLoader

In [None]:
# Directory holding the source texts (one file per gospel).
base_path = '/home/carlos/staging/src/huggingface/texts'

# On-disk Chroma database, stored relative to the current working directory.
client = chromadb.PersistentClient(
    path="./blog_embeddings_00.chromadb",
)

In [None]:
# Start from a clean slate: drop any "evangelios" collection left over from a
# previous run before creating it again.
try:
    client.delete_collection(name="evangelios")
except Exception:
    # Best effort: the delete is expected to fail when the collection does not
    # exist yet (e.g. on the first run). Catching Exception instead of using a
    # bare `except:` lets KeyboardInterrupt / SystemExit propagate.
    pass

# 'hnsw:space': 'cosine' asks Chroma to use cosine distance for this collection.
collection = client.create_collection(
    name='evangelios',
    metadata={'hnsw:space': 'cosine'})

In [None]:
# Load every file found in base_path, split it with the markdown loader in
# "elements" mode, and store each element in the collection, tagged with the
# name of the file it came from.
for file_name in os.listdir(base_path):
    print(file_name)

    elements = UnstructuredMarkdownLoader(
        os.path.join(base_path, file_name),
        mode="elements",
    ).load()

    # Document ids have the form "<file name>_<element number>".
    doc_ids = [f"{file_name}_{n}" for n in range(len(elements))]
    texts = [element.page_content for element in elements]
    sources = [{'source': file_name} for _ in doc_ids]

    collection.add(documents=texts, metadatas=sources, ids=doc_ids)

In [None]:
# Retrieve the stored entries of each gospel, selected by their 'source' metadata.
mateo = collection.get(where={"source": "mateo.txt"})
marcos = collection.get(where={"source": "marcos.txt"})

In [None]:
sentences_mateo = mateo['documents']

# For every passage of Matthew, the passage numbers of its 10 closest passages
# in Mark: {matthew passage number: [mark passage numbers, best match first]}.
mateo_res = {}

for doc_id, sentence in zip(mateo['ids'], sentences_mateo):
    res = collection.query(
        query_texts=[sentence],
        n_results=10,
        where={"source": "marcos.txt"}
    )
    # Ids look like "<file name>_<number>". Key the result by the number stored
    # in the id rather than by the position in the get() result, so Matthew
    # passages are numbered the same way here as in the reverse (Mark ->
    # Matthew) lookup even if get() does not return them in insertion order.
    # rsplit keeps the parsing correct for file names that contain '_'.
    passage = int(doc_id.rsplit('_', 1)[1])
    # One query text was sent, so the matches are in the first (only) row.
    mateo_res[passage] = [int(x.rsplit('_', 1)[1]) for x in res['ids'][0]]



In [None]:
sentences_marcos = marcos['documents']

# For every passage of Mark, the passage numbers of its 10 closest passages
# in Matthew: {mark passage number: [matthew passage numbers, best match first]}.
marcos_res = {}

for doc_id, sentence in zip(marcos['ids'], sentences_marcos):
    res = collection.query(
        query_texts=[sentence],
        n_results=10,
        where={"source": "mateo.txt"}
    )
    # Ids look like "<file name>_<number>". Key the result by the number stored
    # in the id rather than by the position in the get() result, so Mark
    # passages are numbered the same way here as in the reverse (Matthew ->
    # Mark) lookup even if get() does not return them in insertion order.
    # rsplit keeps the parsing correct for file names that contain '_'.
    passage = int(doc_id.rsplit('_', 1)[1])
    # One query text was sent, so the matches are in the first (only) row.
    marcos_res[passage] = [int(x.rsplit('_', 1)[1]) for x in res['ids'][0]]


In [None]:
# Reshape the Matthew -> Mark matches into long format: one row per
# (Matthew passage, rank of the match, matched Mark passage).
tmp_mateo = (
    pd.DataFrame.from_dict(mateo_res)  # one column per Matthew passage
    .T                                 # one row per Matthew passage
    .reset_index()                     # passage number becomes column 'index'
    .melt(id_vars='index')             # one row per (passage, rank) pair
)
tmp_mateo.columns = ['mateo_id', 'pos', 'marcos_id']
tmp_mateo.head()

In [None]:
# Reshape the Mark -> Matthew matches into long format: one row per
# (Mark passage, rank of the match, matched Matthew passage).
tmp_marcos = (
    pd.DataFrame.from_dict(marcos_res)  # one column per Mark passage
    .T                                  # one row per Mark passage
    .reset_index()                      # passage number becomes column 'index'
    .melt(id_vars='index')              # one row per (passage, rank) pair
)
tmp_marcos.columns = ['marcos_id', 'pos', 'mateo_id']
# Put the columns in the same order as tmp_mateo so both tables can be stacked.
tmp_marcos = tmp_marcos.loc[:, ['mateo_id', 'pos', 'marcos_id']]
tmp_marcos.head()

In [None]:
# Stack the matches found in both directions (Matthew -> Mark and
# Mark -> Matthew) into a single table.
tmp = pd.concat(objs=[tmp_mateo, tmp_marcos], axis=0)
tmp.head()

In [None]:
# 0/1 matrix with one row per Matthew passage and one column per Mark passage;
# a cell is 1 when the pair appears in tmp, i.e. when either passage has the
# other among its nearest neighbours.
n_mateo = tmp.mateo_id.max() + 1
n_marcos = tmp.marcos_id.max() + 1
m = np.zeros((n_mateo, n_marcos), dtype=int)
# Integer-array indexing sets every listed (row, column) cell in one step.
m[tmp.mateo_id.to_numpy(), tmp.marcos_id.to_numpy()] = 1

In [None]:
# Show the match matrix transposed, so Matthew passages run along the
# horizontal axis and Mark passages along the vertical one. The trailing
# semicolon suppresses the notebook's textual output for the cell.
plt.imshow(
    np.transpose(m),
    cmap='binary',
    interpolation='none',
);