In [7]:
import os
import sqlite3
import tempfile

from txtai.pipeline import Tokenizer
from txtai.vectors import WordVectors

print("Streaming tokens to temporary file")

# Location of the SQLite database that holds the sentences.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
db_dir = os.path.abspath("/home/ematos/devel/kardec/kardec_laravel/database")
database_file = os.path.join(db_dir, "dataset.txtai.sqlite3")

# Path of the temporary tokens file; set once the file has been created
tokens = None

try:
  # Stream tokens to temp working file. delete=False keeps the file on disk after
  # the with block so it can be passed to WordVectors.build by path; it is removed
  # in the finally clause below. UTF-8 is set explicitly so writing non-ASCII text
  # does not depend on the locale's default encoding.
  with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as output:
    # Save file path
    tokens = output.name

    db = sqlite3.connect(database_file)
    try:
      cur = db.cursor()
      cur.execute("SELECT sentence from sentence")

      # One line per sentence, tokens separated by single spaces
      for row in cur:
        output.write(" ".join(Tokenizer.tokenize(row[0])) + "\n")
    finally:
      # Free database resources even if tokenizing or writing fails
      db.close()

  # Build word vectors model - 300 dimensions, 3 min occurrences
  WordVectors.build(tokens, 300, 3, "pk-dataset")
finally:
  # Remove temporary tokens file, also when an earlier step raised
  if tokens is not None and os.path.exists(tokens):
    os.remove(tokens)

# Show files: IPython shell escape that lists the current working directory
!ls -l

Streaming tokens to temporary file


Read 0M words
Number of words:  2748
Number of labels: 0
Progress: 100.0% words/sec/thread:   45391 lr:  0.000000 avg.loss:  2.730300 ETA:   0h 0m 0s


total 14160
-rw-r--r-- 1 ematos ematos 4685824 set  7 17:39 pk-dataset.magnitude
-rw-r--r-- 1 ematos ematos 9808984 set  7 17:39 pk-dataset.txt
-rw-r--r-- 1 ematos ematos    2223 set  7 17:38 test_sqlite.ipynb


In [9]:
import sqlite3

import regex as re

from txtai.embeddings import Embeddings
from txtai.pipeline import Tokenizer

def stream():
  """
  Yields one (id, tokens, None) tuple per row of the sentence table.

  Each row's text is tokenized with Tokenizer.tokenize. Rows that produce no
  tokens are counted but not yielded. A progress message is printed every
  1000 rows and a total once all rows have been read. The database connection
  is closed when the generator finishes, is closed early, or raises.
  """

  # Connection to database file
  db_dir = os.path.abspath("/home/ematos/devel/kardec/kardec_laravel/database")
  database_file = os.path.join(db_dir, "dataset.txtai.sqlite3")

  db = sqlite3.connect(database_file)
  try:
    cur = db.cursor()

    # Select every sentence; idItem is fetched but not used below
    cur.execute("SELECT idSentence,idItem,sentence from sentence")

    count = 0
    for uid, _, text in cur:
      # Tokenize text
      tokens = Tokenizer.tokenize(text)

      # Count every row read, including those skipped below
      count += 1
      if count % 1000 == 0:
        print("Streamed %d documents" % (count), end="\r")

      # Skip documents with no tokens parsed
      if tokens:
        yield (uid, tokens, None)

    print("Iterated over %d total rows" % (count))
  finally:
    # Free database resources, also when the consumer stops early or an error is raised
    db.close()

# Embeddings backed by the pk-dataset.magnitude vectors, with BM25 scoring and pca set to 3
embeddings = Embeddings(dict(path="pk-dataset.magnitude", scoring="bm25", pca=3))

# The scoring index is only built when a scoring method is configured;
# it consumes its own pass over the rows
scoring_method = embeddings.config.get("scoring")
if scoring_method:
  embeddings.score(stream())

# Second pass over the rows builds the embeddings index
embeddings.index(stream())


Iterated over 4900 total rows
Iterated over 4900 total rows


In [22]:
import pandas as pd

from IPython.display import display, HTML

# Show full sentence text instead of truncating long cells
pd.set_option("display.max_colwidth", None)

# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
db_dir = os.path.abspath("/home/ematos/devel/kardec/kardec_laravel/database")
database_file = os.path.join(db_dir, "dataset.txtai.sqlite3")

db = sqlite3.connect(database_file)

results = []
try:
  cur = db.cursor()

  # Top 10 matches for the query; only the id is used, the score is ignored
  for uid, score in embeddings.search("intelectuais que não sabem conversar", 10):

    print(uid)

    # Fetch id, item and text with a single lookup per match
    cur.execute("SELECT idSentence, idItem, sentence FROM sentence WHERE idSentence = ?", [uid])
    row = cur.fetchone()

    # A returned id may have no matching row; skip it instead of failing on unpack
    if row is not None:
      results.append(row)
finally:
  # Free database resources, also when a lookup fails
  db.close()

df = pd.DataFrame(results, columns=["idSentence","idItem", "sentence"])

# It has been reported that displaying HTML within VSCode doesn't work.
# When using VSCode, the data can be exported to an external HTML file to view.
# See example below.

# htmlData = df.to_html(index=False)
# with open("data.html", "w") as file:
#     file.write(htmlData)

display(HTML(df.to_html(index=False)))

4628
3602
644
657
3599
3601
3833
3600
379
2505


idSentence,idItem,sentence
4628,284,AK.
3602,241,"Haverá outra às 16h10, que chegará às 22h15."
644,81,Ao Sr.
657,82,"Paris, 17 de Fevereiro de 1863 Sf."
3599,241,"Há uma às 7h50 que chega a Paris às 15h45, e outra às 18h que chega a Paris às 5h15."
3601,241,"Pelo trem de correios, a partir de 15 de maio, a partida será às 6h55 e chegará a Paris às 12h20."
3833,250,Sic.
3600,241,"A partir de 15 de maio, o horário será modificado da seguinte forma: primeira partida de Tours às 8h30, chegada a Paris às 16h20; segunda partida às 20h15, chegada a Paris às 4h20."
379,59,Senhor Houat.
2505,187,"Parto decididamente na quarta-feira no comboio das 10h25 da manhã, que chega em Paris à noite, às 18h20."
