In [8]:
from PIL import Image

import pytesseract
import glob, os
from pprint import pprint
import requests

from dotenv import load_dotenv
load_dotenv()


True

In [20]:

# Read my zsh history into an array
with open(os.getenv("HOME") + "/.zsh_history") as f:
    history = f.readlines()

# Remove the timestamp
history = [line.split(";")[-1].strip() for line in history]
# history = [line.split(";")[1] for line in history]

# Remove duplicates
history = list(set(history))


# Remove the newline character
display(history[0:10])


['',
 'asdf install plugin ruby',
 'gnb slower-video',
 'cd test-failed-collections',
 'echo $FASTLANE_USER',
 'yarn up:prod -y',
 'rm test/types.d.ts',
 'brew install weaveworks/tap/eksctl',
 'git diff src/ws',
 'conda init zsh']

In [47]:
# Find similar commands using the Hugging Face Inference API
token = os.environ['HUGGING_FACE_TOKEN']

API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
headers = {"Authorization": f"Bearer {token}"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

def best_matches_remote(command):
    output = query({
      "inputs": {
        "source_sentence": command,
        "sentences": history
      },
    })
    if ('error' in output):
      raise Exception(output['error'])

    scored_entries = list(zip(history, output))
    top_matches = sorted(scored_entries, key=lambda x: x[1], reverse=True)[:10]

    print(f"\nTop matches for command `{command}`:")
    for command, probability in top_matches:
      print(f"\t{command.ljust(30)}: {probability:.2f}")

    return sorted(scored_entries, key=lambda x: x[1], reverse=True)

best_matches_remote("test collections resolver")


Top matches for command `test collections resolver`:
	open test/data/test-failed-collections: 0.56
	./bin/ctest --watchAll --testPathPattern map.resolver: 0.55
	./bin/ctest --watchAll --testPathPattern map.resolver.e2e: 0.52
	./bin/ctest --watchAll --testPathPattern my-blocks.resolver: 0.51
	./bin/ctest --watchAll --testPathPattern map-resolver.e2e: 0.51
	cd test-failed-collections    : 0.50
	./bin/ctest --watchAll --testPathPattern /collection.entity: 0.50
	./bin/ctest --watchAll --testPathPattern upgrade-block.resolver: 0.49
	git checkout -b test-failing-collections: 0.41
	./bin/cyarn command test-failed-collections: 0.40


[('open test/data/test-failed-collections', 0.557887852191925),
 ('./bin/ctest --watchAll --testPathPattern map.resolver', 0.5521204471588135),
 ('./bin/ctest --watchAll --testPathPattern map.resolver.e2e',
  0.5180151462554932),
 ('./bin/ctest --watchAll --testPathPattern my-blocks.resolver',
  0.5114028453826904),
 ('./bin/ctest --watchAll --testPathPattern map-resolver.e2e',
  0.5093898177146912),
 ('cd test-failed-collections', 0.5004050731658936),
 ('./bin/ctest --watchAll --testPathPattern /collection.entity',
  0.49936869740486145),
 ('./bin/ctest --watchAll --testPathPattern upgrade-block.resolver',
  0.4852907657623291),
 ('git checkout -b test-failing-collections', 0.40675827860832214),
 ('./bin/cyarn command test-failed-collections', 0.3983614444732666),
 ('find src/ | entr -s "./bin/cyarn command test-failed-collections"',
  0.3666009306907654),
 ('rm test/types.d.ts', 0.36168211698532104),
 ('cd docquery-tests', 0.34414801001548767),
 ('git checkout src/map/map.resolver.ts

In [34]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the history locally using the model
history_embeddings = model.encode(history, convert_to_tensor=True)


In [68]:
# Find the closest matches based on cosine similarity
def best_matches_local(command):
    command_embedding = model.encode([command], convert_to_tensor=True)
    cosine_scores = util.cos_sim(history_embeddings, command_embedding)

    scored_entries = list(zip(history, [float(s[0]) for s in cosine_scores]))
    scored_entries = sorted(scored_entries, key=lambda x: x[1], reverse=True)

    return scored_entries[0:10]


best_matches_local("test collections resolver")


[('open test/data/test-failed-collections', 0.5578877329826355),
 ('./bin/ctest --watchAll --testPathPattern map.resolver', 0.5521202683448792),
 ('./bin/ctest --watchAll --testPathPattern map.resolver.e2e',
  0.5180152654647827),
 ('./bin/ctest --watchAll --testPathPattern my-blocks.resolver',
  0.5114029049873352),
 ('./bin/ctest --watchAll --testPathPattern map-resolver.e2e',
  0.5093897581100464),
 ('cd test-failed-collections', 0.5004051327705383),
 ('./bin/ctest --watchAll --testPathPattern /collection.entity',
  0.499368816614151),
 ('./bin/ctest --watchAll --testPathPattern upgrade-block.resolver',
  0.48529064655303955),
 ('git checkout -b test-failing-collections', 0.40675830841064453),
 ('./bin/cyarn command test-failed-collections', 0.3983615040779114)]

In [66]:

# Just for fun let's try indexing with FAISS

import faiss
import numpy as np

# Index the history
index = faiss.IndexIDMap(faiss.IndexFlatIP(384))

display(model.encode(history).shape)

index.add_with_ids(model.encode(history), np.array(range(0, len(history_embeddings))))

def best_matches_faiss(command):
    command_embedding = model.encode([command])

    top_entries = index.search(command_embedding, 10)

    return [history[i] for i in top_entries[1][0]]

best_matches_faiss("test collections resolver")

(1368, 384)

['open test/data/test-failed-collections',
 './bin/ctest --watchAll --testPathPattern map.resolver',
 './bin/ctest --watchAll --testPathPattern map.resolver.e2e',
 './bin/ctest --watchAll --testPathPattern my-blocks.resolver',
 './bin/ctest --watchAll --testPathPattern map-resolver.e2e',
 'cd test-failed-collections',
 './bin/ctest --watchAll --testPathPattern /collection.entity',
 './bin/ctest --watchAll --testPathPattern upgrade-block.resolver',
 'git checkout -b test-failing-collections',
 './bin/cyarn command test-failed-collections']

In [67]:
# Benchmark performance of best_matches_local vs best_matches_faiss

import time

def benchmark(command):
    start = time.time()
    best_matches_local(command)
    end = time.time()
    print(f"Local search took {end - start} seconds")

    start = time.time()
    best_matches_faiss(command)
    end = time.time()
    print(f"FAISS search took {end - start} seconds")

benchmark("git commit -m")


Top matches for command `git commit -m`:


['git commit',
 'git commit -m "release"',
 'git commit -a',
 'git commit -m "0.1.1"',
 'git commit -m "0.1.0"',
 'git commit -m "correct provider"',
 'git commit -m "v0.1.3"',
 'git commit -m "final changes before cutover"',
 'git commit -m "Check the ToS whenever"',
 'git commit -am "more cleanup"']

Local search took 0.016939163208007812 seconds
FAISS search took 0.01026606559753418 seconds
