In [1]:
!pip install faiss-cpu
!pip install transformers
!pip install sentence-transformers
!pip install python-docx

import json
import torch
import numpy as np
import faiss
import sentence_transformers
from sentence_transformers import SentenceTransformer
import pandas as pd
import docx
import re
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 18.8 MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 17.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |██████████████████

In [73]:
class bert_index:
  """Sentence-level embedding helper for a folder of .docx documents.

  Wraps a SentenceTransformer model and exposes the pipeline steps used in
  this notebook: find .docx files, read their paragraphs, split paragraphs
  into sentence fragments, embed the fragments, and embed a query string.
  """

  def __init__(self, model_name):
    """Store ``model_name`` and load the matching SentenceTransformer model."""
    self.model_name = model_name
    self.model = SentenceTransformer(model_name)

  def compile_path(self, root_path):
    """Return the paths of all .docx files found under ``root_path``.

    The directory tree is walked recursively. Directories and file names are
    visited in sorted order so the result (and therefore every row number
    derived from it later) is the same on every run.

    Files whose name starts with '~$' are skipped: those are the temporary
    owner/lock files Word leaves next to an open document and they are not
    readable as documents.
    """
    path_list = []
    for root, dirs, files in os.walk(root_path):
      # Sorting ``dirs`` in place fixes the order os.walk descends in.
      dirs.sort()
      for name in sorted(files):
        if name.endswith('.docx') and not name.startswith('~$'):
          path_list.append(os.path.join(root, name))
    return path_list

  def import_corpus(self, path_list):
    """Read each .docx file and return its paragraphs as text.

    Returns a list with one entry per path, in the same order as
    ``path_list``; each entry is the list of paragraph strings of that file.
    """
    doc_list = []
    for file_path in path_list:
      doc_list.append([para.text for para in docx.Document(file_path).paragraphs])
    return doc_list

  def process_corpus(self, doc_list):
    """Split every document's paragraphs into cleaned sentence fragments.

    For each paragraph, runs of whitespace (tabs, newlines, repeated spaces)
    are collapsed to single spaces before the paragraph is handed to
    ``split_sentences``.

    Returns a NEW list of lists (one list of fragments per document, same
    order as ``doc_list``). The input is left untouched so the raw paragraphs
    stay available and re-running the step gives the same result.
    """
    processed = []
    for document in doc_list:
      sentences = []
      for para in document:
        para = " ".join(para.split())
        # split_sentences already drops empty/very short fragments.
        sentences.extend(self.split_sentences(para))
      processed.append(sentences)
    return processed

  @staticmethod
  def split_sentences(para):
    """Split ``para`` into trimmed fragments on '.', '?', '!' and ';'.

    Fragments of three characters or fewer (after stripping) are discarded.
    Declared as a static method so it can be called both as
    ``bert_index.split_sentences(text)`` and through an instance.
    """
    for delim in ['?', '!', ';']:
      para = para.replace(delim, '.')
    sentences = [x.strip() for x in para.split('.')]
    return [x for x in sentences if len(x) > 3]

  def create_index(self, doc_list, path_list):
    """Embed every sentence and record which file it came from.

    ``doc_list[k]`` must be the sentences of the file at ``path_list[k]``.

    Returns ``(corpus_index, corpus_embeddings)``: two parallel lists where
    ``corpus_index[i]`` is the source path of the sentence whose embedding is
    ``corpus_embeddings[i]``. Rows follow document order, then sentence order.
    """
    corpus_index = []
    corpus_embeddings = []
    for index, document in enumerate(doc_list):
      if not document:
        # Nothing to embed for this file; skip the model call.
        continue
      embeddings = self.model.encode(document)
      # One path entry per embedding keeps the two lists aligned.
      corpus_index.extend([path_list[index]] * len(embeddings))
      corpus_embeddings.extend(embeddings)
    return corpus_index, corpus_embeddings

  def encode_query(self, query):
    """Return the embedding of ``query`` produced by the loaded model."""
    query_embeddings = self.model.encode(query)
    return query_embeddings

In [74]:
# Build the indexer; bert_index loads this model name through SentenceTransformer.
search_ind = bert_index('multi-qa-MiniLM-L6-cos-v1')

In [75]:
# Recursively collect the paths of the .docx files under /content/.
paths = search_ind.compile_path('/content/')

In [76]:
# Show which .docx files were found.
paths

['/content/212100JUL17 MOCK Order to 233.docx',
 '/content/FRAGO to OPORD 21-001 TSC BWC 31JAN2021.docx',
 '/content/Untitled Folder/OPORD 22-10 (Ranger Buddy Competition).docx']

In [77]:
# Read each file into a list of paragraph strings (one inner list per path, same order as `paths`).
docs = search_ind.import_corpus(paths)

In [78]:
# Inspect the raw paragraph text exactly as read from the files.
docs

[['\t\t\t\t\t\t\t\t\t                                                                                              Copy __ of __ copies',
  '\t\t\t\t\t\t\t\t        \t        OC/T’s ',
  '\t\t                                                                                              FHL, CA',
  '\t\t                                                                                              212100JUL17',
  '',
  'FRAGMENTARY ORDER MR 413-233-0008 to Operation Gate Keeper (Unclassified)',
  '',
  '(U) References:  ',
  '',
  '– c. (U) No change. ',
  '',
  '(U) Time Zone used Throughout the OPLAN/OPORD: Tango',
  '',
  '(U) SITUATION. No change.',
  '',
  '         c. (U) Enemy Forces. Special Purpose Forces (SPF) will attempt to destroy the supply lines running through LSA FHL and supporting the 3rd Infantry Division. SPF Will also attempt to gather information on units and agencies within LSA FHL. ',
  '',
  '\t\t(1) MLCOA: SPF operate in 3-5 man teams, use small mobile unmarked ve

In [79]:
# Collapse whitespace in every paragraph and split it into short sentence fragments.
proc_docs = search_ind.process_corpus(docs)

In [80]:
# Inspect the sentence fragments produced for each document.
proc_docs

[['Copy __ of __ copies',
  'OC/T’s',
  'FHL, CA',
  '212100JUL17',
  'FRAGMENTARY ORDER MR 413-233-0008 to Operation Gate Keeper (Unclassified)',
  '(U) References:',
  '(U) No change',
  '(U) Time Zone used Throughout the OPLAN/OPORD: Tango',
  '(U) SITUATION',
  'No change',
  '(U) Enemy Forces',
  'Special Purpose Forces (SPF) will attempt to destroy the supply lines running through LSA FHL and supporting the 3rd Infantry Division',
  'SPF Will also attempt to gather information on units and agencies within LSA FHL',
  '(1) MLCOA: SPF operate in 3-5 man teams, use small mobile unmarked vehicles and attack logistical convoys and unprotected single vehicles movements',
  '(2) MDCA: SPF operate in 3-8 man teams and attack the civilian populace while destroying the route infrastructures (Bridges, narrow passage ways)',
  '(U) Friendly Forces: No change',
  '(U) Interagency, Intergovernmental, and Nongovernmental Organizations: No Change',
  '(U) Civil Considerations: No Change',
  '(U)

In [81]:
# Embed every sentence fragment; docrefs[i] is the source file path for the embedding embs[i].
docrefs, embs = search_ind.create_index(proc_docs, paths)

In [82]:
# Show the per-sentence source paths, then how many rows the index holds.
print(docrefs, len(docrefs), sep="\n")

['/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/212100JUL17 MOCK Order to 233.docx', '/content/21

In [83]:
# Print the list of sentence embeddings (one vector per entry in docrefs).
print(embs)

[array([-9.60344225e-02, -4.13563401e-02, -3.99562381e-02,  1.41698020e-02,
       -2.65286528e-02, -5.31508289e-02,  5.60514852e-02,  1.01663172e-03,
        3.89301777e-02, -4.97084633e-02,  1.18047118e-01,  1.74917113e-02,
        1.72665864e-02, -6.33297712e-02, -7.10199922e-02, -1.96235310e-02,
       -4.82642725e-02,  6.15502186e-02, -6.09937590e-03, -8.37092288e-03,
       -2.20264420e-02,  1.03579946e-02, -4.96159941e-02,  8.82762447e-02,
        4.02521864e-02,  6.62735151e-03, -8.55540633e-02, -2.22709663e-02,
       -3.40277515e-02, -5.72428387e-03, -5.38877323e-02,  1.58576183e-02,
       -5.35364896e-02, -4.76571433e-02,  2.56226640e-02,  1.35338604e-01,
        3.49531174e-02, -1.45171927e-02,  4.91163954e-02,  5.67700528e-03,
        3.03107891e-02, -7.22936094e-02, -3.56507190e-02,  2.39345860e-02,
       -1.04484849e-01,  6.15023300e-02, -4.52049496e-03,  1.27005383e-01,
        5.38547412e-02, -2.72404011e-02, -3.94619443e-02,  2.18748823e-02,
       -1.16362691e-01, 

In [90]:
# Embed the search query with the same model that embedded the corpus sentences.
a = search_ind.encode_query('scheme of maneuver')

In [91]:
from sklearn.metrics.pairwise import cosine_similarity

In [92]:
# Compare the query embedding with every sentence embedding.
# The similarity matrix (1 row, one column per sentence) is computed once and
# reused for both prints instead of being recomputed for each.
query_scores = cosine_similarity([a], embs)
print(query_scores)
print(len(query_scores[0]))

[[-5.57061285e-02  9.24700797e-02  5.08311167e-02  5.03067337e-02
   1.22398190e-01  3.83950993e-02  1.12771980e-01  1.69292748e-01
   1.74829125e-01  9.34526920e-02  1.79065719e-01  9.58703458e-02
  -6.47435784e-02  1.64406046e-01  2.14009434e-01  1.94678292e-01
  -9.66560245e-02  5.21197133e-02  6.28870167e-03  1.40703410e-01
   1.40343606e-01  1.12943649e-01  1.73245236e-01  2.71699727e-01
   6.25281334e-01  1.22215196e-01  9.36257653e-03  4.10695344e-01
   1.32369131e-01 -3.75707783e-02  2.97467381e-01  1.72857374e-01
   3.02813709e-01  1.37186915e-01  2.03890324e-01  1.15407310e-01
   2.90876418e-01  2.60622710e-01  1.63041040e-01  3.65624800e-02
   2.84095407e-01  1.05558574e-01  1.85032040e-01 -5.84129244e-03
   2.10984442e-02  2.16723010e-01  1.03031486e-01  8.83292779e-02
   3.55993584e-02 -8.75519142e-02  9.49250013e-02 -8.64468068e-02
   2.03986168e-02  1.52498120e-02  2.78205685e-02  1.62537873e-01
   1.83193624e-01  4.42611948e-02  1.74829125e-01  3.92557830e-02
  -9.42999

In [93]:
# Rank every indexed sentence by cosine similarity to the query, best first.
# The similarity row is computed once; recomputing it inside the dict
# comprehension repeated the full query-vs-corpus comparison for every row.
scores = cosine_similarity([a], embs)[0]

# Flatten the sentences in the same document-then-sentence order used when the
# index was built, so row number w refers to the same sentence in both. Built
# here so this cell does not depend on a variable defined further down.
flat_sentences = [sentence for doc in proc_docs for sentence in doc]

refdict = {i: scores[i] for i in range(len(docrefs))}
for w in sorted(refdict, key=refdict.get, reverse=True):
  print(w, refdict[w])
  print(flat_sentences[w])

216 0.9340292
Scheme of Movement and Maneuver
24 0.62528133
(1) Scheme of maneuver: Unit will travel to objective with no less than 3 platforms to objectives
27 0.41069534
Company is to decisively engage the opposition in rout and on the objective to disrupt any and all offensive maneuvers
163 0.33088174
The route is fairly flat mostly on paved surfaces
302 0.30783045
(1) Plan and execute RBC in conjunction with the 3rd BDE
32 0.3028137
(2) Routes: SP from TAA Schoonover, R1, R2, R3, RP at Grid FE69067650
30 0.29746738
(U) Coordinating Instructions:
193 0.29508194
Concept of Operations
395 0.29138106
Concept of Operations Brief
36 0.29087642
Concept of Operations:
89 0.28815532
(U) Coordinating Instructions
40 0.2840954
(3) Prepare terrain models for briefing Company Defense Plan
303 0.27781504
Coordinating Instructions
23 0.27169973
(U) Concept of Operation
329 0.26284385
(c) Rules: See MOI (para 5)
206 0.26207894
These lanes begin with the completion of the 9-15 Mile Foot March
184 0

In [88]:
# Flatten the per-document sentence lists into one list, keeping document order.
flatlist = [sentence for document in proc_docs for sentence in document]

In [89]:
# Total number of sentence fragments; this should match len(docrefs), since one embedding is stored per sentence.
len(flatlist)

397