In [None]:
# http://annamarbut.blogspot.com/2018/08/whoosh-pandas-and-redshift-implementing.html

In [1]:
import pandas as pd
import glob
import os, os.path

from whoosh.fields import Schema, TEXT
from whoosh import index
from whoosh import index
from whoosh import qparser

In [2]:
# Raise pandas' display limits so wide frames and long text cells are not truncated.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.max_colwidth', 1000),
):
    pd.set_option(_option, _value)

### Data import

In [3]:
%%time
# Directory containing the per-batch pickle files.
Run = "../workproduct-files/batchRuns/"
Run_files = glob.glob(Run + "*.pkl")
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
# Read every batch first and concatenate once. Growing the frame with
# DataFrame.append inside the loop re-copied all accumulated rows per file,
# and DataFrame.append no longer exists in pandas >= 2.0.
Run_frames = [pd.read_pickle(filename) for filename in Run_files]
# pd.concat raises on an empty list, so keep the old "no files -> empty frame" result.
RunDF = pd.concat(Run_frames) if Run_frames else pd.DataFrame()
RunDF = RunDF.sort_index()

Wall time: 2min 41s


In [4]:
%%time
# Load the master table from its pickle (the file name suggests keywords were
# already identified in an earlier step - TODO confirm).
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
t_data = pd.read_pickle("../workproduct-files/t_dataMaster-keywordsIdentified.pkl")

Wall time: 716 ms


In [5]:
# Combine the two frames column-wise, aligned on index labels; the inner join
# keeps only labels present in both. RunDF's first column is dropped by
# position - per the original note it is searchTerms, which would otherwise
# be duplicated in the result.
t_dataRun = pd.concat([t_data, RunDF.iloc[:,1:]], axis = 1, join = "inner")

In [84]:
# Work on a copy: a plain assignment only aliases t_dataRun, so any column
# added to DF_for_index for indexing would silently appear in t_dataRun too.
DF_for_index = t_dataRun.copy()

In [85]:
%%time
def _first_column_as_str(frame):
    """Return ``str(list(...))`` of the first column of ``frame``, or ``""`` if ``frame`` is None."""
    if frame is None:
        return ""
    return str(list(frame.iloc[:, 0]))


# Flatten the nested per-row category frames into plain strings that can be
# passed to the text index.
DF_for_index["wikiMetaPath_str"] = DF_for_index["categoryPath"].apply(_first_column_as_str)
DF_for_index["wikiMetaParents_str"] = DF_for_index["parentCategories"].apply(_first_column_as_str)

Wall time: 7.28 s


### Creating search index

In [86]:
# Index schema: `question` is stored (so hits can return its text) and carries
# a 2.0 field boost; the two wiki-category fields are declared as plain TEXT
# without stored=True, so they are searchable but not returned with a hit.
schema = Schema(question = TEXT (stored = True,  field_boost = 2.0), wikiMetaPath = TEXT, wikiMetaParents = TEXT)

In [87]:
# create and populate index
def populate_index(dirname, dataframe, schema):
    """Create a Whoosh index in ``dirname`` and add one document per row of ``dataframe``.

    Args:
        dirname: Directory that will hold the index files. It is created
            (including missing parent directories) if it does not exist.
        dataframe: pandas DataFrame; every index label is passed to ``add_data``.
        schema: Whoosh ``Schema`` used to create the index.

    Returns:
        None. Progress messages are printed to stdout.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
    # check-then-create race, and nested paths work as well.
    os.makedirs(dirname, exist_ok=True)
    print("Creating the Index")
    # NOTE(review): create_in starts a fresh index in dirname; per the Whoosh
    # docs an index already stored there is cleared - confirm before re-running
    # against a directory you want to keep.
    ix = index.create_in(dirname, schema)

    # Used as a context manager so the writer is finalized when the block ends.
    with ix.writer() as writer:
        print("Populating the Index")
        for i in dataframe.index:
            add_data(i, dataframe, writer)

In [88]:
def add_data(i, dataframe, writer):
    """Send the row labelled ``i`` of ``dataframe`` to ``writer`` as one document.

    Args:
        i: Index label of the row to write.
        dataframe: Frame providing the ``CONS_question``, ``wikiMetaPath_str``
            and ``wikiMetaParents_str`` columns.
        writer: Object exposing ``update_document(**fields)``.
    """
    # Schema field name -> dataframe column that supplies its value.
    column_for_field = {
        "question": "CONS_question",
        "wikiMetaPath": "wikiMetaPath_str",
        "wikiMetaParents": "wikiMetaParents_str",
    }
    document = {
        field: str(dataframe.loc[i, column])
        for field, column in column_for_field.items()
    }
    writer.update_document(**document)

In [89]:
%%time
# Create the index
populate_index("tData_Index1", DF_for_index, schema)

Creating the Index
Populating the Index
Wall time: 44.2 s


### Creating index searcher

In [121]:
def index_search(dirname, search_fields, search_query, limit=100):
    """Search the index stored in ``dirname`` and return the matching questions.

    Args:
        dirname: Directory containing an existing Whoosh index.
        search_fields: List of schema field names the query is matched against.
        search_query: Query string in Whoosh's query language (wildcards such
            as ``key*`` are accepted by the parser used here).
        limit: Maximum number of hits to return. Defaults to 100, the value
            that was previously hard-coded.

    Returns:
        A list with the stored ``question`` text of each hit, in the order the
        searcher returned them.
    """
    ix = index.open_dir(dirname)

    # Terms are OR-ed together; 0.9 is the factor handed to OrGroup.factory
    # (per the Whoosh docs it rewards documents matching more of the terms).
    og = qparser.OrGroup.factory(0.9)
    mp = qparser.MultifieldParser(search_fields, ix.schema, group = og)

    # Turn the user's query string into a query object.
    q = mp.parse(search_query)

    # Stored fields must be read while the searcher is still open, so the
    # result list is built inside the with-block.
    with ix.searcher() as s:
        results = s.search(q, limit = limit)
        return [hit["question"] for hit in results]

### Index search

In [130]:
%%time
# Prefix query ("key*") matched against the question text and both wiki-category fields.
a = index_search("tData_Index1", ['question', 'wikiMetaPath', 'wikiMetaParents'], "key*")

Wall time: 45.9 ms


In [131]:
# Hit count first, then each returned question on its own line.
print(len(a), *a, sep="\n")

74
How many keys are on a standard concert piano?
How many keys are there on a grand piano?
What key is to the right of T on a keyboard?
On a standard keyboard which is the largest key?
The longest key on your keyboard is the _____ bar
On A Standard Computer Keyboard Which Key Is The Largest?
What is the largest key on a standard computer keyboard?
What is the reason behind the layout of the Qwerty keyboard?
What would you do with a celesta?
Which organist who died in 1985 was traditionaly associated with Blackpool Tower?
In Whose Band Is Madonna Wayne Gacy The Keyboardist?
What nationality is the keyboards wizard Vangelis?
What Is The Name Given To A Piano That Plays Mechanically?
This instrument has black and white keys?
Who was the 'keymaster' in Ghostbusters?'?
Ancel Keys developed which US soldiers item?
Which of the Beatles group played piano on Don't Pass Me By?
In Which Keyboard Instrument Are The Strings Plucked Not Struck?
In what key is the dialtone of a telephone?
In whose 

In [220]:
%%time
# Wildcard on both sides of the term, restricted to the two wiki-category
# fields (the question text itself is not searched here).
b = index_search("tData_Index1", ['wikiMetaPath', 'wikiMetaParents'], "*architecture*")

Wall time: 712 ms


In [221]:
# Hit count first, then each returned question on its own line.
print(len(b), *b, sep="\n")

100
Which Famous London Landmark Was Designed by Sir Norman Foster?
In a church, the area where the transept and the nave intersect, usually emphasized by a dome or a tower.
Where is the Taj Mahal?
Who built the Taj Mahal?
What in muslim countries is a' taj?
Which Lyricist Worked With Richard Rogers On Such Songs As Blue Moon , Where Or When, & My Funny Valentine?
What is the Taj Majal made of?
Who Released The 70's Album Entitled My Aim is True?
Where would you find a parlour, scriptorium, dorter and cellarium?
Where would you find a bema narthex and apse?
Jorn Utzon of Denmark designed what landmark?
Which Famous Building Was Designed By Jorn Utzon?
Which religion's holiest shrine is the Golden Temple at Amritsar?
In nautical terms, what name is given to the upper edge of a ship's side?
What does a mosque's mihrab indicate the direction to?
Which Famous Venetian Merchant Travelled To China And Worked For Kublai Khan?
Taidje Khan became famous under which name?
Which American Architec

### Testing

In [53]:
# Scratch check: show what type(None) evaluates to.
type(None)

NoneType

In [75]:
# Scratch check: (rows, columns) of the frame that gets indexed.
DF_for_index.shape

(3000, 32)

In [6]:
# Scratch check: column names available in the combined frame.
t_dataRun.columns

Index(['CONS_id', 'CONS_question', 'CONS_answer', 'CONS_alt answers',
       'CONS_category', 'CONS_alt categories - NOT USED',
       'CONS_type-formulation', 'CONS_type-multipleChoice', 'ORIG_id',
       'ORIG_question', 'ORIG_answer', 'ORIG_alt answers', 'ORIG_category',
       'ORIG_alt categories', 'ORIG_difficulty', 'ORIG_type', 'Source',
       'Duplicate_removed', 'namedEntities', 'nouns', 'objects', 'subjects',
       'nounsObjectsSubjects', 'searchTerms', 'wikipediaSearchSuccessful',
       'findQuestionCategories_meta', 'wikipediaArticleTitle',
       'wikipediaArticleID', 'categoryPath', 'parentCategories'],
      dtype='object')

In [14]:
# Scratch check: first column of the nested parentCategories frame stored at row label 2.
t_dataRun.loc[2, "parentCategories"].iloc[:,0]

0                   Dutch-language_books
1     Personal_accounts_of_the_Holocaust
2        CS1_Dutch-language_sources_(nl)
3                  Forgery_controversies
4     CS1_Japanese-language_sources_(ja)
5                   World_War_II_memoirs
6                    Public_domain_books
7           Memory_of_the_World_Register
8           Books_published_posthumously
9                             Anne_Frank
10                     Jewish_literature
11          Books_relating_to_Anne_Frank
12                      Dutch_literature
13              Books_adapted_into_films
14                               Diaries
15           Doubleday_(publisher)_books
Name: pages.title, dtype: object

In [48]:
# Scratch check: string form of the first column of row 4's categoryPath frame.
str(list(t_dataRun.loc[4, "categoryPath"].iloc[:,0]))

"['Peace', 'Nonviolence', 'Violence', 'Society', 'Main_topic_classifications']"