In [1]:
from flask import Flask, request, jsonify
import redis
import csv
import msgpack
import pandas as pd
import numpy as np
import time
import argparse
import difflib
import json
import time
import pickle
from collections import defaultdict
import tensorflow
from keras_preprocessing.text import tokenizer_from_json
from keras.models import load_model
from keras_preprocessing.sequence import pad_sequences

In [2]:
with open('../englishwords.json') as f:
    data = json.load(f)

words = data.keys()
words = list(words)

In [2]:
class TrieNode_dict:
    def __init__(self):
        self.children = defaultdict(TrieNode_dict)
        self.is_word = False
        self.description = None
        

class Trie_dict:
    def __init__(self):
        self.root = TrieNode_dict()
        self.count = 0
        
    def insert(self, word, description=None):
        current = self.root
        for char in word:
            current = current.children[char]
        if not current.is_word:
            current.is_word = True
            self.count += 1
        current.description = description
    
    def search(self, word):
        current = self.root
        for char in word:
            if char not in current.children:
                return None
            current = current.children[char]
        if current.is_word:
            return current.description
        return None

    def insert_list(self, lst):
        for word in lst:
            self.insert(word) 

    def size(self):
        return self.count
    
    
    def insert_dict(self, dict_obj):
        for key, definition in dict_obj.items():
            self.insert(key, definition)

    
    def fuzzy_search(self, word, cutoff=0.6):
        results = difflib.get_close_matches(word, self.words(), n=10, cutoff=cutoff)
        return {result: (self.search(result), difflib.SequenceMatcher(None, word, result).ratio()) for result in results}
        
    def words(self):
        words = []
        def dfs(node, word):
            if node.is_word:
                words.append(word)
            for char in node.children:
                dfs(node.children[char], word + char)
        dfs(self.root, "")
        return words

#trie_dict.insert_list(words)

In [5]:
import wordcloud
import nltk
from nltk.corpus import wordnet 

nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/lingechettyr/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lingechettyr/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lingechettyr/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lingechettyr/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lingechettyr/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
# Main Function
dict_obj = defaultdict()

def get_definition(word):
    synsets = wordnet.synsets(word)
    return synsets[0].definition() if synsets else None
  
for i in range(len(words)):
  definition = get_definition(words[i])
  if definition != None:
    dict_obj[words[i]] = definition

In [10]:
dict_obj['around']

'in the area or vicinity'

In [13]:
with open('../english_dict.pkl', 'wb') as f:
    pickle.dump(dict_obj, f)

In [29]:
with open('../english_dict.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

In [30]:
loaded_dict['test']

'trying something to find out about it'

In [31]:
trie_dict_test = Trie_dict()

In [32]:
trie_dict_test.insert_dict(loaded_dict)

In [34]:
trie_dict_test.fuzzy_search("about")

{'about': ('on the move', 1.0),
 'bout': ('(sports) a division during which one team is on the offensive',
  0.8888888888888888),
 'abut': ('lie adjacent to another or share a boundary', 0.8888888888888888),
 'abought': ('make amends for', 0.8333333333333334),
 'sabot': ('a shoe carved from a single block of wood', 0.8),
 'jabot': ("a ruffle on the front of a woman's blouse or a man's shirt", 0.8),
 'cabot': ('son of John Cabot who was born in Italy and who led an English expedition in search of the Northwest Passage and a Spanish expedition that explored the La Plata region of Brazil; in 1544 he published a map of the world (1476-1557)',
  0.8),
 'bouts': ('(sports) a division during which one team is on the offensive',
  0.8),
 'abuts': ('lie adjacent to another or share a boundary', 0.8),
 'abort': ('the act of terminating a project or procedure before it is completed',
  0.8)}

In [17]:
def take_time(df, r):
    start = time.time()
    r.set("key", msgpack.packb(df.to_dict('records')))
    end = time.time()
    elapsed = end - start
        
    print({'message': f'Data stored successfully. Time elapsed: {elapsed} seconds'})

In [42]:
csv_file = 'vasu_df.csv'
    
    # Read data from CSV
df = pd.read_csv(csv_file)
df_fifty_lines = df.sample(n = 50)
df_1000_lines = df.sample(n = 1000)
df_10000_lines = df.sample(n  =10000)
df_30000_lines = df.sample(n = 30000)
    
    # Store data in Redis
r = redis.StrictRedis(host='localhost', port=6379, db=0)

take_time(df_fifty_lines, r)
take_time(df_1000_lines, r)
take_time(df_10000_lines, r)
take_time(df_30000_lines, r)
take_time(df, r)

{'message': 'Data stored successfully. Time elapsed: 0.0010979175567626953 seconds'}
{'message': 'Data stored successfully. Time elapsed: 0.004744529724121094 seconds'}
{'message': 'Data stored successfully. Time elapsed: 0.03921985626220703 seconds'}
{'message': 'Data stored successfully. Time elapsed: 0.12122011184692383 seconds'}
{'message': 'Data stored successfully. Time elapsed: 0.20700693130493164 seconds'}


In [83]:
def get_trie_from_redis(redis_host='localhost', redis_port=6379, redis_db=0):
    r = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
    trie = Trie_dict()
    keys = r.keys()
    for key in keys:
        word = key.decode('utf-8')
        try:
            description = r.get(word).decode('utf-8')
        except UnicodeDecodeError as e:
            description = 'Could not decode it'
        trie.insert(word, description)
    return trie

In [41]:

def store_in_redis(trie_dict, r, redis_host='localhost', redis_port=6379, redis_db=0):
    for word in trie_dict.words():
        description = trie_dict.search(word)
        r.set(word, description)

def get_trie_from_redis(redis_host='localhost', redis_port=6379, redis_db=0):
    r = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
    trie = Trie_dict()
    keys = r.keys()
    for key in keys:
        word = key.decode('utf-8')
        try:
            description = r.get(word).decode('utf-8')
        except UnicodeDecodeError as e:
            description = 'Could not decode it'
        trie.insert(word, description)
    return trie
start_time_store = time.time()
store_in_redis(trie_dict, r)
end_time_store = time.time()
store_time = end_time_store - start_time_store

start_time_get = time.time()
trie_dict2 = get_trie_from_redis()
end_time_get = time.time()
get_time = end_time_get - start_time_get

print(f'Time it took to store {trie_dict.size()} in Redis: {store_time}')
print(f'Time it took to retrieve {trie_dict2.size()} in Redis: {get_time}')

Time it took to store 112272 in Redis: 12.711305141448975
Time it took to retrieve 112278 in Redis: 13.978270530700684


In [82]:
start_time_get = time.time()
trie_dict2 = get_trie_from_redis()
end_time_get = time.time()
get_time = end_time_get - start_time_get

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa5 in position 14: invalid start byte

In [3]:
def from_df_to_dict(df):
    first_column = df.iloc[:, 0].tolist()
    second_column = df.iloc[:, 1].tolist()
    #dict_object = my_dictionary()
    dict_object = defaultdict()
    for i in range(len(first_column)):
        dict_object[first_column[i]]=  second_column[i]
    
    return dict_object
    

In [None]:
class MLModel:
    def __init__(self):
        pass

    def tokenizerImport(self):
        with open('tokenizer.json') as f:
            data = json.load(f)
            tokenizer = tokenizer_from_json(data)
        return tokenizer
    
    def modelImport(self):
        model = load_model('model_general_1.h5')
        return model
        
    def build_definition(self, seed_text, tokenizer, next_words, model, max_sequence_len):
        res = []
        for _ in range(next_words):
            token_list = tokenizer.texts_to_sequences([seed_text])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
            predicted = model.predict(token_list)
            predicted = np.argmax(predicted, axis = 1)
            
            output_word = ""
            for word,index in tokenizer.word_index.items():
                if index == predicted:
                    output_word = word
                    break
            res.append(output_word)
            seed_text += " " + output_word
        return res

In [None]:
def navigate_trie(trie_dict):
    # ML_used = False
    # Prompt the user to enter a search term
    print('Enter the term for which you would like the definition')
    search = input()
    
    # Use the trie data structure to perform a fuzzy search on the search term
    search_result = list(trie_dict.fuzzy_search(search))
    list_length = len(search_result)
    
    # If the search term is not found in the trie, prompt the user to define it
    if list_length == 0:
        print('It is not in our list, please define it yourself')
        definition = input()
        #fine_tune(search, definition)
        print(f"Thanks, I've learned the definition of '{search}'.")
        
    # If the search term is found in the trie, present the user with a list of search results
    else:
        for i, item in enumerate(search_result):
            print(f"{i+1}. {item}")
        
        # Prompt the user to select a search result from the list
        print('If the desired item is in the list, type Y')
        choice = input()
        if choice == 'Y':
            print('Now type the number associated to the desired term')
            user_choice = int(input())
            
            # Ensure that the user's choice is a valid index in the list of search results
            1 <= user_choice <= len(search_result)
            selected_item = search_result[user_choice - 1]
            print(f"You selected: {selected_item}")
            
            # Look up the definition of the selected term in the trie data structure
            word_definition = trie_dict.search(selected_item)
            
            # If the term has no definition in the trie, prompt the user to define it
            if word_definition == None:
                print(f'No previous definition has been found, however {selected_item} is commonly referred to as:\n ')
                print(get_definition(selected_item))
                print('Now you can define it yourself')
                custom_definition = input()
                trie_dict.insert(selected_item, custom_definition)
                print('Thank you, I learned a new word!')
            
            # If the term has a definition in the trie, present the definition to the user and prompt for redefinition
            else:
                print(f'The definition for {selected_item} is : {word_definition}')
                print(f'Do you like it? Type "Y" if so, if not you"ll redefine it')
                redefinition_choice = input()
                
                # If the user chooses to redefine the term, prompt for a new definition and update the trie
                if redefinition_choice != 'Y':
                    print('Type it in:')
                    custom_definition = input()
                    trie_dict.insert(selected_item, custom_definition)
                    print('Thank you, I learned a new word!')
        else:
            print(f'Run ML model? (Type "Y" for yes and "N" for no)')
            runChoice = input()
            if runChoice == "Y":
                print(f'"{search}" is not in the list, running ML model to generate definition')
                ml_model = MLModel()
                tokenizer = ml_model.tokenizerImport()
                model = ml_model.modelImport()
                res = ml_model.build_definition(search, tokenizer, 5, model, 54)
                
                for i in range(len(res)):
                    print(f"{i+1}. {res[i]}")
                
                print(f'If you see a definition that you are satisfied with, select the corresponding number - else if you are not satisifed with any option, type "N": ')
                mlChoice = input()

                if mlChoice != 'N':
                    trie_dict.insert(search, res[int(mlChoice)])
                else:
                    print('Add your own definition: ')
                    definition_nbs = input()
                    trie_dict.insert(search, definition_nbs)
            else:
                print('Add your own definition: ')
                definition_nbs = input()
                trie_dict.insert(search, definition_nbs)
            

# BEGINNING OF DEMO

In [6]:
trie_demo = Trie_dict()
#connection = redis.Redis(host='localhost', port=6379, db=0)

### Vasu's SAP data is imported in the notebook, and fed into the model

In [7]:
csv_file = '../vasu_df.csv'
# Read data from CSV
df_vasu = pd.read_csv(csv_file)
#Transform it into a dictionary
dictionary_vasu = from_df_to_dict(df_vasu)
#Load it to the trie tree
for key in dictionary_vasu.keys():
    definition = str(dictionary_vasu[key])
    trie_demo.insert(str(key), definition)


In [8]:
dictionary_vasu

defaultdict(None,
            {'AACOM': 'Ordinary_730-4 MunSurchAdvPayToBeDeducted TotalAmount',
             'AACOMC': 'Ordinary_730-4 MunSurchAdvPayToBeDeducted SpouseAmount',
             'AACOMD': 'Ordinary_730-4 MunSurchAdvPayToBeDeducted DeclarantAmount',
             'AAC_TYPE': "'X' = statistical accounting object",
             'AAO_GAK': 'Work Incapacity Notification for Joint Admin. Office (GAK)?',
             'AATS': 'Values Registered: Select/Deselect',
             'AAW_WAO': "Benefits under 'AAW'/'WAO'",
             'ABAPFORM': 'ABAP/4 FORM routine name, called by external program',
             'ABAPLCNT': 'SAP Workload: Workload: integer without a leading sign',
             'ABAPLINE': 'ABAP Line Number',
             'ABAPLOCAT': 'ABAP location',
             'ABAPPROG': 'ABAP program, current main program',
             'ABAPREPORT': 'ABAP name',
             'ABAPSCNT': 'SAP Workload: Workload: integer without a leading sign',
             'ABAPTYPE': 'ABAP Type 

### Trie tree is loaded, time to search some elements

In [68]:
#Simple search of a term we KNOW is present
trie_demo.search('AAO_GAK')

'Work Incapacity Notification for Joint Admin. Office (GAK)?'

In [69]:
#Fuzzy search of a term that RESEMBLES a term present in the structure
trie_demo.fuzzy_search('AAO_GrK	')

{'AAO_GAK': ('Work Incapacity Notification for Joint Admin. Office (GAK)?',
  0.8)}

### The 'Navigate' function 

In [73]:
#Function where everything goes according to plan 
navigate_trie(trie_demo)

Enter the term for which you would like the definition
1. AAO_GQL
2. AAO_GAK
3. VALTO_AGL
4. CATCA_QAL
If the desired item is in the list, type Y
Now type the number associated to the desired term
You selected: VALTO_AGL
The definition for VALTO_AGL is : Date Up to Which the Contents of the Agreement Are Valid
Do you like it? Type "Y" if so, if not you"ll redefine it


In [74]:
#Function where the user does not like the definition provided by the structure
navigate_trie(trie_demo)

Enter the term for which you would like the definition
1. AAO_GQL
2. OSL_S
3. NO_SLOTS
4. DATAOBJ_CLS
5. BAS_SALS
6. AVG_SALS
7. AUTO_POSS
8. OS_SYS
If the desired item is in the list, type Y
Now type the number associated to the desired term
You selected: DATAOBJ_CLS
The definition for DATAOBJ_CLS is : Data object class
Do you like it? Type "Y" if so, if not you"ll redefine it
Type it in:
Thank you, I learned a new word!


In [75]:
trie_demo.search('DATAOBJ_CLS')

'This is a definition I like better'

In [76]:
#Scenario where the term we are looking for simply isn't in  the structure
navigate_trie(trie_demo)

Enter the term for which you would like the definition
1. WID
2. WPID
3. WGID
If the desired item is in the list, type Y
"WIDJDW" is not in the list, add it to the database!
Add a definition:


In [77]:
#And here it is, added to the structure
trie_demo.search('WIDJDW')

'I just made this up'

## The trie structure is then loaded into a Redis server

In [79]:
r = redis.StrictRedis(host='localhost', port=6379, db=0)
store_in_redis(trie_demo, r)

## And can also be taken out

In [84]:
trie_demo2 = get_trie_from_redis()
trie_demo2.search('WIDJDW')

'I just made this up'