In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

from neo4j.exceptions import ServiceUnavailable
from nltk import pos_tag
from pandas import DataFrame
import requests
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
import humanize
import nltk
import os
import re
import sys
import time
import warnings

warnings.filterwarnings('ignore')
duration = 1000  # milliseconds
freq = 880  # Hz

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

def display_url(url_suffix):
    url = f'http://localhost:5000{url_suffix}'
    response = requests.get(url)
    
    return(response.text)

In [None]:

from IPython.display import HTML, display

display(HTML(display_url('/train_pos_classifier')))

In [29]:

t0 = time.time()

# Get the Neo4j driver
from storage import Storage
s = Storage(
    data_folder_path=os.path.abspath('../data'),
    saves_folder_path=os.path.abspath('../saves')
)

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(
    s=s,
    secrets_json_path=os.path.abspath('../data/secrets/jh_secrets.json')
)
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the neo4j object
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

try:
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
except ServiceUnavailable as e:
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    print(f'{e.__class__}: {str(e).strip()}')

from hc_utils import HeaderCategories
hc = HeaderCategories(cu=cu, verbose=False)

from lr_utils import LrUtilities
lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

from crf_utils import CrfUtilities
crf = CrfUtilities(ha=ha, hc=hc, cu=cu, lru=lru, verbose=True)

duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'Utility libraries created in {duration_str}')

Utility libraries created in 6 seconds



---
# Data Exploration
This stage involves loading and examining the dataset to gain a better understanding of its structure, content, and quality. Data exploration may involve tasks such as data visualization, summary statistics, and data cleaning.

In [30]:

cypher_str = '''
    // Filter for NavigableParents nodes with an unambiguous SUMMARIZES relationship
    MATCH (np:NavigableParents)
    WHERE size((np)<-[:SUMMARIZES]-(:PartsOfSpeech)) = 1
    WITH np
    
    // Find all NavigableParents nodes in the graph with an incoming SUMMARIZES relationship to a PartsOfSpeech node
    MATCH (np)<-[r:SUMMARIZES]-(pos:PartsOfSpeech)
    
    // Return the navigable parent and important properties
    RETURN
        pos.pos_symbol AS pos_symbol,        
        np.navigable_parent AS navigable_parent
    LIMIT 1'''
row_objs_list = []
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)
if row_objs_list:
    pos_html_strs_df = DataFrame(row_objs_list)
    
    # (46327, 30)
    print(pos_html_strs_df.shape)

(1, 2)


In [18]:

pos_html_strs_df.T

Unnamed: 0,0
pos_symbol,H-TS
navigable_parent,What Youll be Doing


In [25]:

def get_pos_crf_predict_single_from_api(child_str):
    data_dict = {'navigable_parent': child_str}
    response = requests.post('http://localhost:5000/pos_crf_predict_single', json=data_dict)
    response_dict = response.json()
    
    return response_dict['y_pred']
t0 = time.time()
y_pred = get_pos_crf_predict_single_from_api(pos_html_strs_df.navigable_parent.squeeze())
print(f"{{'y_pred': '{y_pred}'}}")
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'CRF Parts-of-Speech Single Prediction inferred in {duration_str}')

{'y_pred': 'H-RQ'}
CRF Parts-of-Speech Single Prediction inferred in 2 seconds


In [26]:

def get_pos_lr_predict_single_from_api(child_str):
    data_dict = {'navigable_parent': child_str}
    response = requests.post('http://localhost:5000/pos_lr_predict_single', json=data_dict)
    response_dict = response.json()
    
    return response_dict['y_pred']
t0 = time.time()
y_pred = get_pos_lr_predict_single_from_api(pos_html_strs_df.navigable_parent.squeeze())
print(f"{{'y_pred': '{y_pred}'}}")
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'LR Parts-of-Speech Single Prediction inferred in {duration_str}')

{'y_pred': 'H-TS'}
LR Parts-of-Speech Single Prediction inferred in 2 seconds


In [23]:

t0 = time.time()
data_dict = {'navigable_parent': pos_html_strs_df.navigable_parent.squeeze()}
response = requests.post('http://localhost:5000/pos_lr_predict_single', json=data_dict)
print(response.json())
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
print(f'LR Parts-of-Speech Single Prediction inferred in {duration_str}')

{'y_pred': 'H-TS'}
LR Parts-of-Speech Single Prediction inferred in 2 seconds


In [39]:

display_url('/new_endpoint/<string_parameter>')

'{\n  "message": "<string_parameter>"\n}\n'

In [40]:

display_url('/lru/sync_basic_quals_dict')

'{\n  "message": "This works"\n}\n'

In [41]:

display_url('/lru/retrain_isqualified_classifier')

'{\n  "message": "This works"\n}\n'

In [42]:

from IPython.display import HTML

results_dict = eval(display_url('/lru/infer_from_hunting_dataframe'))

In [36]:

results_dict['percent_fits_list'][0]['quals_list']

['If you are motivated by solving complex, multivariate problems, enjoy the small-company lifestyle, and are comfortable with the rewards and challenges of working with a diverse array of business, technical, and analytical problems, then please read on',
 '<li>Experience using SQL to interface with database</li>',
 '<p>OB Media is seeking a quantitative-minded professional to work in our online advertising business',
 '<li>Some basic understanding of internet technology</li>',
 '<li>A degree in a STEM discipline, or a good number of related classes</li>',
 '<li>Preference will be given to candidates with strong coding skills (Python, R, Spark, Stata, etc)</li>']

In [28]:

display_url('/lru/display_hunting_dataframe_as_histogram')

'{\n  "message": "This works"\n}\n'

In [None]:

import json
from sklearn_crfsuite.estimator import CRF

# Convert the object to a JSON string
crf_json = json.dumps(crf.CRF.__getstate__())

# Store the JSON string as a property on a node in Neo4j
with cu.driver.session() as session:
    session.run('MERGE (:ModelStore {crf: $crf})', crf=crf_json)

# Retrieve the JSON string from the Neo4j property
with cu.driver.session() as session:
    result = session.run('MATCH (ms:ModelStore) RETURN ms.crf AS crf_json')
    crf_json = result.single()['crf_json']

# Deserialize the JSON string into a dictionary
crf_state = json.loads(crf_json)

# Create a new CRF object using the dictionary state
new_crf = CRF()
new_crf.__setstate__(crf_state)

# Verify the new CRF object
print(type(crf.CRF))
print(type(new_crf))

In [60]:

# Create a CRF object and train it
crf_test = CRF()
X = [['word1', 'word2'], ['word3']]
y = [['tag1', 'tag2'], ['tag3']]
crf_test.fit(X, y)

# Serialize the CRF object state to a JSON-compatible dictionary
crf_state = crf_test.__getstate__()

In [69]:

crf_state_serializable = {}
for key, value in crf_state.items():
    if isinstance(value, CRF) or hasattr(value, '__call__'):
        print(key, vaue)
        break
    try:
        crf_state_serializable[key] = json.dumps(value)
    except TypeError as e:
        print(f'{e.__class__} error: {str(e).strip()}')
        print(key, value)
        print(type(key), type(value))
        continue

<class 'TypeError'> error: Object of type FileResource is not JSON serializable
modelfile <sklearn_crfsuite._fileresource.FileResource object at 0x000001D818CAA0E0>
<class 'str'> <class 'sklearn_crfsuite._fileresource.FileResource'>
<class 'TypeError'> error: Object of type TrainLogParser is not JSON serializable
training_log_ <pycrfsuite._logparser.TrainLogParser object at 0x000001D8254AC310>
<class 'str'> <class 'pycrfsuite._logparser.TrainLogParser'>


In [70]:

crf_state_serializable

{'algorithm': 'null', 'min_freq': 'null', 'all_possible_states': 'null', 'all_possible_transitions': 'null', 'c1': 'null', 'c2': 'null', 'max_iterations': 'null', 'num_memories': 'null', 'epsilon': 'null', 'period': 'null', 'delta': 'null', 'linesearch': 'null', 'max_linesearch': 'null', 'calibration_eta': 'null', 'calibration_rate': 'null', 'calibration_samples': 'null', 'calibration_candidates': 'null', 'calibration_max_trials': 'null', 'pa_type': 'null', 'c': 'null', 'error_sensitive': 'null', 'averaging': 'null', 'variance': 'null', 'gamma': 'null', 'verbose': 'false', 'trainer_cls': 'null', '_tagger': 'null', '_info_cached': 'null'}

In [68]:

value.name

'C:\\Users\\daveb\\AppData\\Local\\Temp\\model2xu_lzao.crfsuite'

In [None]:

# Serialize the dictionary to a JSON string
crf_json = json.dumps(crf_state_serializable)

# Save the JSON string to Neo4j
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
with driver.session() as session:
    session.run("CREATE (n:MyNode {crf: $crf_json})", crf_json=crf_json)