In [1]:

# Toggle IPython's pretty printing of results; the recorded output below shows
# that this run switched it OFF. Because %pprint is a toggle, executing this
# cell a second time in the same kernel switches pretty printing back on.
%pprint

Pretty printing has been turned OFF


In [2]:

%matplotlib inline
from IPython.display import clear_output, display
from cohere.error import CohereAPIError
from langchain.chains import LLMChain
from langchain.llms import Cohere, OpenAI
from langchain.prompts import PromptTemplate
from openai.error import RateLimitError
from pandas import DataFrame
from ratelimit import limits, sleep_and_retry
from tqdm.notebook import tqdm
import humanize
import numpy as np
import os
import re
import sys
import time
import warnings
import winsound

# Silence every warning for the remainder of the notebook session
warnings.filterwarnings('ignore')

# Tone settings; presumably meant for a winsound beep (winsound is imported
# above, but no call using these values is visible in this part of the notebook)
duration = 1000  # milliseconds
freq = 880  # Hz

# Put the sibling ../py folder on the module search path so that modules kept
# there can be imported by later cells. The path is built with os.path because
# only `os` is imported above; the previous `osp` alias was never defined and
# raised a NameError on a fresh kernel.
# Insert at 1, 0 is the script path (or '' in REPL)
py_folder_path = os.path.join('..', 'py')
if py_folder_path not in sys.path:
    sys.path.insert(1, py_folder_path)

In [3]:

# Build the Storage helper, pointing it at the data and saves folders
from storage import Storage
data_folder_path = os.path.abspath('../data')
saves_folder_path = os.path.abspath('../saves')
s = Storage(data_folder_path=data_folder_path, saves_folder_path=saves_folder_path)

# Build the WebScrapingUtilities helper; its secrets_json mapping is read below
from scrape_utils import WebScrapingUtilities
secrets_json_path = os.path.abspath('../data/secrets/jh_secrets.json')
wsu = WebScrapingUtilities(s=s, secrets_json_path=secrets_json_path)

# Copy each API key out of the secrets mapping into the process environment.
# Where to obtain each key:
#   SERPAPI_API_KEY - https://serpapi.com/dashboard
#   COHERE_API_KEY  - https://dashboard.cohere.ai/api-keys
#   STABILITY_KEY   - https://beta.dreamstudio.ai/membership
for env_name, secret_name in (
    ('SERPAPI_API_KEY', 'SERPAPI_API_KEY'),
    ('COHERE_API_KEY', 'Cohere_API_Key'),
    ('STABILITY_KEY', 'Dream_Studio_API_Key'),
):
    os.environ[env_name] = wsu.secrets_json[secret_name]

In [4]:

# Build the HeaderAnalysis helper on top of the Storage object
from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Read the Neo4j connection settings from the secrets mapping and build the
# CypherUtilities helper (no pre-existing driver is handed in)
from cypher_utils import CypherUtilities
uri, user, password = (
    wsu.secrets_json['neo4j'][setting_name]
    for setting_name in ('connect_url', 'username', 'password')
)
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

# Build the logistic-regression section classifier helper
from section_classifier_utils import SectionLRClassifierUtilities
slrcu = SectionLRClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Build the parts-of-speech logistic regression elements unless the helper
# already carries them, then report availability and the elapsed time
t1 = time.time()
if not hasattr(slrcu, 'pos_predict_percent_fit_dict'):
    slrcu.build_pos_logistic_regression_elements(
        sampling_strategy_limit=None,
        verbose=True
    )
print(
    'predict_single is now available'
    if hasattr(slrcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech logistic regression elements built in {duration_str}')

I have 49,102 labeled parts of speech in here
predict_single is now available
Parts-of-speech logistic regression elements built in 9 seconds


In [5]:

# Build the conditional-random-field section classifier helper
from section_classifier_utils import SectionCRFClassifierUtilities
scrfcu = SectionCRFClassifierUtilities(cu=cu, ha=ha, verbose=False)

# Build the parts-of-speech conditional random field elements unless the helper
# already carries them, then report availability and the elapsed time.
# NOTE(review): the build guard tests 'pos_symbol_crf' while the availability
# check tests 'pos_predict_percent_fit_dict' -- confirm both are intended.
t1 = time.time()
crf_is_built = hasattr(scrfcu, 'pos_symbol_crf')
if not crf_is_built:
    scrfcu.build_pos_conditional_random_field_elements(verbose=True)
print(
    'predict_single is now available'
    if hasattr(scrfcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech conditional random field elements built in {duration_str}')

predict_single is now available
Parts-of-speech conditional random field elements built in 1 second


In [6]:

# Build the stochastic-gradient-descent section classifier helper
from section_classifier_utils import SectionSGDClassifierUtilities
ssgdcu = SectionSGDClassifierUtilities(ha=ha, cu=cu, verbose=False)

# Build the parts-of-speech stochastic gradient descent elements unless the
# helper already carries them, then report availability and the elapsed time
t1 = time.time()
if not hasattr(ssgdcu, 'pos_predict_percent_fit_dict'):
    ssgdcu.build_pos_stochastic_gradient_descent_elements(
        sampling_strategy_limit=None,
        verbose=True
    )
print(
    'predict_single is now available'
    if hasattr(ssgdcu, 'pos_predict_percent_fit_dict')
    else 'predict_single is not available'
)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'Parts-of-speech stochastic gradient descent elements built in {duration_str}')

I have 49,102 labeled parts of speech in here
predict_single is now available
Parts-of-speech stochastic gradient descent elements built in 9 seconds


In [7]:

# List every (property key, first node label) pair for which some node stores
# the string "False" in that property.
# NOTE(review): only "False" is searched for; a property that holds nothing
# but "True" strings would not be listed -- confirm that is intended.
tf_cypher_str = """
    // Find all properties with a value of "False"
    CALL db.propertyKeys() YIELD propertyKey AS prop_name
    MATCH (n)
    WHERE n[prop_name] = "False"
    WITH DISTINCT prop_name, labels(n)[0] AS node_name
    RETURN prop_name, node_name;"""

# Run the query through the shared transaction function and tabulate the rows
with cu.driver.session() as session:
    tf_rows = session.write_transaction(cu.do_cypher_tx, tf_cypher_str)
    tf_df = DataFrame(tf_rows)

# Preview the first twenty pairs
display(tf_df.head(20))

In [8]:

def quote_cypher_identifier(name):
    """Return name wrapped in backticks so Cypher treats it as one identifier.

    Labels and property keys cannot be passed as query parameters, so they
    have to be written into the statement text. Backtick-quoting keeps names
    that contain spaces or other special characters from breaking the
    statement; a backtick inside the name is escaped by doubling it.
    """
    return '`' + str(name).replace('`', '``') + '`'


# Convert the string values "True"/"False" into real booleans, one
# (label, property) pair at a time, using the pairs collected in tf_df
for row_index, row_series in tf_df.iterrows():
    prop_name = row_series.prop_name
    node_name = row_series.node_name

    # labels(n)[0] is null for a node without labels, which arrives here as a
    # non-string value; interpolating it would produce MATCH (n:None) and
    # silently match nothing, so say so instead
    if not isinstance(node_name, str):
        print(f'Skipping property {prop_name}: it was found on a node without a label')
        continue

    label_token = quote_cypher_identifier(node_name)
    prop_token = quote_cypher_identifier(prop_name)

    def do_cypher_tx(tx):
        """Rewrite the string booleans of one property on one label."""
        cypher_str = f"""
            // Replace "True" with true and "False" with false
            MATCH (n:{label_token})
            SET n.{prop_token} = CASE n.{prop_token}
              WHEN "False" THEN false
              WHEN "True" THEN true
              ELSE n.{prop_token}
            END;"""
        tx.run(query=cypher_str, parameters={})

    with cu.driver.session() as session:
        session.write_transaction(do_cypher_tx)

In [11]:

# Report every pickled DataFrame column that still stores the strings 'True'
# or 'False'. Nothing is modified here: each hit is printed as
# "<pickle name> <column name>" so the pickle can be repaired separately.
string_booleans = ('True', 'False')

# dtype kinds that can never hold a Python string (bool, signed/unsigned int,
# float, complex, timedelta, datetime); such columns are skipped unscanned
non_text_kinds = set('biufcmM')

for file_name in os.listdir(s.saves_pickle_folder):
    if not file_name.endswith('.pkl'):
        continue
    pickle_name = file_name[:-4]
    if not pickle_name.endswith('_df'):
        continue
    if not s.pickle_exists(pickle_name):
        continue

    # Only the load itself is guarded, so that a problem found while scanning
    # is never misreported as a load failure
    try:
        df = s.load_object(pickle_name)
    except Exception as e:
        print(f'{e.__class__.__name__} error trying to load {pickle_name}: {str(e).strip()}')
        continue
    if not isinstance(df, DataFrame):
        print(f'{pickle_name} did not load as a DataFrame; skipping it')
        continue

    # items() yields one Series per column even when column names repeat
    for cn, column in df.items():
        if getattr(column.dtype, 'kind', 'O') in non_text_kinds:
            continue

        # Checking the type first means unhashable cells (lists, dicts) are
        # never hashed or compared, and any() stops at the first hit
        if any(isinstance(value, str) and (value in string_booleans) for value in column):
            print(pickle_name, cn)

<class 'TypeError'> error trying to load davez_hunting_df: Date.__new__() missing 3 required positional arguments: 'year', 'month', and 'day'
<class 'TypeError'> error trying to load hunting_df: Date.__new__() missing 3 required positional arguments: 'year', 'month', and 'day'
