In [1]:

# %pprint toggles IPython's pretty printing; the recorded output of this cell
# shows it ended up OFF. Because it is a toggle, re-running this cell without
# restarting the kernel flips the setting back again.
%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

import sys

# Put ../py on the module search path (presumably where the local modules
# imported by later cells live -- confirm against the repo layout).
# Insert at 1, 0 is the script path (or '' in REPL).
# The membership check keeps this cell idempotent: re-running it no longer
# stacks duplicate '../py' entries onto sys.path.
if '../py' not in sys.path:
    sys.path.insert(1, '../py')

In [3]:

%matplotlib inline
from datetime import datetime
import humanize
import matplotlib.pyplot as plt
import time
# winsound is part of the standard library on Windows only. Fall back to None
# elsewhere so this cell still runs on Linux/macOS; any code that wants to
# beep must check `winsound is not None` before calling winsound.Beep.
try:
    import winsound
except ImportError:
    winsound = None

# Tone settings: these are the arguments of the (currently commented-out)
# winsound.Beep(freq, duration) call in the setup cell
freq, duration = 880, 1000  # Hz, milliseconds

# Presumably histogram / figure-size settings for plotting cells that are not
# part of this section -- TODO confirm where they are consumed
bin_count = 12
width_inches, height_inches = 18.0, 3.0

In [4]:

# Build the helper objects (s, ha, wsu, cu) that the rest of the notebook uses
# and report how long that took. Failures are reported, not raised, so the
# cell always finishes -- but the status line now says whether setup worked.
t0 = time.time()
setup_succeeded = False
try:

    # Storage instance that is handed to the other utility classes as `s`
    from storage import Storage
    s = Storage()

    from ha_utils import HeaderAnalysis
    ha = HeaderAnalysis(s=s, verbose=False)

    # The Neo4j connection settings come from the scraper's secrets JSON
    # rather than being hard-coded in the notebook
    from scrape_utils import WebScrapingUtilities
    wsu = WebScrapingUtilities(s=s)
    neo4j_secrets = wsu.secrets_json['neo4j']
    uri = neo4j_secrets['connect_url']
    user = neo4j_secrets['username']
    password = neo4j_secrets['password']

    # Cypher helper used by later cells to run queries. driver=None is passed
    # along with the credentials; presumably the class then creates its own
    # driver -- confirm in cypher_utils
    from cypher_utils import CypherUtilities
    cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

    # Silence all warnings for the rest of the kernel session
    import warnings
    warnings.filterwarnings('ignore')

    setup_succeeded = True
except Exception as e:
    # Best effort: show the error class and message and keep the cell running
    print(f'{e.__class__}: {str(e).strip()}')
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
# winsound.Beep(freq, duration)

# Only claim success when every step in the try block completed; previously
# the success message was printed even after an exception had been swallowed
if setup_succeeded:
    print(f'Utility libraries created in {duration_str}')
else:
    print(f'Utility libraries NOT created: setup failed after {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 1 second
Last run on 2023-02-15 12:35:56.192218


In [11]:


# Fetch every labeled (navigable parent, POS symbol) pair through the Cypher
# helper, then show the total row count and the per-symbol breakdown
import pandas as pd

cypher_str = """
    MATCH (pos:PartsOfSpeech)-[r:SUMMARIZES]->(np:NavigableParents)
    RETURN
        np.navigable_parent AS navigable_parent, 
        pos.pos_symbol AS pos_symbol;"""
query_rows = cu.get_execution_results(cypher_str, verbose=False)
pos_df = pd.DataFrame(query_rows)

# Count once and reuse the same Series for both the total and the table
pos_symbol_counts = pos_df.value_counts('pos_symbol')
print(pos_symbol_counts.sum())
display(pos_symbol_counts)

10635


pos_symbol
O-RQ    3861
O-PQ    1625
O-TS    1190
H-RQ    1065
H-TS     408
O-SP     389
H-PQ     347
H-CS     285
O-CS     263
O-IP     245
H-SP     153
H-O      106
O-OL      96
H-OL      84
O-LN      80
H-IP      77
H-LN      61
H-JT      61
O-O       60
O-JT      43
O-ER      39
O-JD      37
H-JD      28
H-ER      17
O-PD       9
H-PD       6
dtype: int64

In [20]:

from imblearn.under_sampling import RandomUnderSampler

sampling_max = 50
sampling_seed = 0  # fixed so the same rows are drawn on every run

# Rebalance the data with a limiting sampling strategy: every POS symbol is
# capped at sampling_max rows, and symbols that already have fewer rows keep
# their full count
counts_dict = pos_df.groupby('pos_symbol').count().navigable_parent.to_dict()
sampling_strategy = {k: min(sampling_max, v) for k, v in counts_dict.items()}
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=sampling_seed)

# Define the tuple of arrays. X is reshaped to 2-D (one feature column) while
# y is passed 1-D, the shape expected for a target; the former (n, 1) column
# vector only worked via an implicit conversion
data = rus.fit_resample(
    pos_df.navigable_parent.values.reshape(-1, 1), pos_df.pos_symbol.values
)
X_resampled, y_resampled = data

# Recreate the Pandas DataFrame
# NOTE: this overwrites pos_df, so the unbalanced frame is gone until the
# query cell that builds it is run again
pos_df = pd.DataFrame(X_resampled, columns=['navigable_parent'])
pos_df['pos_symbol'] = y_resampled

# Count once and reuse the Series for both the total and the breakdown
pos_symbol_counts = pos_df.value_counts('pos_symbol')
print(sampling_max, pos_symbol_counts.sum())
display(pos_symbol_counts)

50 1129


pos_symbol
H-CS    50
H-SP    50
O-SP    50
O-RQ    50
O-PQ    50
O-OL    50
O-O     50
O-LN    50
O-IP    50
H-TS    50
O-CS    50
H-RQ    50
H-PQ    50
H-OL    50
H-O     50
H-LN    50
H-JT    50
H-IP    50
O-TS    50
O-JT    43
O-ER    39
O-JD    37
H-JD    28
H-ER    17
O-PD     9
H-PD     6
dtype: int64

7154