In [1]:

# Toggle IPython's pretty printing of outputs (the recorded run switched it OFF)
%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

import sys

# Make the modules in ../py importable. Position 0 is the script path
# (or '' in a REPL), so the folder is inserted at position 1 instead.
if '../py' not in sys.path:
    sys.path.insert(1, '../py')

In [3]:

# Storage object that is handed to the utility classes below
from storage import Storage
s = Storage()

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# The Neo4j connection settings are read from the scraper's secrets JSON
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s)
uri, user, password = (
    wsu.secrets_json['neo4j'][key]
    for key in ('connect_url', 'username', 'password')
)

# Build the Cypher utilities from those settings (no pre-built driver is passed in)
from cypher_utils import CypherUtilities
cu = CypherUtilities(
    uri=uri, user=user, password=password, driver=None, s=s, ha=ha
)

from is_header_sgd_classifier import IsHeaderSgdClassifier
ihu = IsHeaderSgdClassifier(ha=ha, cu=cu, verbose=False)

In [4]:

from neo4j.exceptions import ServiceUnavailable

try:
    # Check the database connection first; everything built below is handed cu
    # (SectionUtilities excepted)
    version_str = cu.driver.verify_connectivity()
    print(f'======== {version_str} ========')

    from hc_utils import HeaderCategories
    hc = HeaderCategories(cu=cu, verbose=False)

    from section_utils import SectionUtilities
    su = SectionUtilities(verbose=False)

    from lr_utils import LrUtilities
    lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)

    from crf_utils import CrfUtilities
    crfu = CrfUtilities(ha=ha, hc=hc, cu=cu, verbose=False)

    # Silence all warnings for the rest of the session
    import warnings
    warnings.filterwarnings('ignore')
except ServiceUnavailable as e:
    # Re-raise with an actionable message, chaining the driver's original
    # error so its details remain visible in the traceback
    raise ServiceUnavailable('You need to start Neo4j as a console') from e
except Exception as e:
    # Any other failure is only reported, not re-raised; names assigned after
    # the failing statement (hc, su, lru, crfu) will not exist in that case
    print(f'{e.__class__}: {str(e).strip()}')



In [8]:

%matplotlib inline
import time
import humanize
import matplotlib.pyplot as plt
from datetime import datetime
import winsound

# Tone settings used for the winsound.Beep(freq, duration) completion cue
freq = 880  # Hz
duration = 1_000  # milliseconds

# Figure size and histogram bin settings
# NOTE(review): presumably consumed by plotting cells not shown here; confirm
height_inches = 3.0
width_inches = 18.0
bin_count = 12

print(f'Last run on {datetime.now()}')

Last run on 2023-01-06 15:10:06.137222



---
# Training

In [9]:

# You need to run this again if you changed the qualification dictionary in another notebook
# Start the timer under the name t0, which the elapsed-time calculation below reads
t0 = time.time()

# Keep the total retraining time to less than two minutes by adjusting the sampling strategy limit
lru.sync_basic_quals_dict(sampling_strategy_limit=None, verbose=False)

# NOTE(review): new_data_list and new_labels_list are not defined in any cell
# shown here; confirm they are created before this cell runs
ihu.retrain_classifier(new_data_list, new_labels_list, verbose=True)

# Report the elapsed wall-clock time and beep when finished
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
winsound.Beep(freq, duration)
print(f'Is-header classifier retrained in {duration_str}')

Is-header classifier retrained in 16 seconds


In [11]:

# Rebuild the parts-of-speech logistic regression elements and time the run
# NOTE(review): slrcu is not created in any cell shown here; confirm it is
# defined before this cell runs
t0 = time.time()
slrcu.build_pos_logistic_regression_elements(verbose=False)

# Report the elapsed wall-clock time and beep when finished
duration_str = humanize.precisedelta(
    time.time() - t0,
    minimum_unit='seconds',
    format='%0.0f',
)
winsound.Beep(freq, duration)
print(f'Parts-of-speech classifier rebuilt in {duration_str}')

Parts-of-speech classifier retrained in 10 minutes and 3 seconds


In [12]:

# Rebuild the classifier from the quals dictionary, then retrain it
t0 = time.time()
lru.build_isqualified_logistic_regression_elements(verbose=False)
lru.retrain_isqualified_classifier(verbose=True)

# Report the elapsed wall-clock time and beep when finished
duration_str = humanize.precisedelta(
    time.time() - t0,
    minimum_unit='seconds',
    format='%0.0f',
)
winsound.Beep(freq, duration)
print(f'Is-qualified classifer retrained in {duration_str}')

I have 7,124 hand-labeled qualification strings in here
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_df.pkl
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\ISQUALIFIED_VOCAB.pkl
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\ISQUALIFIED_TT.pkl
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\ISQUALIFIED_LR.pkl
Retraining complete
Is-qualified classifer retrained in 17 seconds


In [None]:

# Get the training data: one feature sequence (X) and one label sequence (y)
# per entry of the saved header pattern dictionary
t0 = time.time()
HEADER_PATTERN_DICT = s.load_object('HEADER_PATTERN_DICT')
X_train, y_train = [], []
for file_name, feature_dict_list in HEADER_PATTERN_DICT.items():

    # Only the logistic-regression predictor is supplied; the CRF and SGD
    # predictors are explicitly left out
    feature_tuple_list = [
        hc.get_feature_tuple(
            feature_dict,
            pos_lr_predict_single=slrcu.predict_single,
            pos_crf_predict_single=None,
            pos_sgd_predict_single=None,
        )
        for feature_dict in feature_dict_list
    ]

    # The third element of each feature tuple is used as the training label
    pos_list = [feature_tuple[2] for feature_tuple in feature_tuple_list]
    y_train.append(pos_list)
    X_train.append(crfu.sent2features(feature_tuple_list))

# Report the elapsed wall-clock time and beep when finished
duration_str = humanize.precisedelta(
    time.time() - t0,
    minimum_unit='seconds',
    format='%0.0f',
)
winsound.Beep(freq, duration)
print(f'Training data created in {duration_str}')

In [20]:

import sklearn_crfsuite
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics
from sklearn.model_selection import RandomizedSearchCV

# Only the fixed CRF settings are given here; c1 and c2 are left unset so the
# randomized search in a later cell can sample them
CRF = sklearn_crfsuite.CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)

In [21]:

# Fit on the full training set, then display the label classes the model saw
CRF.fit(X_train, y_train)
labels = [*CRF.classes_]
labels

['O-O', 'H-O', 'O-JT', 'O-OL', 'H-JD', 'H-TS', 'O-TS', 'H-RQ', 'H-PQ', 'H-CS', 'O-CS', 'O-RQ', 'O-LN', 'O-PQ', 'H-SP', 'O-SP', 'H-PD', 'H-OL', 'H-LN', 'O-JD', 'H-ER', 'H-IP', 'O-ER', 'H-JT', 'O-IP']

In [25]:

# Exponential distributions from which the search samples c1 and c2
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# Use the same metric for evaluation: weighted flat F1 over the fitted labels
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# Search: 50 sampled settings, 3-fold cross-validation, all available cores.
# A fixed random_state makes the sampled (c1, c2) values repeatable across runs.
# NOTE(review): the saved output of this cell is
# "AttributeError: 'CRF' object has no attribute 'keep_tempfiles'"; this looks
# like an sklearn-crfsuite / scikit-learn version mismatch, so confirm the
# installed versions before relying on this search
rs = RandomizedSearchCV(
    CRF, params_space, cv=3, verbose=1, n_jobs=-1, n_iter=50, scoring=f1_scorer,
    random_state=42
)
rs.fit(X_train, y_train)

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'