In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

import sys

# Make the project's ../py folder importable. Position 0 is left alone because
# it holds the script path (or '' in a REPL), so the folder goes in at position 1
py_folder = '../py'
if py_folder not in sys.path:
    sys.path.insert(1, py_folder)

In [3]:

%matplotlib inline
from datetime import datetime
from neo4j.exceptions import ServiceUnavailable
import humanize
import matplotlib.pyplot as plt
import time
import winsound

# Tone passed to winsound.Beep when a long-running cell finishes
freq, duration = 880, 1000  # Hz, milliseconds

# Plot sizing defaults (not referenced by the cells visible in this notebook)
bin_count = 12
width_inches, height_inches = 18.0, 3.0

In [4]:

# Build the project helper objects this notebook relies on (s, ha, wsu, cu, hc, lru)
# and report how long that took
t0 = time.time()

# Create the storage helper that the other utilities receive as s
from storage import Storage
s = Storage()

from ha_utils import HeaderAnalysis
ha = HeaderAnalysis(s=s, verbose=False)

# Read the Neo4j connection settings from the secrets loaded by the scraping utilities
from scrape_utils import WebScrapingUtilities
wsu = WebScrapingUtilities(s=s)
uri = wsu.secrets_json['neo4j']['connect_url']
user =  wsu.secrets_json['neo4j']['username']
password = wsu.secrets_json['neo4j']['password']

# Get the neo4j object
from cypher_utils import CypherUtilities
cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)

try:
    
    # Asking for the server info doubles as a connectivity check
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
    
    from hc_utils import HeaderCategories
    hc = HeaderCategories(cu=cu, verbose=False)
    
    # Vary the sampling strategy limit so that the overall creation time is less than two minutes
    from lr_utils import LrUtilities
    lru = LrUtilities(ha=ha, cu=cu, hc=hc, verbose=False)
    lru.build_isqualified_logistic_regression_elements(sampling_strategy_limit=10_000, verbose=False)
    
    # Silence every warning for the rest of the session
    import warnings
    warnings.filterwarnings('ignore')
except ServiceUnavailable as e:
    # The database is unreachable: tell the user and re-raise so the notebook stops here
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    # Any other failure is only reported; execution continues, so hc and lru
    # may be left undefined after this cell
    print(f'{e.__class__}: {str(e).strip()}')
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')

# Audible completion signal (winsound is a Windows-only module)
winsound.Beep(freq, duration)
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\job-hunting\saves\pkl\basic_quals_dict.pkl
Utility libraries created in 1 minute and 32 seconds
Last run on 2023-03-02 14:27:14.021955



---
# Load needed libraries and functions


---
# Training

In [5]:

# Rebuild the dataframe from the database
import pandas as pd

def do_cypher_tx(tx, verbose=False):
    """Fetch every QualificationStrings node as a list of plain dictionaries.

    Args:
        tx: The Neo4j transaction handed in by the session's managed-transaction runner.
        verbose: Accepted so existing call sites keep working; this query does not use it.

    Returns:
        list[dict]: One dictionary per record with the keys 'qualification_str'
        and 'is_qualified'.
    """
    cypher_str = '''
        MATCH (qs:QualificationStrings)
        RETURN
            qs.qualification_str AS qualification_str,
            qs.is_qualified AS is_qualified;
        '''

    # Materialize the records while the transaction is still open
    return [dict(record.items()) for record in tx.run(query=cypher_str)]

# The query only reads, so it runs as a managed read transaction instead of a write one
with cu.driver.session() as session:
    row_objs_list = session.execute_read(do_cypher_tx, verbose=False)

# Naming the columns keeps both of them present even when the query returns no rows
quals_df = pd.DataFrame(row_objs_list, columns=['qualification_str', 'is_qualified'])
quals_df.is_qualified = quals_df.is_qualified.map(bool)

In [6]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

# Re-transform the bag-of-words and tf-idf from the new manual scores
sents_list = quals_df.qualification_str.tolist()
assert len(sents_list)

# Bag-of-words; the settings are kept in one dictionary so that the inference
# vectorizer built further down tokenizes exactly the same way
cv_params = {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'lowercase': False, 'max_df': 1.0,
             'max_features': None, 'min_df': 0.0, 'ngram_range': (1, 5), 'stop_words': None, 'strip_accents': 'ascii',
             'tokenizer': ha.html_regex_tokenizer}
cv = CountVectorizer(**cv_params)
bow_matrix = cv.fit_transform(sents_list)

# Tf-idf, must get from BOW first
tt = TfidfTransformer(**{'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': False, 'use_idf': True})
tfidf_matrix = tt.fit_transform(bow_matrix)

# Re-train the classifier
X = tfidf_matrix
y = quals_df.is_qualified.to_numpy()
# Best score: 0.850
# (multi_class is left at its default of 'auto'; newer scikit-learn releases deprecate passing it)
basic_quals_clf = LogisticRegression(C=10.0, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                                    max_iter=1000, n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
                                    tol=0.0001, verbose=0, warm_start=False)
basic_quals_clf.fit(X, y)

# Re-calibrate the inference engine. A vectorizer built from the vocabulary alone
# would fall back to the default tokenizer, lowercasing and unigrams, and so could
# never reproduce the case-sensitive 1- to 5-gram entries the classifier was trained on
bq_cv_vocab = cv.vocabulary_
bq_cv = CountVectorizer(vocabulary=bq_cv_vocab, **cv_params)
bq_cv._validate_vocabulary()

# Expose the fitted tf-idf transformer under the name the scoring cells use
bq_tt = tt
CLF_NAME = 'LogisticRegression'
print('Retraining complete')

Retraining complete



---
# Rescore the quals dataframe


## Add the Standard Models to the Fit Estimators and Training Durations List

In [7]:

import numpy as np

# Fraction of unqualified (False) and qualified (True) labels, in that order
counts_dict = quals_df.groupby('is_qualified').count().qualification_str.to_dict()
false_count, true_count = counts_dict[False], counts_dict[True]
total_count = false_count + true_count
np.array([false_count/total_count, true_count/total_count])

array([0.44760292, 0.55239708])

In [None]:

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC

# Get the models. Arguments that only restated a default and that newer scikit-learn
# releases deprecate or reject (AdaBoost algorithm='SAMME.R', Bagging base_estimator=None,
# GradientBoosting loss='deviance', LogisticRegression multi_class='auto') are omitted,
# so the same defaults apply on older and newer releases alike
t0 = time.time()
estimators_list = [
    # done in 3.035s
    # Best score: 0.734
    AdaBoostClassifier(learning_rate=1.0, n_estimators=5, random_state=None),

    # done in 332.301s
    # Best score: 0.814
    BaggingClassifier(bootstrap=True, bootstrap_features=False, max_features=0.5, max_samples=0.75,
                      n_estimators=10, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False),

    # done in 221.215s
    # Best score: 0.683
    ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=100,
                         max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0,
                         min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False),

    # done in 135.438s
    # Best score: 0.698
    GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init='zero', learning_rate=1.0,
                               max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
                               min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=1000,
                               n_iter_no_change=None, random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1,
                               verbose=0, warm_start=False),

    # done in 289.938s
    # Best score: 0.779
    RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='entropy', max_depth=None,
                           max_features=None, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0,
                           min_samples_leaf=2, min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=1000, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False),

    # done in 13.410s
    # Best score: 0.850
    LogisticRegression(C=10.0, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                       max_iter=1000, n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
                       tol=0.0001, verbose=0, warm_start=False),

    # done in 6.014s
    # Best score: 0.764
    SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3,
        gamma='scale', kernel='rbf', max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
        verbose=False),
]

# Fit the data and add the duration and fitted models to lists
fit_estimators_list = []
training_durations_list = []
for clf in estimators_list:
    start_time = time.time()
    fit_estimators_list.append(clf.fit(X, y))
    stop_time = time.time()
    training_durations_list.append(stop_time - start_time)

# Store both lists now: later cells reload them from storage and would otherwise
# pick up whatever an earlier session happened to leave behind
s.store_objects(fit_estimators_list=fit_estimators_list, training_durations_list=training_durations_list)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
winsound.Beep(freq, duration)
print(f'Estimators list created in {duration_str}')

In [None]:
# A bare `raise` with no active exception raises a RuntimeError, so this cell
# stops a "Run All" at this point (presumably on purpose, so that the cells
# below are only run by hand)
raise


## Add the LDA Model to the Fit Estimators and Training Durations List

In [None]:

from sklearn.decomposition import LatentDirichletAllocation

# Train a two-topic model on the corpus, with every hyperparameter spelled out in one place
lda_kwargs = dict(
    n_components=2, doc_topic_prior=None, topic_word_prior=None, learning_method='batch',
    learning_decay=0.7, learning_offset=10.0, max_iter=10, batch_size=128, evaluate_every=-1,
    total_samples=1000000.0, perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=None,
    verbose=0, random_state=None,
)
lda = LatentDirichletAllocation(**lda_kwargs)
lda.fit(X)


## Create the Inference Durations List

In [None]:

# Score every qualification string with each fitted classifier, writing the
# probabilities into a column named after the classifier and timing each full pass
inference_durations_list = []
for clf in fit_estimators_list:
    clf_name = str(type(clf)).split('.')[-1].split("'")[0]

    # The score column can only be created once the classifier's name is known
    quals_df[clf_name] = np.nan
    start_time = time.time()
    for row_index, row_series in quals_df.iterrows():
        qualification_str = row_series.qualification_str
        if clf_name == 'LdaModel':

            # NOTE(review): this branch needs a `headers_dictionary` that this notebook
            # never defines, and doc2bow/indexing look like the gensim LdaModel API — confirm
            X_test = headers_dictionary.doc2bow(ha.html_regex_tokenizer(qualification_str))

            # Query the model being scored; the notebook-level `lda` is a
            # scikit-learn LatentDirichletAllocation, which cannot be indexed
            result_list = clf[X_test]
            if len(result_list) not in (1, 2):
                raise ValueError(f'Expected 1 or 2 topic results, got {len(result_list)}')

            # A single result is used as is; with two results the second one is used
            result_tuple = result_list[-1]

            # Assume it's the probability of the larger topic
            y_predict_proba = 1.0 - result_tuple[1]

        else:

            # These classifiers were fitted on the output of `tt`, so the same
            # fitted tf-idf transformer has to produce the features they score
            X_test = tt.transform(bq_cv.transform([qualification_str])).toarray()
            y_predict_proba = clf.predict_proba(X_test)[0][1]
        quals_df.loc[row_index, clf_name] = y_predict_proba
    stop_time = time.time()
    inference_durations_list.append(stop_time - start_time)
s.store_objects(quals_df=quals_df, inference_durations_list=inference_durations_list)

In [None]:

# Display the name of the classifier that the scoring loop above handled last
clf_name


## Add the Stacking Classifier to the Estimators, Training and Durations List

In [None]:

# StackingClassifier is not imported anywhere earlier in the notebook
from sklearn.ensemble import StackingClassifier

# Stack the standard estimators, each registered under its class name
clf = StackingClassifier(estimators=[(str(type(e)).split('.')[-1].split("'")[0], e) for e in estimators_list],
                         final_estimator=None, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0)
clf_name = str(type(clf)).split('.')[-1].split("'")[0]
quals_df = s.load_object('quals_df')
quals_df[clf_name] = np.nan

# Rebuild the feature matrix from the stored vocabulary and tf-idf transformer
bq_cv_vocab = s.load_object('bq_cv_vocab')
bq_cv = CountVectorizer(vocabulary=bq_cv_vocab)
bq_cv._validate_vocabulary()
bq_tt = s.load_object('bq_tt')
X = bq_tt.transform(bq_cv.transform(quals_df.qualification_str.tolist())).toarray()
y = quals_df.is_qualified.to_numpy()

# Fit, time the fit, and record both alongside the earlier estimators
start_time = time.time()
fit_estimators_list.append(clf.fit(X, y))
stop_time = time.time()
training_durations_list = s.load_object('training_durations_list')
training_durations_list.append(stop_time - start_time)
s.store_objects(fit_estimators_list=fit_estimators_list, training_durations_list=training_durations_list)

# Re-score the quals dataframe
inference_durations_list = s.load_object('inference_durations_list')
start_time = time.time()
for row_index, row_series in quals_df.iterrows():
    qualification_str = row_series.qualification_str
    X_test = bq_tt.transform(bq_cv.transform([qualification_str])).toarray()
    y_predict_proba = clf.predict_proba(X_test)[0][1]
    quals_df.loc[row_index, clf_name] = y_predict_proba
stop_time = time.time()
inference_durations_list.append(stop_time - start_time)
s.store_objects(quals_df=quals_df, inference_durations_list=inference_durations_list)


## Add the Voting Classifier to the Estimators, Training and Durations List

In [None]:

# VotingClassifier is not imported anywhere earlier in the notebook
from sklearn.ensemble import VotingClassifier

# Soft-vote across the standard estimators, each registered under its class name
clf = VotingClassifier(estimators=[(str(type(e)).split('.')[-1].split("'")[0], e) for e in estimators_list],
                       voting='soft', weights=None, n_jobs=None, flatten_transform=True)
clf_name = str(type(clf)).split('.')[-1].split("'")[0]
quals_df[clf_name] = np.nan

# Fit, time the fit, and record both alongside the stored estimators
fit_estimators_list = s.load_object('fit_estimators_list')
start_time = time.time()
fit_estimators_list.append(clf.fit(X, y))
stop_time = time.time()
training_durations_list = s.load_object('training_durations_list')
training_durations_list.append(stop_time - start_time)
s.store_objects(fit_estimators_list=fit_estimators_list, training_durations_list=training_durations_list)

# Re-score the html quals dataframe for prediction comparisons
bq_cv_vocab = s.load_object('bq_cv_vocab')
bq_cv = CountVectorizer(vocabulary=bq_cv_vocab)
bq_cv._validate_vocabulary()
bq_tt = s.load_object('bq_tt')
inference_durations_list = s.load_object('inference_durations_list')
start_time = time.time()
for row_index, row_series in quals_df.iterrows():
    qualification_str = row_series.qualification_str
    X_test = bq_tt.transform(bq_cv.transform([qualification_str])).toarray()
    y_predict_proba = clf.predict_proba(X_test)[0][1]
    quals_df.loc[row_index, clf_name] = y_predict_proba
stop_time = time.time()
inference_durations_list.append(stop_time - start_time)
s.store_objects(quals_df=quals_df, inference_durations_list=inference_durations_list)

In [None]:

# Key each fitted classifier by its class name and store the resulting mapping
fit_estimators_list = s.load_object('fit_estimators_list')
class_names_list = [str(type(estimator)).split('.')[-1].split("'")[0] for estimator in fit_estimators_list]
fit_estimators_dict = dict(zip(class_names_list, fit_estimators_list))
s.store_objects(FIT_ESTIMATORS_DICT=fit_estimators_dict)

In [None]:

# List the column names, then show five random rows transposed for readability
quals_columns_list = quals_df.columns.tolist()
print(quals_columns_list)
quals_df.sample(5).T

In [None]:

import sklearn.metrics
from scipy.stats import entropy

metrics_list = ['accuracy_score', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision_score',
                'balanced_accuracy_score', 'cohen_kappa_score', 'completeness_score', 'explained_variance_score',
                'f1_score', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard_score', 'mutual_info_score',
                'normalized_mutual_info_score', 'precision_score', 'r2_score', 'recall_score', 'roc_auc_score', 'v_measure_score']

# Look the metric functions up by name instead of building source strings for exec/eval.
# They are also published in the notebook namespace because the description cell
# discovers them by inspecting this module's functions
metric_functions_dict = {metric_str: getattr(sklearn.metrics, metric_str) for metric_str in metrics_list}
globals().update(metric_functions_dict)

fit_estimators_list = s.load_object('fit_estimators_list')
clf_name_list = [str(type(clf)).split('.')[-1].split("'")[0] for clf in fit_estimators_list]
quals_df = s.load_object('quals_df')
y_true = quals_df.is_qualified.tolist()
fit_match_series = (quals_df.is_qualified == True)
yes_list = quals_df[fit_match_series].is_qualified.tolist()
no_list = quals_df[~fit_match_series].is_qualified.tolist()
columns_list = ['clf_name', 'training_duration', 'inference_duration', 'boundary_diff', 'clf_yes_entropy',
                'relative_yes_entropy'] + metrics_list
rows_list = []
training_durations_list = s.load_object('training_durations_list')
inference_durations_list = s.load_object('inference_durations_list')

# One row per classifier: its durations, the custom separation measures, and every metric
for column_name, training_duration, inference_duration in zip(clf_name_list, training_durations_list, inference_durations_list):

    # Lowest score given to a qualified string and highest score given to an unqualified one
    yes_series = quals_df[fit_match_series][column_name]
    upper_bound = yes_series.min()
    no_series = quals_df[~fit_match_series][column_name]
    lower_bound = no_series.max()

    # Hard predictions obtained by thresholding the probabilities at 0.5
    scores_list = quals_df[column_name].tolist()
    y_pred = [1 if p > 0.5 else 0 for p in scores_list]

    row_dict = {
        'clf_name': column_name,
        'training_duration': training_duration,
        'inference_duration': inference_duration,
        'boundary_diff': upper_bound-lower_bound,
        'clf_yes_entropy': entropy(pk=yes_series.tolist(), base=2),
        'relative_yes_entropy': entropy(pk=yes_list, qk=yes_series.tolist(), base=2),
    }
    for metric_str, metric_fn in metric_functions_dict.items():

        # Best effort: try the raw probabilities first, fall back to the hard
        # predictions, and record NaN when the metric accepts neither
        try:
            row_dict[metric_str] = metric_fn(y_true, scores_list)
        except Exception:
            try:
                row_dict[metric_str] = metric_fn(y_true, y_pred)
            except Exception:
                row_dict[metric_str] = np.nan
    rows_list.append(row_dict)

# Columns that came out entirely NaN are dropped before indexing by classifier name
entropy_df = pd.DataFrame(rows_list, columns=columns_list).dropna(axis='columns', how='all').set_index('clf_name', drop=True)
s.store_objects(entropy_df=entropy_df)

In [None]:

# inspect is not imported anywhere earlier in the notebook
import inspect

def _first_doc_line(obj):
    """Return the first line of obj's docstring, or '' when it has no docstring."""
    return (obj.__doc__ or '').strip().split('\n')[0]

# Describe every metric function that is present in the notebook namespace
description_dict = {name: _first_doc_line(fn) for name, fn in inspect.getmembers(sys.modules[__name__], inspect.isfunction)
                    if name in metrics_list}
assert s.pickle_exists('entropy_df')
entropy_df = s.load_object('entropy_df')

# Describe every class in the namespace whose name is a row of the entropy dataframe
for name, cls in inspect.getmembers(sys.modules[__name__], inspect.isclass):
    if name in entropy_df.index:
        description_dict[name] = _first_doc_line(cls)
s.store_objects(metrics_list=metrics_list, description_dict=description_dict)

In [None]:

# Print the columns that survived in the entropy dataframe, then display the requested metric names
entropy_columns_list = entropy_df.columns.tolist()
print(entropy_columns_list)
metrics_list

In [None]:

# LdaModel rows recorded for four ways of reading the topic probability:
# 1 - second topic: LdaModel	3.973441	1.990886	0.798208	-0.819588
# 1 - first topic:  LdaModel	3.973441	2.222728	0.211814	-4.210144
# first topic:      LdaModel	3.973441	2.371093	0.787072	-0.987907
# second topic:     LdaModel	3.973441	2.371093	0.200678	-4.379083

# Durations next to the two headline scores, best balanced accuracy first
columns_list = ['training_duration', 'inference_duration', 'balanced_accuracy_score', 'r2_score']
entropy_df.loc[:, columns_list].sort_values(by='balanced_accuracy_score', ascending=False)

In [None]:

# Fill in descriptions for the custom columns that are not scikit-learn metrics,
# storing the dictionary each time an entry has to be added
description_dict = s.load_object('description_dict')
default_descriptions_list = [
    ('training_duration', 'The average training time in seconds'),
    ('inference_duration', 'The average inference time in seconds'),
    ('clf_yes_entropy', 'The entropy of the distribution for True probability values'),
]
for metric_name, description_str in default_descriptions_list:
    if metric_name not in description_dict:
        description_dict[metric_name] = description_str
        s.store_objects(description_dict=description_dict)


----

In [None]:

# Assume the entropy dataframe has been populated
entropy_df = s.load_object('entropy_df')
assert entropy_df.shape[0]
description_dict = s.load_object('description_dict')
columns_list = ['training_duration', 'inference_duration']

# Print the description of each plotted column that has one
for metric in columns_list:
    if metric not in description_dict:
        continue
    print(f'{metric}: {description_dict[metric]}')

# Bar chart of both durations, classifiers ordered by the first column
fig, ax = plt.subplots(figsize=(18, 8))
sorted_df = entropy_df.sort_values(columns_list[0], ascending=True)
AxesSubplot_obj = sorted_df[columns_list].plot.bar(rot=45, ax=ax)

In [None]:

# Assume the entropy dataframe has been populated
entropy_df = s.load_object('entropy_df')
assert entropy_df.shape[0]
description_dict = s.load_object('description_dict')
columns_list = ['balanced_accuracy_score', 'r2_score']

# Print the description of each plotted column that has one
for metric in columns_list:
    if metric not in description_dict:
        continue
    print(f'{metric}: {description_dict[metric]}')

# Bar chart of both scores, classifiers ordered by the first column
fig, ax = plt.subplots(figsize=(18, 8))
sorted_df = entropy_df.sort_values(columns_list[0], ascending=True)
AxesSubplot_obj = sorted_df[columns_list].plot.bar(rot=45, ax=ax)

In [None]:

# Assume the entropy dataframe has been populated
entropy_df = s.load_object('entropy_df')
assert entropy_df.shape[0]
description_dict = s.load_object('description_dict')
columns_list = ['training_duration', 'inference_duration', 'balanced_accuracy_score']

# Print the description of each plotted column that has one
for metric in columns_list:
    if metric not in description_dict:
        continue
    print(f'{metric}: {description_dict[metric]}')

# Bar chart on a logarithmic y-axis, classifiers ordered by the first column
fig, ax = plt.subplots(figsize=(18, 8))
ax.set_yscale('log')
sorted_df = entropy_df.sort_values(columns_list[0], ascending=True)
AxesSubplot_obj = sorted_df[columns_list].plot.bar(rot=45, ax=ax)

In [None]:

entropy_df = s.load_object('entropy_df')
assert entropy_df.shape[0]
metrics_list = s.load_object('metrics_list')
custom_metrics_list = ['boundary_diff', 'clf_yes_entropy', 'relative_yes_entropy']

# Only rank columns that are still in the dataframe: a metric that was NaN for
# every classifier was dropped when entropy_df was built and would raise a KeyError
candidate_columns_list = [cn for cn in metrics_list + custom_metrics_list if cn in entropy_df.columns]

# Keep the three columns whose values differ the most between classifiers
std_tuples_list = sorted([(cn, entropy_df[cn].std()) for cn in candidate_columns_list], key=lambda x: x[1], reverse=True)
columns_list = [cn for cn, _ in std_tuples_list][:3]
description_dict = s.load_object('description_dict')
for metric in columns_list:
    if metric in description_dict:
        print('{}: {}'.format(metric, description_dict[metric]))
df = entropy_df.sort_values(columns_list[0], ascending=True)[columns_list]
AxesSubplot_obj = df.plot.bar(rot=45, figsize=(18, 8))

In [None]:

from IPython.display import display

# Show the plotted values with a 'Standard Deviation' summary row underneath.
# DataFrame.append was removed in pandas 2.0, so the row is attached with pd.concat
row_dict = {column_name: df[column_name].std() for column_name in df.columns}
display(pd.concat([df, pd.DataFrame([row_dict], index=['Standard Deviation'])]))

In [None]:

assert entropy_df.shape[0]
metrics_list = s.load_object('metrics_list')

# Every metric whose name mentions accuracy
columns_list = list(filter(lambda cn: 'accur' in cn.lower(), metrics_list))
for metric in columns_list:
    if metric not in description_dict:
        continue
    print(f'{metric}: {description_dict[metric]}')
sorted_df = entropy_df.sort_values(columns_list[0], ascending=True)
AxesSubplot_obj = sorted_df[columns_list].plot.bar(rot=45, figsize=(18, 8))

In [None]:

# Custom separation measures, widest boundary gap first
entropy_df.sort_values(by='boundary_diff', ascending=False).loc[:, custom_metrics_list]

In [None]:

assert entropy_df.shape[0]

# Print the description of each custom metric that has one
for metric in custom_metrics_list:
    if metric not in description_dict:
        continue
    print(f'{metric}: {description_dict[metric]}')
sorted_df = entropy_df.sort_values('boundary_diff', ascending=True)
AxesSubplot_obj = sorted_df[custom_metrics_list].plot.bar(rot=45, figsize=(18, 8))

In [None]:

entropy_df = s.load_object('entropy_df')
assert entropy_df.shape[0]
columns_list = ['average_precision_score', 'precision_score', 'recall_score']
description_dict = s.load_object('description_dict')

# Print only the descriptions that exist, as the other plotting cells do,
# so a missing entry does not raise a KeyError
for metric in columns_list:
    if metric in description_dict:
        print('{}: {}'.format(metric, description_dict[metric]))
AxesSubplot_obj = entropy_df.sort_values('precision_score', ascending=True)[columns_list].plot.bar(rot=45, figsize=(18, 8))

In [None]:

# Precision and recall scores next to the durations, lowest precision first
columns_list = ['average_precision_score', 'precision_score', 'recall_score']
extended_columns_list = ['training_duration', 'inference_duration', *columns_list]
entropy_df.sort_values(by='precision_score', ascending=True).loc[:, extended_columns_list]


----

In [None]:

# Pair every fitted estimator with its class name for a quick visual check
list(zip((str(type(e)).split('.')[-1].split("'")[0] for e in fit_estimators_list), fit_estimators_list))

In [None]:

# Show the LdaModel row of the entropy dataframe as a {column: value} dictionary
idx = 'LdaModel'
mask_series = entropy_df.index == idx
lda_rows_df = entropy_df.loc[mask_series]
lda_rows_df.transpose().to_dict()[idx]

In [None]:

# Re-score the quals dataframe with the stored LdaModel, using the first topic's probability
clf_name = 'LdaModel'
FIT_ESTIMATORS_DICT = s.load_object('FIT_ESTIMATORS_DICT')
clf = FIT_ESTIMATORS_DICT[clf_name]
quals_df[clf_name] = np.nan
for row_index, row_series in quals_df.iterrows():
    qualification_str = row_series.qualification_str

    # NOTE(review): `headers_dictionary` is not defined anywhere in this notebook;
    # doc2bow looks like the gensim dictionary API — confirm where it should come from
    X_test = headers_dictionary.doc2bow(ha.html_regex_tokenizer(qualification_str))

    # Query the model loaded above; the notebook-level `lda` is a scikit-learn
    # LatentDirichletAllocation, which cannot be indexed
    result_list = clf[X_test]

    # Fail loudly on an unexpected shape instead of reusing the previous row's result
    if len(result_list) not in (1, 2):
        raise ValueError(f'Expected 1 or 2 topic results, got {len(result_list)}')

    # Assume it's the first topic
    result_tuple = result_list[0]
    y_predict_proba = result_tuple[1]

    quals_df.loc[row_index, clf_name] = y_predict_proba
s.store_objects(quals_df=quals_df)