In [1]:

%pprint
import sys
if (osp.join(os.pardir, 'py') not in sys.path): sys.path.insert(1, osp.join(os.pardir, 'py'))

Pretty printing has been turned OFF


In [10]:

from FRVRS import (nu, re, np, DataFrame, isnan, concat, nan)
import os
import pandas as pd

In [3]:

# load documents
data_frames_dict = nu.load_data_frames(deduped_lower_case_ners_df='deduped_lower_case_ners_df', domain_doc_ners_df='domain_doc_ners_df')
domain_doc_ners_df = data_frames_dict['domain_doc_ners_df']
print(domain_doc_ners_df.shape)
deduped_lower_case_ners_df = data_frames_dict['deduped_lower_case_ners_df']
print(deduped_lower_case_ners_df.shape)
column_descriptions_df = nu.get_column_descriptions(deduped_lower_case_ners_df)

Attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/deduped_lower_case_ners_df.pkl.
Attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/domain_doc_ners_df.pkl.
(302127, 16)
(47269, 18)



# Build a Model to Predict Entities from Domain Documents


## Build a model to predict *is_probe*


#### Create a data frame with all the feature columns

In [4]:

mask_series = deduped_lower_case_ners_df.text_str.isnull()
if mask_series.any():
    deduped_lower_case_ners_df = deduped_lower_case_ners_df[~mask_series]
    print(deduped_lower_case_ners_df.shape)
    nu.store_objects(deduped_lower_case_ners_df=deduped_lower_case_ners_df)
    nu.save_data_frames(deduped_lower_case_ners_df=deduped_lower_case_ners_df)

In [5]:

type_columns = ['ent_type', 'nlp_type', 'bert_entity']
text_columns = ['ent_phrase', 'nlp_word', 'bert_word']

# Get the person/date type mask
type_values = ['PERSON', 'DATE', 'I-PER']
base_mask_series = True
for type_column in type_columns: base_mask_series &= ~domain_doc_ners_df[type_column].isin(type_values)

# Get the null mask
null_mask_series = False
for text_column in text_columns: null_mask_series |= ~domain_doc_ners_df[text_column].isnull()

# Get the non-person/date entities
mask_series = base_mask_series & null_mask_series
non_named_df = domain_doc_ners_df[mask_series]

# Get the is_probe values from the old dataset
def f(text_str):
    mask_series = False
    for text_column in text_columns: mask_series |= non_named_df[text_column].map(lambda x: str(x).lower() == text_str)
    df = non_named_df[mask_series]
    if df.is_probe.nunique() == 1: return df.iloc[0].is_probe
    # elif df.shape[0] == 0: print(text_str); display(df); raise
    elif df.is_probe.any():
        domain_doc_ners_df.loc[df.index, 'is_probe'] = True
        nu.store_objects(domain_doc_ners_df=domain_doc_ners_df)
        nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)
        return True
    else: print(text_str); display(df); raise

if 'is_probe' not in deduped_lower_case_ners_df.columns:
    deduped_lower_case_ners_df['is_probe'] = deduped_lower_case_ners_df.text_str.map(f)
    print(deduped_lower_case_ners_df.shape)
    nu.store_objects(deduped_lower_case_ners_df=deduped_lower_case_ners_df)
    nu.save_data_frames(deduped_lower_case_ners_df=deduped_lower_case_ners_df)

In [6]:

# Display a sample of the data frame, dropping columns with all NaN values and transposing it
display(deduped_lower_case_ners_df.sample(min(10, deduped_lower_case_ners_df.shape[0])).dropna(axis='columns', how='all').T)

Unnamed: 0,2,41206,42524,43370,43620,2415,7522,16639,13986,24440
text_str,geneva convention iii relative,quartermaster,1–18,existence of a war for purposes of applying th...,governmentally,misusing,metaethical,91–113,ashford da,drip
ent_phrase,Geneva Convention III Relative,,,EXISTENCE OF A WAR FOR PURPOSES OF APPLYING TH...,,,,,Ashford DA,
ent_type,WORK_OF_ART,,,WORK_OF_ART,,,,,ORG,
ent_start,92815.0,,,38536.0,,,,,91891.0,
ent_end,92845.0,,,38605.0,,,,,91901.0,
nlp_word,,,1–18,,governmentally,,metaethical,91–113,,
nlp_tag,,,CD,,RB,VBG,JJ,NNP,,
nlp_type,,,CARDINAL,,,,,,,
nlp_pofs,,,NUM,,ADV,VERB,ADJ,PROPN,,
is_probe,False,False,False,False,False,False,False,False,False,False



#### One-hot encode it

In [7]:

# Get rid of wierd tag categories
if deduped_lower_case_ners_df.nlp_tag.isin([':', '.', "''", '_SP', ',', 'PRP$', '``', '$', '-LRB-', '-RRB-', 'WP$']).any():
    mask_series = (deduped_lower_case_ners_df.nlp_tag == ':')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'COLON'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == '.')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'FULL_STOP'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == "''")
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'DOUBLE_PRIME'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == '_SP')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'UNDERSCORE_SP'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == ',')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'COMMA'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == 'PRP$')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'PRP_DOLLAR_SIGN'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == '``')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'DOUBLE_BACKTICK'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == '$')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'DOLLAR_SIGN'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == '-LRB-')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'LEFT_ROUND_BRACKET'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == '-RRB-')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'RIGHT_ROUND_BRACKET'
    mask_series = (deduped_lower_case_ners_df.nlp_tag == 'WP$')
    deduped_lower_case_ners_df.loc[mask_series, 'nlp_tag'] = 'WP_DOLLAR_SIGN'
    nu.store_objects(deduped_lower_case_ners_df=deduped_lower_case_ners_df)
    nu.save_data_frames(deduped_lower_case_ners_df=deduped_lower_case_ners_df)

In [8]:

assert deduped_lower_case_ners_df.columns[deduped_lower_case_ners_df.columns.duplicated(keep=False)].shape[0] == 0, "You've got duped columns"

In [16]:

assert len(
    [cn for cn in deduped_lower_case_ners_df.nlp_tag.unique() if re.search('[^A-Z_]+', str(cn)) and not isnan(cn)]
) == 0, "You've got wierd tags, still"

In [17]:

columns_list = pd.get_dummies(deduped_lower_case_ners_df[['nlp_tag']], dummy_na=True).columns
assert columns_list[columns_list.duplicated(keep=False)].shape[0] == 0, "You've got duplicated dummy columns"

In [18]:

# Get the supervised learning and group by columns
input_features = [
    'ent_type', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'bert_entity'
    ]
target_variable = 'is_probe'

In [19]:

# One-hot encode the input features columns in the one-hot encode data frame
if nu.pickle_exists('one_hot_encode_df'): one_hot_encode_df = nu.load_object('one_hot_encode_df')
else:
    ascii_regex = re.compile('[^a-z0-9]+')
    one_hot_encode_df = nu.one_hot_encode(deduped_lower_case_ners_df, input_features)
    one_hot_encode_df = one_hot_encode_df.rename(columns={cn: ascii_regex.sub('_', cn.lower()).strip('_') for cn in one_hot_encode_df.columns})
    
    # Remove the columns with nulls in them
    columns_list = sorted(set(one_hot_encode_df.columns) - set(one_hot_encode_df.dropna(axis='columns', how='any').columns))
    one_hot_encode_df = one_hot_encode_df.drop(columns=columns_list)
    assert one_hot_encode_df.shape == one_hot_encode_df.dropna(axis='index', how='any').shape, "You don't understand how one-hot encoding works"
    
    nu.store_objects(one_hot_encode_df=one_hot_encode_df, verbose=False)
columns_list = [cn for cn in one_hot_encode_df.columns if any(map(lambda x: cn.endswith(x), ['_null', '_nan']))]
print(columns_list)
df = one_hot_encode_df.sample(min(18, one_hot_encode_df.shape[0])).dropna(axis='columns', how='all').T
display(df.sample(min(12, df.shape[0])).sort_index())

['ent_type_nan', 'nlp_tag_nan', 'nlp_type_nan', 'nlp_pofs_nan', 'bert_entity_nan']


Unnamed: 0,28492,28666,1792,32391,41890,43872,45233,35945,19700,2302,360,18687,3510,38403,36201,29428,4515,39324
ent_type_percent,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ent_type_time,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ent_type_work_of_art,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
nlp_pofs_cconj,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
nlp_tag_jj,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
nlp_tag_left_round_bracket,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
nlp_tag_nan,0,0,1,1,1,0,1,0,0,1,1,0,1,0,0,1,0,1
nlp_tag_nfp,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
nlp_tag_vb,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
nlp_tag_wp_dollar_sign,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:

# Verify no duplicate columns
assert one_hot_encode_df.columns[one_hot_encode_df.columns.duplicated(keep=False)].shape[0] == 0, "One hot has duplicate columns"

In [47]:

dropped_columns = ['text_str', 'is_probe', 'lr_is_probe', 'rf_is_probe', 'hgb_is_probe']
sorted(set(one_hot_encode_df.columns) - set(dropped_columns))

['bert_entity_i_loc', 'bert_entity_i_misc', 'bert_entity_i_org', 'bert_entity_nan', 'ent_type_cardinal', 'ent_type_event', 'ent_type_fac', 'ent_type_gpe', 'ent_type_law', 'ent_type_loc', 'ent_type_money', 'ent_type_nan', 'ent_type_norp', 'ent_type_org', 'ent_type_percent', 'ent_type_product', 'ent_type_quantity', 'ent_type_time', 'ent_type_work_of_art', 'nlp_pofs_adj', 'nlp_pofs_adp', 'nlp_pofs_adv', 'nlp_pofs_aux', 'nlp_pofs_cconj', 'nlp_pofs_det', 'nlp_pofs_intj', 'nlp_pofs_nan', 'nlp_pofs_noun', 'nlp_pofs_num', 'nlp_pofs_part', 'nlp_pofs_pron', 'nlp_pofs_propn', 'nlp_pofs_punct', 'nlp_pofs_sconj', 'nlp_pofs_space', 'nlp_pofs_sym', 'nlp_pofs_verb', 'nlp_pofs_x', 'nlp_tag_add', 'nlp_tag_cc', 'nlp_tag_cd', 'nlp_tag_colon', 'nlp_tag_comma', 'nlp_tag_dollar_sign', 'nlp_tag_double_backtick', 'nlp_tag_double_prime', 'nlp_tag_dt', 'nlp_tag_full_stop', 'nlp_tag_fw', 'nlp_tag_hyph', 'nlp_tag_in', 'nlp_tag_jj', 'nlp_tag_jjr', 'nlp_tag_jjs', 'nlp_tag_left_round_bracket', 'nlp_tag_ls', 'nlp_tag_

In [25]:

# Analyze the input features
if nu.pickle_exists('one_hot_column_descriptions_df'): one_hot_column_descriptions_df = nu.load_object('one_hot_column_descriptions_df')
else:
    one_hot_column_descriptions_df = nu.get_column_descriptions(one_hot_encode_df.sample(min(20, one_hot_encode_df.shape[0])))
    nu.store_objects(one_hot_column_descriptions_df=one_hot_column_descriptions_df, verbose=False)
    
display(one_hot_column_descriptions_df.sample(min(10, one_hot_column_descriptions_df.shape[0])).sort_index())

Unnamed: 0,column_name,dtype,count_blanks,count_uniques,count_zeroes,has_dates,min_value,max_value,only_integers
2,ent_type_cardinal,uint8,0,1,20,True,0,0,True
16,ent_type_nan,uint8,0,2,4,True,0,1,True
19,nlp_tag_cd,uint8,0,2,18,True,0,1,True
24,nlp_tag_double_prime,uint8,0,1,20,True,0,0,True
40,nlp_tag_nns,uint8,0,1,20,True,0,0,True
43,nlp_tag_prp_dollar_sign,uint8,0,1,20,True,0,0,True
48,nlp_tag_sym,uint8,0,1,20,True,0,0,True
57,nlp_tag_vbz,uint8,0,1,20,True,0,0,True
77,nlp_type_quantity,uint8,0,1,20,True,0,0,True
83,nlp_pofs_adv,uint8,0,2,19,True,0,1,True



#### Train a classifier on it

In [53]:

# Train a classifier on the data frame
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
df = one_hot_encode_df.dropna(axis='columns', how='all').dropna(axis='index', how='any')
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[cn for cn in dropped_columns if cn in df.columns]),
    df.is_probe,
    test_size=0.25,
    random_state=42
)

# Convert the uint8 features to floats
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [56]:

# Create a random forest classifier
# The model is biased towards the majority class (False) and struggles to identify the minority class (True)
rf_classifier = RandomForestClassifier(class_weight='balanced')
rf_classifier.fit(X_train, y_train)

# Evaluate the classifier on the test set
y_pred = rf_classifier.predict(X_test)
rf_accuracy = np.mean(y_pred == y_test)

# Calculate the feature importances
feature_importances = rf_classifier.feature_importances_

# Create a data frame to store the feature names and feature importances
feature_importances_df = DataFrame()
feature_importances_df['feature_name'] = df.drop(columns=[cn for cn in dropped_columns if cn in df.columns]).columns
feature_importances_df['feature_importance'] = feature_importances

In [58]:

# Train a Logistic Regression (aka logit, MaxEnt) classifier
# The model is biased towards the majority class (False) and struggles to identify the minority class (True)
lr_classifier = LogisticRegression(class_weight='balanced')
lr_classifier.fit(X_train, y_train)

# Evaluate the classifier on the test set
y_pred = lr_classifier.predict(X_test)
lr_accuracy = np.mean(y_pred == y_test)

# Get the coefficients of the logistic regression model
feature_coefficients = lr_classifier.coef_[0]

# Create a data frame to store the feature names and feature coefficients
feature_coefficients_df = DataFrame()
feature_coefficients_df['feature_name'] = df.drop(columns=[cn for cn in dropped_columns if cn in df.columns]).columns
feature_coefficients_df['feature_coefficient'] = feature_coefficients
feature_coefficients_df['absolute_coefficient'] = feature_coefficients_df.feature_coefficient.map(lambda x: abs(x))

In [60]:

# Train a Histogram-based Gradient Boosting classifier with balanced class weights
# minority_class_mask_series = (y_train == True)
# minority_class_weight = 1
# majority_class_weight = 0.1
# sample_weight=np.where(minority_class_mask_series, minority_class_weight, majority_class_weight)
hgb_classifier = HistGradientBoostingClassifier()
hgb_classifier.fit(X_train, y_train)

# Evaluate the classifier on the test set
y_pred = hgb_classifier.predict(X_test)
hgb_accuracy = np.mean(y_pred == y_test)

# Calculate the feature importances using the Permutation Importance algorithm
from sklearn.inspection import permutation_importance
hgb_permutation_importances = permutation_importance(hgb_classifier, X_test, y_test)

# Create a data frame to store the feature names and feature coefficients
hgb_permutation_importances_df = DataFrame()
hgb_permutation_importances_df['feature_name'] = df.drop(columns=[cn for cn in dropped_columns if cn in df.columns]).columns
for fn in dir(hgb_permutation_importances):
    if (fn == 'importances'): continue
    else: hgb_permutation_importances_df[fn] = eval(f'hgb_permutation_importances.{fn}')

In [61]:

display(hgb_classifier.classes_)

array([False,  True])

In [62]:

display(hgb_classifier.feature_names_in_)

array(['ent_type_cardinal', 'ent_type_event', 'ent_type_fac',
       'ent_type_gpe', 'ent_type_law', 'ent_type_loc', 'ent_type_money',
       'ent_type_norp', 'ent_type_org', 'ent_type_percent',
       'ent_type_product', 'ent_type_quantity', 'ent_type_time',
       'ent_type_work_of_art', 'ent_type_nan', 'nlp_tag_add',
       'nlp_tag_cc', 'nlp_tag_cd', 'nlp_tag_colon', 'nlp_tag_comma',
       'nlp_tag_dollar_sign', 'nlp_tag_double_backtick',
       'nlp_tag_double_prime', 'nlp_tag_dt', 'nlp_tag_full_stop',
       'nlp_tag_fw', 'nlp_tag_hyph', 'nlp_tag_in', 'nlp_tag_jj',
       'nlp_tag_jjr', 'nlp_tag_jjs', 'nlp_tag_left_round_bracket',
       'nlp_tag_ls', 'nlp_tag_md', 'nlp_tag_nfp', 'nlp_tag_nn',
       'nlp_tag_nnp', 'nlp_tag_nnps', 'nlp_tag_nns', 'nlp_tag_pos',
       'nlp_tag_prp', 'nlp_tag_prp_dollar_sign', 'nlp_tag_rb',
       'nlp_tag_rbr', 'nlp_tag_right_round_bracket', 'nlp_tag_rp',
       'nlp_tag_sym', 'nlp_tag_to', 'nlp_tag_uh', 'nlp_tag_underscore_sp',
       'nlp_tag_v

In [63]:

display(hgb_classifier.n_features_in_)

102


#### Evaluate the accuracies and importances

In [64]:

# Print the accuracies
print('RF Accuracy:', rf_accuracy)
print('LR Accuracy:', lr_accuracy)
print('HGB Accuracy:', hgb_accuracy)

# Display the feature importances data frame
display(feature_importances_df.sort_values('feature_importance', ascending=False).head(10))

# Display the feature coefficients data frame
columns_list = ['feature_name', 'feature_coefficient']
display(feature_coefficients_df.sort_values('absolute_coefficient', ascending=False)[columns_list].head(10))

# Display the permutation importances data frame
columns_list = ['feature_name', 'importances_mean']
df = hgb_permutation_importances_df.drop(columns_list, axis='columns')
max_importance = df.max().max()
columns_list += df.columns[df.eq(max_importance).any()].tolist()[:7-len(columns_list)]
display(hgb_permutation_importances_df.sort_values('importances_mean', ascending=False)[columns_list].head(10))

RF Accuracy: 0.6769334912844813
LR Accuracy: 0.6637332882044339
HGB Accuracy: 0.9974614994076832


Unnamed: 0,feature_name,feature_importance
97,nlp_pofs_nan,0.160031
61,nlp_tag_nan,0.133636
14,ent_type_nan,0.085094
8,ent_type_org,0.054013
0,ent_type_cardinal,0.042695
90,nlp_pofs_propn,0.038689
95,nlp_pofs_verb,0.036264
78,nlp_type_nan,0.030506
10,ent_type_product,0.029586
35,nlp_tag_nn,0.025412


Unnamed: 0,feature_name,feature_coefficient
50,nlp_tag_vb,4.507981
60,nlp_tag_xx,4.292139
0,ent_type_cardinal,-3.458741
10,ent_type_product,3.195832
65,nlp_type_gpe,3.12039
62,nlp_type_cardinal,-3.116546
4,ent_type_law,-3.024457
5,ent_type_loc,2.957711
2,ent_type_fac,2.904268
97,nlp_pofs_nan,2.806724


Unnamed: 0,feature_name,importances_mean,importances_std
0,ent_type_cardinal,0.0,0.0
64,nlp_type_fac,0.0,0.0
74,nlp_type_product,0.0,0.0
73,nlp_type_percent,0.0,0.0
72,nlp_type_org,0.0,0.0
71,nlp_type_ordinal,0.0,0.0
70,nlp_type_norp,0.0,0.0
69,nlp_type_money,0.0,0.0
68,nlp_type_loc,0.0,0.0
67,nlp_type_law,0.0,0.0



#### Perform a sample inference

In [65]:

# Display the one-hot encoded sample
input_encode_df = one_hot_encode_df.sample(1)
input_encode_idx = input_encode_df.index.tolist()[0]
print(input_encode_idx)
print(input_encode_df.shape)
mask_series = (input_encode_df.T[input_encode_idx] == 0)
df = input_encode_df.T[~mask_series]
enc_idx_list = df.index.tolist()
display(df)

39412
(1, 107)


Unnamed: 0,39412
text_str,code of ethics for nurses
ent_type_work_of_art,1
nlp_tag_nan,1
nlp_type_nan,1
nlp_pofs_nan,1
bert_entity_nan,1
lr_is_probe,
rf_is_probe,
hgb_is_probe,


In [66]:

# Display the NERs sample
mask_series = (deduped_lower_case_ners_df.index == input_encode_idx)
df = deduped_lower_case_ners_df[mask_series]
print(df.shape)
cn_set = set()
for cn in deduped_lower_case_ners_df.columns:
    for enc_idx in enc_idx_list:
        if enc_idx.startswith(cn): cn_set.add(cn)
columns_list = ['is_probe'] + list(cn_set)
display(df[columns_list].T)

(1, 18)


Unnamed: 0,39412
is_probe,False
nlp_type,
text_str,code of ethics for nurses
ent_type,WORK_OF_ART
lr_is_probe,
bert_entity,
hgb_is_probe,
nlp_tag,
rf_is_probe,
nlp_pofs,


In [67]:

# Convert the input features to a NumPy array
input_features_array = np.array(input_encode_df.drop(columns=dropped_columns).values)
actual_is_probe = input_encode_df.is_probe.squeeze()

In [68]:

# Predict the LR is probe
predicted_is_probe = sorted(
    [(c, p) for c, p in zip(lr_classifier.classes_, lr_classifier.predict_proba(input_features_array)[0])], key=lambda x: x[1], reverse=True
)[0][0]
print(f'LR: predicted: {predicted_is_probe}, actual: {actual_is_probe}')

LR: predicted: True, actual: False


In [69]:

# Predict the RF is probe
predicted_is_probe = sorted(
    [(c, p) for c, p in zip(rf_classifier.classes_, rf_classifier.predict_proba(input_features_array)[0])], key=lambda x: x[1], reverse=True
)[0][0]
print(f'RF: predicted: {predicted_is_probe}, actual: {actual_is_probe}')

RF: predicted: True, actual: False


In [70]:

# Predict the HGB is probe
predicted_is_probe = sorted(
    [(c, p) for c, p in zip(hgb_classifier.classes_, hgb_classifier.predict_proba(input_features_array)[0])], key=lambda x: x[1], reverse=True
)[0][0]
print(f'HGB: predicted: {predicted_is_probe}, actual: {actual_is_probe}')

HGB: predicted: False, actual: False


In [71]:

# Add the predicted columns
one_hot_encode_df['lr_is_probe'] = nan
one_hot_encode_df['rf_is_probe'] = nan
one_hot_encode_df['hgb_is_probe'] = nan
for input_encode_idx, input_encode_series in one_hot_encode_df.iterrows():
    
    # Convert the input features to a NumPy array
    input_features_array = np.array(input_encode_series.drop(index=dropped_columns).values).reshape(1, -1)
    actual_is_probe = input_encode_series.is_probe
    # assert deduped_lower_case_ners_df.loc[input_encode_idx, 'is_probe'] == actual_is_probe, "You don't understand how dataframes work"
    
    # Predict the LR is probe
    lr_is_probe = sorted(
        [(c, p) for c, p in zip(lr_classifier.classes_, lr_classifier.predict_proba(input_features_array)[0])], key=lambda x: x[1],
        reverse=True
    )[0][0]
    one_hot_encode_df.loc[input_encode_idx, 'lr_is_probe'] = lr_is_probe
    
    # Predict the RF is probe
    rf_is_probe = sorted(
        [(c, p) for c, p in zip(rf_classifier.classes_, rf_classifier.predict_proba(input_features_array)[0])], key=lambda x: x[1],
        reverse=True
    )[0][0]
    one_hot_encode_df.loc[input_encode_idx, 'rf_is_probe'] = rf_is_probe
    
    # Predict the HGB is probe
    hgb_is_probe = sorted(
        [(c, p) for c, p in zip(hgb_classifier.classes_, hgb_classifier.predict_proba(input_features_array)[0])], key=lambda x: x[1],
        reverse=True
    )[0][0]
    one_hot_encode_df.loc[input_encode_idx, 'hgb_is_probe'] = hgb_is_probe

In [72]:

nu.store_objects(one_hot_encode_df=one_hot_encode_df)
nu.save_data_frames(one_hot_encode_df=one_hot_encode_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/one_hot_encode_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/one_hot_encode_df.csv


In [73]:

sorted(one_hot_encode_df.columns)

['bert_entity_i_loc', 'bert_entity_i_misc', 'bert_entity_i_org', 'bert_entity_nan', 'ent_type_cardinal', 'ent_type_event', 'ent_type_fac', 'ent_type_gpe', 'ent_type_law', 'ent_type_loc', 'ent_type_money', 'ent_type_nan', 'ent_type_norp', 'ent_type_org', 'ent_type_percent', 'ent_type_product', 'ent_type_quantity', 'ent_type_time', 'ent_type_work_of_art', 'hgb_is_probe', 'is_probe', 'lr_is_probe', 'nlp_pofs_adj', 'nlp_pofs_adp', 'nlp_pofs_adv', 'nlp_pofs_aux', 'nlp_pofs_cconj', 'nlp_pofs_det', 'nlp_pofs_intj', 'nlp_pofs_nan', 'nlp_pofs_noun', 'nlp_pofs_num', 'nlp_pofs_part', 'nlp_pofs_pron', 'nlp_pofs_propn', 'nlp_pofs_punct', 'nlp_pofs_sconj', 'nlp_pofs_space', 'nlp_pofs_sym', 'nlp_pofs_verb', 'nlp_pofs_x', 'nlp_tag_add', 'nlp_tag_cc', 'nlp_tag_cd', 'nlp_tag_colon', 'nlp_tag_comma', 'nlp_tag_dollar_sign', 'nlp_tag_double_backtick', 'nlp_tag_double_prime', 'nlp_tag_dt', 'nlp_tag_full_stop', 'nlp_tag_fw', 'nlp_tag_hyph', 'nlp_tag_in', 'nlp_tag_jj', 'nlp_tag_jjr', 'nlp_tag_jjs', 'nlp_tag_l

In [74]:

one_hot_encode_df.is_probe.unique()

array([False,  True])

In [75]:

one_hot_encode_df.lr_is_probe.unique()

array([True, False], dtype=object)

In [76]:

one_hot_encode_df.rf_is_probe.unique()

array([True, False], dtype=object)

In [77]:

one_hot_encode_df.hgb_is_probe.unique()

array([False], dtype=object)

In [78]:

mask_series = False
for cn in ['lr_is_probe', 'rf_is_probe', 'hgb_is_probe']:
    mask_series = ~one_hot_encode_df.is_probe.isnull() & ~one_hot_encode_df[cn].isnull()
    mask_series &= (one_hot_encode_df[cn] != one_hot_encode_df.is_probe)
    df = one_hot_encode_df[mask_series]
    if df.shape[0]:
        display(df.sample(min(11, df.shape[0])).dropna(axis='columns', how='all').T)
        break

Unnamed: 0,43210,40183,37002,273,3089,2818,8690,14115,5805,40594,46089
text_str,a person responsible for his or her subordinates,m carey & sons,refining,early,—united states,reactive skin decontamination lotion,"washington, dc\n",the un general assembly,rome statute party,university of california at san francisco,perform descriptive epidemiology \ninformation
is_probe,False,False,False,False,False,False,False,False,False,False,False
ent_type_cardinal,0,0,0,0,0,0,0,0,0,0,0
ent_type_event,0,0,0,0,0,0,0,0,0,0,0
ent_type_fac,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
bert_entity_i_org,0,0,0,0,0,0,0,0,0,0,0
bert_entity_nan,1,1,1,1,1,1,1,1,1,1,1
lr_is_probe,True,True,True,True,True,True,True,True,True,True,True
rf_is_probe,True,True,True,True,False,True,False,True,True,True,True


In [79]:

columns_list = ['is_probe', 'lr_is_probe', 'rf_is_probe', 'hgb_is_probe']
df = one_hot_encode_df.drop_duplicates(subset=columns_list)
if df.shape[0]: display(df.sample(min(14, df.shape[0])).dropna(axis='columns', how='all').T)

Unnamed: 0,24,135,3,0,204,2317
text_str,"washington, dc: \ngovernment printing office",needleless,1959;59(1):1–9,healing,current resources - yes\nimmediate\nlikely,pneumatic
is_probe,False,True,False,False,True,True
ent_type_cardinal,0,0,0,0,0,0
ent_type_event,0,0,0,0,0,0
ent_type_fac,0,0,0,0,0,0
...,...,...,...,...,...,...
bert_entity_i_org,0,0,0,0,0,0
bert_entity_nan,1,1,1,1,1,1
lr_is_probe,True,False,False,True,True,True
rf_is_probe,False,False,False,True,True,False


In [86]:

mask_series = False
for cn in ['lr_is_probe', 'rf_is_probe', 'hgb_is_probe']: mask_series |= (one_hot_encode_df[cn] == True)
mask_series &= (one_hot_encode_df.is_probe == False)
df = one_hot_encode_df[mask_series]
if df.shape[0]:
    print(len(df.text_str.tolist()))
    display(df.sample(min(14, df.shape[0])).dropna(axis='columns', how='all').T)

15760


Unnamed: 0,1356,24539,23304,613,14278,26249,19057,37614,6783,14332,27780,31455,23691,20539
text_str,1983–1984,uniformed services \nuniversity of health scie...,the emancipation proclamation,”1(p33,malaria,concise,insertion,boston marathon,algorithms,largest,− reassess,the international committee of the red\nfig,el al45,hastings cent
is_probe,False,False,False,False,False,False,False,False,False,False,False,False,False,False
ent_type_cardinal,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ent_type_event,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ent_type_fac,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bert_entity_i_org,0,0,0,0,0,0,0,0,0,0,0,0,0,0
bert_entity_nan,1,1,1,1,1,1,1,1,1,1,1,1,1,1
lr_is_probe,True,True,True,True,True,True,True,True,True,True,True,True,True,True
rf_is_probe,True,True,True,True,True,True,True,True,True,True,True,True,False,True
