In [1]:

# Set up notebook
%pprint
import sys
if (osp.join(os.pardir, 'py') not in sys.path): sys.path.insert(1, osp.join(os.pardir, 'py'))

Pretty printing has been turned OFF


In [2]:

# load libraries
from FRVRS import (nu, walk, nan, DataFrame, isnan, display)


# Build a Model to Predict Military Membership

In [3]:

# Collect the base names of the stored data frame files: every *.csv or *.pkl file found under
# nu.github_folder whose name (minus the extension) ends in "_df"
df_name_set = {
    stem
    for _, _, files_list in walk(nu.github_folder)
    for stem, _, file_extension in (file_name.rpartition('.') for file_name in files_list)
    if (file_extension in ('csv', 'pkl')) and stem.endswith('_df')
}

# Sort for a stable, repeatable processing order
df_names_list = sorted(df_name_set)

In [4]:

# Aggregate every non-null patient ID found across the stored data frames
patient_id_set = set()
for df_name in df_names_list:
    data_frames_dict = nu.load_data_frames(verbose=False, **{df_name: ''})
    df = data_frames_dict[df_name]

    # Skip entries that came back as None (the maintenance cell at the end of this notebook guards
    # against the same case) and data frames that carry no patient IDs at all
    if (df is None) or ('patient_id' not in df.columns):
        continue

    patient_id_set.update(df.patient_id.dropna().unique())

# Sort for a stable, repeatable ordering of the IDs
patient_ids_list = sorted(patient_id_set)


### Vectorize a corpus of patient IDs using the bag of words model

In [5]:

# Label every patient ID from keywords in its (lower-cased) text: 0 when it mentions a civilian keyword,
# 1 when it mentions a military keyword, and NaN when neither list matches. Civilian is checked first.
civilian_keywords = ['civilian']
military_keywords = ['officer', 'soldier', 'marine', 'military', 'navy']
rows_list = []
for patient_id in patient_ids_list:
    lowered_id = patient_id.lower()
    if any(keyword in lowered_id for keyword in civilian_keywords):
        label = 0
    elif any(keyword in lowered_id for keyword in military_keywords):
        label = 1
    else:
        label = nan
    rows_list.append({'patient_id': patient_id, 'is_military': label})
training_df = DataFrame(rows_list)

# Show a handful of random rows as a sanity check
training_df.sample(5)

Unnamed: 0,patient_id,is_military
6,Bob_1 Root,
135,electrician Root,
125,Simulation,
39,Intelligence Officer Root,1.0
37,Helga_9 Root,


In [6]:

# Split every patient ID on single spaces; the corpus is one list of parts per patient ID
corpus = [patient_id.split(' ') for patient_id in training_df.patient_id]

In [9]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from humanize.number import intword

# Bag of words: fit a case-sensitive word-level count vectorizer on the flattened patient ID parts
cv = CountVectorizer(
    analyzer='word', binary=False, decode_error='strict', lowercase=False, max_df=1.0,
    max_features=None, min_df=0.0, ngram_range=(1, 5), stop_words=None, strip_accents='ascii',
)
id_parts_list = [id_part for id_parts in corpus for id_part in id_parts]
bow_matrix = cv.fit_transform(id_parts_list)
bow_rows, bow_columns = bow_matrix.shape
print(f'bow_matrix.shape: ({intword(bow_rows)} rows, {intword(bow_columns)} columns)')

# TF-IDF: re-weight the raw counts (L1-normalized rows, smoothed IDF)
tt = TfidfTransformer(norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True)
tfidf_matrix = tt.fit_transform(bow_matrix)
tfidf_rows, tfidf_columns = tfidf_matrix.shape
print(f'tfidf_matrix.shape: ({intword(tfidf_rows)} rows, {intword(tfidf_columns)} columns)')

bow_matrix.shape: (442 rows, 96 columns)
tfidf_matrix.shape: (442 rows, 96 columns)



#### Train a model on the vectorized patient IDs

In [8]:

# Name the feature column(s) fed to the model and the label column it learns to predict
input_features, target_variable = ['patient_id'], 'is_military'

In [10]:

# Keep only the rows that have a label, then refit the count vectorizer and the TF-IDF transformer
# on the patient IDs of those labelled rows
columns_list = [target_variable, *input_features]
df = training_df.loc[:, columns_list].dropna()

# Training inputs (patient ID strings) and their 0/1 labels
train_data_list = df['patient_id'].tolist()
train_labels_list = df['is_military'].to_numpy()

# Counts first, TF-IDF weighting second
X_train_counts = cv.fit_transform(train_data_list)
X_train_tfidf = tt.fit_transform(X_train_counts)

In [15]:

# Loop through different hyperparameter combinations to compare how closely the classifier meets the criterion
from sklearn.linear_model import SGDClassifier

# Column order of the results table; passing it to DataFrame also guarantees every column exists
criterion_columns = [
    'loss_function', 'penalty', 'l1_ratio', 'civilian_high_bar', 'military_low_bar',
    'predict_proba_max', 'predict_proba_min', 'classifier_meets_criterion'
]

# cv and tt are already fitted and are not refit in this cell, so every patient ID only needs to be
# vectorized once for the whole sweep
X_all = tt.transform(cv.transform(training_df.patient_id)).toarray()

# Row masks for the three label groups: known civilian (0.0), known military (1.0), unknown (NaN)
civilian_mask = (training_df.is_military == 0.0)
military_mask = (training_df.is_military == 1.0)
mask_series = training_df.is_military.isnull()

# SGDClassifier is created without a random_state, so each pass over the grid can give different fits.
# Repeat the sweep until the last classifier fitted assigns a military probability of 0.0 (or lower) to
# at least one unknown patient ID. The check sits at the bottom of the loop so that the cell works on a
# fresh kernel, where is_military_df does not exist yet.
while True:

    # Loop through different hyperparameter combinations
    rows_list = []
    for loss_function in ['modified_huber']:  # Add more loss functions (e.g., 'log') if desired
        for penalty in ['l2', 'l1']:  # Add more penalties (e.g., 'elasticnet') if desired

            # NOTE(review): per the scikit-learn docs l1_ratio only takes effect with penalty='elasticnet',
            # so with 'l2'/'l1' these three values presumably act as repeated random restarts - confirm
            for l1_ratio in [0.925, 0.95, 0.975]:
                try:

                    # Create a classifier with the current hyperparameters and train it on the labelled data
                    classifier = SGDClassifier(loss=loss_function, penalty=penalty, l1_ratio=l1_ratio)
                    classifier.fit(X_train_tfidf, train_labels_list)

                    # Store the predicted probability of being military (class 1) for every patient ID
                    training_df['predict_proba'] = classifier.predict_proba(X_all)[:, 1]

                    # Highest predicted probability among the known civilians
                    civilian_high_bar = training_df.loc[civilian_mask, 'predict_proba'].max()

                    # Lowest predicted probability among the known military
                    military_low_bar = training_df.loc[military_mask, 'predict_proba'].min()

                    # Range of the predictions for the unknown group
                    is_military_df = training_df[mask_series]
                    predict_proba_max = is_military_df.predict_proba.max()
                    predict_proba_min = is_military_df.predict_proba.min()

                    # The criterion is met when the unknown predictions span the labelled range: the highest
                    # unknown prediction reaches the military low bar and the lowest unknown prediction is
                    # at or below the civilian high bar
                    classifier_meets_criterion = (
                        (predict_proba_max >= military_low_bar) and (predict_proba_min <= civilian_high_bar)
                    )

                    rows_list.append({
                        'loss_function': loss_function, 'penalty': penalty, 'l1_ratio': l1_ratio,
                        'civilian_high_bar': civilian_high_bar, 'military_low_bar': military_low_bar,
                        'predict_proba_max': predict_proba_max, 'predict_proba_min': predict_proba_min,
                        'classifier_meets_criterion': classifier_meets_criterion
                    })
                except Exception as e:
                    print(
                        f'{e.__class__.__name__} error using {loss_function} with a {penalty} penalty and'
                        f' l1_ratio={l1_ratio}: {str(e).strip()}'
                    )

    # Create a Data Frame from the list of results for further analysis
    meets_criterion_df = DataFrame(rows_list, columns=criterion_columns)

    # Nothing succeeded (the errors are printed above), so there is no prediction to test; stop retrying
    if not rows_list:
        break

    # Stop once the unknown group's lowest prediction is no longer above zero
    if not (is_military_df.predict_proba.min() > 0.0):
        break

In [16]:

# Order the sweep results by the unknown group's lowest predicted probability (ascending), breaking ties
# on whether the criterion was met, so the candidates with the lowest minimum come first
meets_criterion_df.sort_values(by=['predict_proba_min', 'classifier_meets_criterion'], ascending=True)

Unnamed: 0,loss_function,penalty,l1_ratio,civilian_high_bar,military_low_bar,predict_proba_max,predict_proba_min,classifier_meets_criterion
1,modified_huber,l2,0.95,0.029012,0.956603,1.0,0.0,True
5,modified_huber,l1,0.975,0.024683,0.97529,1.0,0.0,True
0,modified_huber,l2,0.925,0.01638,0.953674,1.0,0.756569,False
4,modified_huber,l1,0.95,0.026136,0.968709,1.0,0.779568,False
2,modified_huber,l2,0.975,0.030551,0.973914,1.0,0.800428,False
3,modified_huber,l1,0.925,0.028547,0.966478,1.0,0.802924,False


In [17]:

# Force our best classifier to recapitulate its civilian predictions: refit it until at least one unknown
# patient ID gets a military probability of 0.0 (or lower). The retry condition is checked after each fit,
# so this cell always trains the classifier configured here, even on a fresh kernel or when an earlier
# cell has already left a qualifying is_military_df behind.
while True:

    # NOTE(review): per the scikit-learn docs l1_ratio only takes effect with penalty='elasticnet' - confirm
    classifier = SGDClassifier(loss='modified_huber', penalty='l2', l1_ratio=0.975)

    # Train on initial data
    classifier.fit(X_train_tfidf, train_labels_list)

    # Predict the probability that it's military for every patient ID and store it in the predict_proba column
    X_all = tt.transform(cv.transform(training_df.patient_id)).toarray()
    training_df['predict_proba'] = classifier.predict_proba(X_all)[:, 1]

    # Keep only the patient IDs whose military status is unknown
    mask_series = training_df.is_military.isnull()
    is_military_df = training_df[mask_series]

    # Stop once the lowest unknown prediction is no longer above zero
    if not (is_military_df.predict_proba.min() > 0.0):
        break

# Summarize the unknown group's predictions, then show its least and most military-looking patient IDs
display(is_military_df.predict_proba.describe())
display(is_military_df.sort_values('predict_proba', ascending=True).head())
display(is_military_df.sort_values('predict_proba', ascending=False).head())

count    87.000000
mean      0.206144
std       0.357928
min       0.000000
25%       0.000000
50%       0.000000
75%       0.394799
max       1.000000
Name: predict_proba, dtype: float64

Unnamed: 0,patient_id,is_military,predict_proba
66,Mike_10 Root,,0.0
44,Lily_2 Root,,0.0
45,Lily_4 Root,,0.0
46,Lily_5 Root,,0.0
47,Lily_7 Root,,0.0


Unnamed: 0,patient_id,is_military,predict_proba
131,Urban Level Core,,1.0
21,Desert Level Core,,1.0
63,Mike Root,,0.932175
0,Adept Shooter,,0.789597
81,NPC,,0.789597



## Maintenance

In [None]:

# Hand the fitted classifier, TF-IDF transformer, and count vectorizer to nu.store_objects under
# their is_military_* names
nu.store_objects(
    is_military_classifier=classifier,
    is_military_tt=tt,
    is_military_cv=cv,
)

In [None]:

# Print the name of every stored data frame that is not None and has a patient_id column
for df_name in df_names_list:
    data_frames_dict = nu.load_data_frames(verbose=False, **{df_name: ''})
    df = data_frames_dict[df_name]
    if df is None:
        continue
    if 'patient_id' in df.columns:
        print(df_name)