In [1]:

# IPython's %pprint magic; the recorded output of this cell shows that pretty
# printing was OFF after it ran
%pprint
import sys
# Add the relative '../py' folder to the module search path at position 1
# (presumably where notebook_utils lives -- TODO confirm)
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from notebook_utils import NotebookUtilities
from pandas import DataFrame
import os
import os.path as osp
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Resolve the data and saves folders to absolute paths (relative to the
# current working directory) before handing them to the utilities object
data_folder_path = osp.abspath('../data')
saves_folder_path = osp.abspath('../saves')
nu = NotebookUtilities(data_folder_path=data_folder_path, saves_folder_path=saves_folder_path)

In [3]:

# Get a list of notebook names, skipping any folder whose path contains one
# of these substrings
black_list = ['.ipynb_checkpoints', '$Recycle.Bin']

# The search starts at the parent of the current working directory
this_folder = osp.dirname(osp.abspath(osp.curdir))

# List of documents (one notebook name per .ipynb file found)
documents = []

# List of words for each document (parallel to documents: the names imported by that notebook)
words_list = []

# Matches a raw notebook JSON line holding an import statement: an opening double quote,
# optional whitespace, an optional "from x " prefix, "import" followed by lowercase names,
# an optional " as alias", then a literal backslash-n, a closing quote and an optional comma
import_regex = re.compile(r'"\s*(from [a-z._]+ )?import ([a-z._]+(, )?)+( as [a-z]+)?\\n",?')

# Matches the literal backslash-n, closing quote and optional comma that end such a line
line_ending_regex = re.compile(r'\\n",?')

for sub_directory, directories_list, files_list in os.walk(this_folder):

    # Skip folders whose path mentions a black-listed name
    if any(name in sub_directory for name in black_list):
        continue

    for file_name in files_list:
        if not file_name.endswith('.ipynb'):
            continue

        # Strip only the final extension so that dots inside the notebook name
        # are kept (split('.')[0] would cut the name at its first dot)
        notebook_name = osp.splitext(file_name)[0]
        documents.append(notebook_name)

        file_path = osp.join(sub_directory, file_name)
        imports_list = []
        with open(file_path, 'r', encoding=nu.encoding_type) as f:
            for line in f:
                if import_regex.search(line):

                    # Keep what follows the last "import ", minus the JSON line ending
                    imported_names = line_ending_regex.sub('', line.split('import ')[-1])

                    # Drop any " as alias" suffix and the surrounding whitespace
                    imported_names = imported_names.split(' as ')[0].strip()

                    # A single statement may import several comma-separated names
                    imports_list.extend(imported_names.split(', '))
        words_list.append(imports_list)

In [4]:

# Turn each notebook's list of imported names into one space-separated string
document_data = [' '.join(import_names) for import_names in words_list]

# Build the TF-IDF vectorizer, then learn the vocabulary and weight every string
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(document_data)

# Column labels (the learned words) and generic positional labels for the documents
feature_names = tfidf_vectorizer.get_feature_names_out()
document_names = [f'Document_{i}' for i, _ in enumerate(documents)]

# Densify the matrix into a data frame: one row per notebook, one column per word
tfidf_df = DataFrame(tfidf_matrix.toarray(), index=documents, columns=feature_names)

In [5]:

# Preview a random block of at most 10 notebooks by 10 words. The sizes are
# capped because sample() raises a ValueError when asked for more rows or
# columns than the frame has
preview_row_count = min(10, tfidf_df.shape[0])
preview_column_count = min(10, tfidf_df.shape[1])
display(tfidf_df.sample(preview_row_count).sample(preview_column_count, axis='columns'))

Unnamed: 0,train_test_split,sent_tokenize,generate_item_bank,nltk,get_unique_ngrams,util,copy,get_routine_scores,plot_element_counts,get_ndistinct_subsequences
Analyze Patient Engaged Events,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Develop the Treatment Placement Error Metric,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Calculate Results for Abstract Submission,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Visualize Location Points,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Visualize Elapsed Time Spent on Patient,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Orientation-Normal Sequence Analysis,0.0,0.0,0.0,0.0,0.0,0.0,0.593187,0.0,0.0,0.0
Find Negative Metrics in Jeremy's DCEMS Data,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Visualize every Player Gaze,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Replace UUIDs with Cleaned and Revised File Info,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Analyze TOOL_HOVERing as Indicative of Next Patient Choice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:

# Function to get distinctive words for each document
def get_distinctive_words(tfidf_matrix, document_names, top_n=5):
    """
    Pick the highest-scoring words of each document from a TF-IDF data frame.

    Parameters:
        tfidf_matrix: data frame of scores with one row per document (labelled
            by document name) and one column per word.
        document_names: iterable of row labels to look up in tfidf_matrix.
        top_n: maximum number of words to return per document (default 5).
            Zero or a negative number yields an empty list for every document.

    Returns:
        dict mapping each document name to a list of at most top_n words,
        ordered from the highest score to the lowest.

    Raises:
        KeyError: if a document name is not a row label of tfidf_matrix.

    Note:
        Words are chosen by rank only, so a document with fewer than top_n
        non-zero scores still receives top_n words, some with a score of zero.
    """

    # Nothing to rank; a plain [-0:] slice would return every word instead
    if top_n <= 0:
        return {document_name: [] for document_name in document_names}

    # Take the words from the frame itself rather than from a notebook-level
    # variable, so the function works with whatever frame it is handed
    words = tfidf_matrix.columns

    distinctive_words = {}
    for document_name in document_names:

        # Get the TF-IDF scores for the current document
        tfidf_scores = tfidf_matrix.loc[document_name]

        # A row label that occurs more than once makes .loc return a data
        # frame; use its first row, as the result holds one entry per name
        if tfidf_scores.ndim > 1:
            tfidf_scores = tfidf_scores.iloc[0]

        # Get the column positions of the top N scores, best first
        top_indices = tfidf_scores.to_numpy().argsort()[::-1][:top_n]

        # Store the corresponding words for the document
        distinctive_words[document_name] = [words[index] for index in top_indices]

    return distinctive_words

# Rank the words of every notebook found, keeping the default number of words per notebook
distinctive_words_per_document = get_distinctive_words(
    tfidf_matrix=tfidf_df,
    document_names=documents,
)

In [7]:

from collections import defaultdict

# Group the notebooks that share exactly the same ranked list of distinctive
# words. Each list is converted to a tuple so it can be used as a dictionary
# key directly, with no round trip through str() and eval()
grouped_dict = defaultdict(list)
for document_name, top_words in distinctive_words_per_document.items():
    grouped_dict[tuple(top_words)].append(document_name)

# Display distinctive words for each document group
print(
    'Convert these notebook names (and the libraries they used) into bullet points for a resume describing significant accomplishments'
    ' for my position as Machine Learning Engineer. Keep the libraries used as a parenthetical suffix on the bullet point:\n'
)
for distinctive_words, document_group in grouped_dict.items():
    print(f'{nu.conjunctify_nouns(document_group)}: ({nu.conjunctify_nouns(list(distinctive_words))})')

Convert these notebook names (and the libraries they used) into bullet points for a resume describing significant accomplishments for my position as Machine Learning Engineer. Keep the libraries used as a parenthetical suffix on the bullet point:

Add a Responder Type Column to the OSU dataset of FRVRS Logs: (re, path, os, numpy, and timedelta)
Build a Model to Predict Tag Applied Type: (train_test_split, tqdm, permutation_importance, numpy, and humanize)
Build Patient Engagement timeline CSV and Analyze Patient Engaged Events: (apriori, association_rules, re, numpy, and timedelta)
Build the OSU dataset of FRVRS Logs: (en_core_web_sm, spacy, re, path, and os)
Analyze Deidentified Simulation Voice Captures: (timedelta, re, numpy, humanize, and pandas)
Analyze Elapsed Time Estimation for Operations: (apriori, association_rules, numpy, humanize, and timedelta)
Analyze Gaze and Intent, Identify any Anomalous Files, Replace the tool applied sender missing patient ID, Develop the Number of P