In [1]:

%pprint
import sys
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from notebook_utils import NotebookUtilities
from pandas import DataFrame
import os
import os.path as osp
import re
from sklearn.feature_extraction.text import TfidfVectorizer

nu = NotebookUtilities(
    data_folder_path=osp.abspath('../data'),
    saves_folder_path=osp.abspath('../saves')
)

In [3]:

black_list = ['.ipynb_checkpoints', '$Recycle.Bin', '.git']
github_repos_folder = osp.dirname(osp.abspath(osp.curdir))
file_type = '.ipynb'
import_statements_list = [
    'from apyori import apriori', 'from catsim.cat import generate_item_bank', 'from catsim.irt import icc', 'from collections import defaultdict', 'from datetime import datetime',
    'from datetime import timedelta', 'from mlxtend.frequent_patterns import apriori', 'from mlxtend.frequent_patterns import apriori, association_rules',
    'from mlxtend.frequent_patterns import association_rules', 'from nltk import pos_tag', 'from nltk import sent_tokenize, word_tokenize', 'from nltk import word_tokenize',
    'from nltk import word_tokenize, pos_tag', 'from nltk.chunk import ne_chunk', 'from nltk.corpus import movie_reviews', 'from nltk.corpus import stopwords',
    'from nltk.corpus import subjectivity', 'from nltk.corpus import wordnet as wn', 'from nltk.corpus import words', 'from numpy import arange', 'from os import path as osp',
    'from sklearn.inspection import permutation_importance', 'from sklearn.model_selection import train_test_split', 'from string import punctuation', 'from tqdm import tqdm',
    'import catsim.plot as catplot', 'import copy', 'import csv', 'import docx', 'import en_core_web_sm', 'import humanize', 'import matplotlib.cm as cm',
    'import matplotlib.colors as mcolors', 'import matplotlib.pyplot as plt', 'import nltk.classify.util', 'import nltk.help', 'import numpy as np', 'import operator', 'import os',
    'import os, sys', 'import os.path as osp', 'import pandas', 'import pandas as pd', 'import platform', 'import pystan', 'import random', 'import re', 'import seaborn as sns',
    'import shutil', 'import soundfile as sf', 'import spacy', 'import speech_recognition as sr', 'import string', 'import sys', 'import tempfile', 'from pandas import DataFrame'
]
comment_regex = re.compile(r'"\s*#')
uncleanables_list = ['Build Resume Work Experience from Notebook Names.ipynb', 'Attic.ipynb', 'OS Path Navigation.ipynb', 'Installs.ipynb']
def get_library_names_list(line, module_vs_nickname=-1):
    library_names_list = line.split('"')[1].split('\\n')[0].split('import ')[1].split(', ')
    library_names_list = [library_name.split(' as ')[module_vs_nickname] for library_name in library_names_list]
    
    return library_names_list

In [4]:

import pkgutil
import site

BUILTIN_LIBRARIES_LIST = []
# print("Built-in libraries:")
for _, name, _ in pkgutil.iter_modules():
    BUILTIN_LIBRARIES_LIST.append(name)
BUILTIN_LIBRARIES_LIST += ['sys']
modules_list = []
# print("\nThird-party modules:")
for path in site.getsitepackages():
    for _, name, _ in pkgutil.iter_modules([path]):
        modules_list.append(name)

In [5]:

for import_statement in import_statements_list:
    print(
        import_statement,
        [library_name.split('.')[0] for library_name in get_library_names_list(f'    "{import_statement}\\n",\n') if library_name.split('.')[0] not in BUILTIN_LIBRARIES_LIST]
    )

from apyori import apriori ['apriori']
from catsim.cat import generate_item_bank ['generate_item_bank']
from catsim.irt import icc ['icc']
from collections import defaultdict ['defaultdict']
from datetime import datetime []
from datetime import timedelta ['timedelta']
from mlxtend.frequent_patterns import apriori ['apriori']
from mlxtend.frequent_patterns import apriori, association_rules ['apriori', 'association_rules']
from mlxtend.frequent_patterns import association_rules ['association_rules']
from nltk import pos_tag ['pos_tag']
from nltk import sent_tokenize, word_tokenize ['sent_tokenize', 'word_tokenize']
from nltk import word_tokenize ['word_tokenize']
from nltk import word_tokenize, pos_tag ['word_tokenize', 'pos_tag']
from nltk.chunk import ne_chunk ['ne_chunk']
from nltk.corpus import movie_reviews ['movie_reviews']
from nltk.corpus import stopwords ['stopwords']
from nltk.corpus import subjectivity ['subjectivity']
from nltk.corpus import wordnet as wn ['wn']
from nltk.cor

In [6]:

# Highlight imports that are not being used
file_dict = {}
for sub_directory, directories_list, files_list in os.walk(github_repos_folder):
    if all(map(lambda x: x not in sub_directory, black_list)):
        for file_name in files_list:
            if file_name.endswith(file_type) and file_name not in uncleanables_list:
                file_path = osp.join(sub_directory, file_name)
                with open(file_path, 'r', encoding=nu.encoding_type) as f:
                    lines_list = f.readlines()
                    import_indices_list = []
                    
                    # Find the import line indices
                    for i, line in enumerate(lines_list):
                        if any(map(lambda x: x in line, import_statements_list)): import_indices_list.append(i)
                    
                    # Get all the unused imports
                    removes_set = set()
                    for i in import_indices_list:
                        line = lines_list[i]
                        if not comment_regex.search(line):
                            library_names_list = get_library_names_list(line, -1)
                            for library_name in library_names_list:
                                call_regex = re.compile(rf'\b{library_name}(\.|\()')
                                is_used = False
                                for line in lines_list[i:]:
                                    if call_regex.search(line): is_used = True
                                if not is_used: removes_set.add(library_name)
                    if removes_set: file_dict[file_path] = r'\b(' + '|'.join(sorted(removes_set)) + r')(\b|\.)'

In [7]:

# List imports that are not being used
for k, v in file_dict.items(): print(f'{k}: {v}')

In [8]:

# List of notebook names
documents = []

# List of words for each document
words_list = []

import_regex = re.compile(r'"\s*(from [a-z._]+ )?import ([a-z._]+(?:, )?)+( as [a-z]+)?\\n",?')
for sub_directory, directories_list, files_list in os.walk(github_repos_folder):
    if all(map(lambda x: x not in sub_directory, black_list)):
        for file_name in files_list:
            if file_name.endswith(file_type) and file_name not in uncleanables_list:
                notebook_name = file_name.replace(file_type, '')
                # if (notebook_name == 'Untitled'): raise
                documents.append(notebook_name)
                file_path = osp.join(sub_directory, file_name)
                with open(file_path, 'r', encoding=nu.encoding_type) as f:
                    lines_list = f.readlines()
                    imports_list = []
                    for line in lines_list:
                        if import_regex.search(line):
                            # display(line)
                            library_names_list = get_library_names_list(line, 0)
                            for library_name in library_names_list:
                                if library_name.split('.')[0] not in BUILTIN_LIBRARIES_LIST: imports_list.append(library_name)
                    words_list.append(imports_list)

In [9]:

# Combine document names and words for each document
document_data = [' '.join(words_list[i]) for i in range(len(words_list))]

# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(document_data)

# Get feature names (words) and document names
feature_names = tfidf_vectorizer.get_feature_names_out()
document_names = [f'Document_{i}' for i in range(len(documents))]

# Create a data frame to display the TF-IDF matrix
tfidf_df = DataFrame(data=tfidf_matrix.toarray(), index=documents, columns=feature_names)

In [10]:

display(tfidf_df.sample(10).T.sample(10).T)

Unnamed: 0,path,get_routine_scores,catsim,words,get_first_positions,sent_tokenize,icc,pos_tag,plot_transition_matrix,get_element_counts
Tool Applied Exploration,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fix Elapsed Time Simultaneity,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Analyze START Triage vs SALT Triage,0.833476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Build the OSU dataset of FRVRS Logs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Develop the Patient Accuracy Rate Metric,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Develop the Time to First Treatment Metric,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Analyze Deidentified Simulation Voice Captures,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Analyze Gaze and Intent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Develop the Treatment Placement Error Metric,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Develop the Triage Accuracy Metric,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:

# Function to get distinctive words for each document
def get_distinctive_words(tfidf_matrix, document_names, top_n=5):
    distinctive_words = {}

    for i, document_name in enumerate(document_names):
        # Get the TF-IDF scores for the current document
        tfidf_scores = tfidf_matrix.loc[document_name]

        # Get the indices of the top N TF-IDF scores
        top_indices = tfidf_scores.argsort()[-top_n:][::-1]

        # Get the corresponding words for the top N indices
        top_words = [feature_names[index] for index in top_indices]

        # Store distinctive words for the document
        distinctive_words[document_name] = top_words

    return distinctive_words

# Get distinctive words for each document
distinctive_words_per_document = get_distinctive_words(tfidf_df, documents)

In [12]:

from collections import defaultdict

d = {k: str(v) for k, v in distinctive_words_per_document.items()}
grouped_dict = defaultdict(list)
for k, v in d.items(): grouped_dict[v].append(k)

# Display distinctive words for each document group
print(
    'Convert these notebook names (and the libraries they used) into bullet points for a resume describing significant accomplishments'
    ' for my position as Machine Learning Engineer. Keep the libraries used as a parenthetical suffix on the bullet point:\n'
)
for distinctive_words, document_group in grouped_dict.items():
    print(f'{nu.conjunctify_nouns(document_group)}: ({nu.conjunctify_nouns(eval(distinctive_words))})')

Convert these notebook names (and the libraries they used) into bullet points for a resume describing significant accomplishments for my position as Machine Learning Engineer. Keep the libraries used as a parenthetical suffix on the bullet point:

Add a Responder Type Column to the OSU dataset of FRVRS Logs, Analyze Gaze and Intent, Analyze Issue with Logging Multiple TOOL_APPLIEDs, Analyze Preliminary Research Questions, Analyze TOOL_HOVERing as Indicative of Next Patient Choice, Find Negative Metrics in Jeremy's DCEMS Data, Orientation-Normal Sequence Analysis, Tool Applied Exploration, Fix Elapsed Time Simultaneity, Identify any Anomalous Files, Rename Files, Replace the tool applied sender missing patient ID, Reserialize DataFrame pickles, Develop the Correct Count Triage Accuracy Metric, Develop the Number of Patients Treated Metric, Develop the Number of Pulses Taken Metric, Develop the Number of Voice Captures per Session Metric, Develop the Patient Accuracy Rate Metric, Develop