In [1]:

%pprint
import sys
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from notebook_utils import NotebookUtilities
from pandas import DataFrame
import os
import os.path as osp
import re
from sklearn.feature_extraction.text import TfidfVectorizer

nu = NotebookUtilities(
    data_folder_path=osp.abspath('../data'),
    saves_folder_path=osp.abspath('../saves')
)

In [11]:

# Get a list of notebook names
black_list = ['.ipynb_checkpoints', '$Recycle.Bin']
this_folder = osp.dirname(osp.abspath(osp.curdir))

# List of documents
documents = []

# List of words for each document
words_list = []

file_type = '.ipynb'
# import_regex = re.compile(r'"\s*(?:from\s+(\w+)(?:\.\w+)?\s+)?import\s+([^\s,.]+)(?:\.\w+)?((\s*,\s*\w+)*)?\\n",?')
import_regex = re.compile(r'"\s*(from [a-z._]+ )?import ([a-z._]+(, )?)+( as [a-z]+)?\\n",?')
for sub_directory, directories_list, files_list in os.walk(this_folder):
    if all(map(lambda x: x not in sub_directory, black_list)):
        for file_name in files_list:
            if file_name.endswith(file_type):
                notebook_name = file_name.replace(file_type, '')
                documents.append(notebook_name)
                file_path = osp.join(sub_directory, file_name)
                with open(file_path, 'r', encoding=nu.encoding_type) as f:
                    lines_list = f.readlines()
                    imports_list = []
                    for line in lines_list:
                        if import_regex.search(line): imports_list.extend(re.sub(r'\\n",?', '', line.split('import ')[-1]).split(' as ')[0].strip().split(', '))
                    words_list.append(imports_list)

In [10]:

[dn for dn in documents if 'Indeed' in dn]

['Load Jobs from list on Indeed', 'Indeed API', 'Indeed Application Status', 'Indeed.com Sequence Analysis', 'Indeed.com Web Spoofing', 'Indeed Scrapes ETL', 'Scrape Indeed.com View Job Pages', 'Indeed Header Classifier Scores', 'Get Requirements Lists from Indeed.com']

In [12]:

# Combine document names and words for each document
document_data = [' '.join(words_list[i]) for i in range(len(words_list))]

# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(document_data)

# Get feature names (words) and document names
feature_names = tfidf_vectorizer.get_feature_names_out()
document_names = [f'Document_{i}' for i in range(len(documents))]

# Create a data frame to display the TF-IDF matrix
tfidf_df = DataFrame(data=tfidf_matrix.toarray(), index=documents, columns=feature_names)

In [13]:

display(tfidf_df.sample(10).T.sample(10).T)

Unnamed: 0,request,datetime,tarfile,inspect,sklearn_crfsuite,atmodel,jupyter_config_path,winshell,webbrowser,scrapeghost
Update NavigableParentSequence Table from Examples,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Create the NavigableParent is Header Dictionary,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Installs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Build a Complete Part-of-Speech Dictionary,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Build Other Non-headers List,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Evaluate LDA on Labeled Data,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Build O-TS Dictionary from Examples,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Indeed Application Status,0.433038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Build the Child Strings List Dictionary from Examples,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Build NavigableParents SQL from Examples,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:

# Get distinctive words for each document
distinctive_words = {}
for i, document_name in enumerate(documents):
    
    # Get the TF-IDF scores for the current document
    tfidf_scores = tfidf_df.loc[document_name]
    
    # Get the indices of the top N TF-IDF scores
    top_indices = tfidf_scores.argsort()[-5:][::-1]
    
    # Get the corresponding words for the top N indices
    top_words = [feature_names[index] for index in top_indices]
    
    # Store distinctive words for the document
    distinctive_words[document_name] = top_words

In [17]:

from collections import defaultdict

d = {k: str(v) for k, v in distinctive_words.items()}
grouped_dict = defaultdict(list)
for k, v in d.items(): grouped_dict[v].append(k)

# Display distinctive words for each document group
print(
    'Convert these notebook names (and the libraries they used) into bullet points for a resume describing significant accomplishments'
    ' for my position as Machine Learning Engineer. Keep the libraries used as a parenthetical suffix on the bullet point:\n'
)
for distinctive_words, document_group in grouped_dict.items():
    print(f'{nu.conjunctify_nouns(document_group)}: ({nu.conjunctify_nouns(eval(distinctive_words))})')

Convert these notebook names (and the libraries they used) into bullet points for a resume describing significant accomplishments for my position as Machine Learning Engineer. Keep the libraries used as a parenthetical suffix on the bullet point:

Add file_name column to HeaderTagSequence.csv, Add Hashes of File Contents to FileNames, Indeed API, and Installs: (sys, zipfile, matplotlib, itertools, and json)
Build a Complete Part-of-Speech Dictionary, _Untitled_2, and Cypher Exploration: (pandas, zipfile, matplotlib, itertools, and json)
Build Corporate Scope Non-headers List, Build Educational Requirement Non-headers List, Build Interview Procedure Non-headers List, Build Job Duration Non-headers List, Build Job Title Non-headers List, Build Legal Notifications Non-headers List, Build Office Location Non-headers List, Build Other Non-headers List, Build Posting Date Non-headers List, Build Preferred Requirements Non-headers List, Build Qualification Non-headers List, and Build Suppleme