# **STEP 1. Run the following cell to import the necessary libraries the first time you launch the runtime**
It may take a few seconds.

In [None]:
#@title
!pip install --upgrade numpy
import subprocess

def install(package):
    subprocess.check_call(["pip", "install", package])

# Check if pyLDAvis is installed
try:
    import pyLDAvis
except ImportError:
    # If pyLDAvis is not installed, install it
    install("pyLDAvis")

# Check if gensim is installed
try:
    import gensim
except ImportError:
    # If gensim is not installed, install it
    install("gensim")

# Check if spacy is installed
try:
    import spacy
except ImportError:
    # If spacy is not installed, install it
    install("spacy")

# Check if matplotlib is installed
try:
    import matplotlib
except ImportError:
    # If matplotlib is not installed, install it
    install("matplotlib")

# Check if seaborn is installed
try:
    import seaborn
except ImportError:
    # If seaborn is not installed, install it
    install("seaborn")

# Check if the en_core_web_md model for spacy is installed
try:
    import en_core_web_md
except ImportError:
    # If the model is not installed, download it using spacy
    !python -m spacy download en_core_web_md -qq

# Check if gensim is installed
try:
    import gensim
except ImportError:
    install("gensim")

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
from gensim.models import LdaModel
#from gensim.summarization import summarize

import warnings
import pandas as pd
import matplotlib.pyplot as plt
import io
import seaborn as sns
sns.set()
import spacy
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()# Visualise inside a notebook
import en_core_web_md
import random
import numpy as np
import ipywidgets as widgets
from IPython.display import display

# **STEP 2. Run the following cell to upload the file and select model parameters**

Note: the Excel file must contain a header row, which is skipped by the code. Each course/job description must be on its own row, in the first column. There is no limit on the number of words per row (at least 100 words recommended). At least 100 course/job descriptions are recommended.

In [2]:
#@title
# FUNCTION to preprocess data
def preprocess_data(df):
    """Tokenise the first column of `df` and store the result in a 'tokens' column.

    Every cell of the first column is run through spaCy; tokens are kept as
    lower-cased lemmas unless their part-of-speech tag is in the removal list,
    they are stop words, or they are not purely alphabetic.

    Args:
        df: DataFrame whose first column holds one text description per row.

    Returns:
        The same DataFrame object, with a new/overwritten 'tokens' column
        containing one list of tokens per row (the input is modified in place).
    """
    # Part-of-speech tags to drop from the text
    removal = ['ADJ','ADV', 'AUX', 'INTJ', 'PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']

    # Extra domain-specific stop words (lowercase); singular and plural forms are both listed.
    # They are added to the English defaults before the pipeline is loaded.
    cls = spacy.util.get_lang_class('en')
    cls.Defaults.stop_words |= {'exercise', 'exercises', 'hour', 'hours', 'analysis', 'system', 'systems', 'student', 'students', 'expected', 'knowledges', 'method', 'methodology', 'methodologies', 'methods', 'problem', 'problems', 'model', 'models', 'modelling', 'project', 'projects', 'tutorial', 'tutorials', 'course', 'courses'}

    # Load the model that STEP 1 installs and imports (en_core_web_md)
    nlp = spacy.load("en_core_web_md")

    # Empty Excel cells arrive as NaN and numeric cells as numbers; spaCy only accepts text
    texts = df.iloc[:, 0].fillna("").astype(str)

    tokens = []
    for courseDescription in nlp.pipe(texts):
        proj_tok = [
            token.lemma_.lower()
            for token in courseDescription
            if token.pos_ not in removal and not token.is_stop and token.is_alpha
        ]
        tokens.append(proj_tok)

    # One token list per input row, stored alongside the original text
    df['tokens'] = tokens
    return df

# FUNCTION to generate the LDA visualization
def generate_lda_model(df, num_topics, passes, no_below, no_above, random_seed):
    """Train an LDA topic model on `df` and build its pyLDAvis visualization.

    Args:
        df: DataFrame handed to `preprocess_data` for tokenisation.
        num_topics: number of topics passed to the LDA model.
        passes: number of training passes passed to the LDA model.
        no_below: `no_below` threshold forwarded to `Dictionary.filter_extremes`.
        no_above: `no_above` threshold forwarded to `Dictionary.filter_extremes`.
        random_seed: seed used for `random`, `numpy.random` and the model's `random_state`.

    Returns:
        A tuple `(lda_model, lda_display)`: the trained model and the prepared
        pyLDAvis data for it.
    """
    # Tokenise the descriptions; the returned frame carries a 'tokens' column
    prepared = preprocess_data(df)
    documents = prepared['tokens']

    # Build the vocabulary and prune it with the requested thresholds
    vocabulary = Dictionary(documents)
    vocabulary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)

    # Bag-of-words representation of every document
    bow_corpus = []
    for document in documents:
        bow_corpus.append(vocabulary.doc2bow(document))

    # Seed both global random number generators before training
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(random_seed)

    # Fit the topic model
    topic_model = LdaMulticore(
        corpus=bow_corpus,
        id2word=vocabulary,
        random_state=random_seed,
        num_topics=num_topics,
        passes=passes,
    )

    # Prepare the interactive visualization for the fitted model
    visualization = pyLDAvis.gensim_models.prepare(topic_model, bow_corpus, vocabulary)
    return topic_model, visualization


# Input controls: the Excel file picker, one slider per model parameter, and the submit button
file_upload = widgets.FileUpload(
    description='Upload Excel file containing one course/job description per row',
    accept='.xlsx',
)
num_topics = widgets.IntSlider(description='N Topics', value=6, min=2, max=20, step=1)
passes = widgets.IntSlider(description='N Passes', value=10, min=1, max=2000, step=1)
random_seed = widgets.IntSlider(description='Randomseed', value=5, min=1, max=10, step=1)
no_below = widgets.IntSlider(description='Min count word', value=3, min=1, max=10, step=1)
no_above = widgets.FloatSlider(description='Max count %', value=0.9, min=0.1, max=1.0, step=0.1)
submit_button = widgets.Button(description='Save file and params')

# Render all controls, top to bottom, in the order they were defined
display(file_upload, num_topics, passes, random_seed, no_below, no_above, submit_button)

def on_submit_button_clicked(b):
    """Check that the uploaded Excel file can be read and report the outcome.

    The parsed DataFrame is not stored: it is only used to confirm that the
    upload is readable.

    Args:
        b: the button instance that triggered the callback (unused).
    """
    uploaded = file_upload.value
    if not uploaded:
        # Nothing to read yet; tell the user instead of failing with StopIteration
        print("No file uploaded yet: please upload an .xlsx file first.")
        return

    # ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple of file records
    first_file = next(iter(uploaded.values())) if isinstance(uploaded, dict) else uploaded[0]

    # Read the data from the uploaded file into a pandas DataFrame
    df = pd.read_excel(io.BytesIO(first_file['content']))
    print(f"File read successfully: {len(df)} rows found.")

# Attach the function to the button click event
submit_button.on_click(on_submit_button_clicked)


  and should_run_async(code)


FileUpload(value={}, accept='.xlsx', description='Upload Excel file containing one course/job description per …

IntSlider(value=6, description='N Topics', max=20, min=2)

IntSlider(value=10, description='N Passes', max=2000, min=1)

IntSlider(value=5, description='Randomseed', max=10, min=1)

IntSlider(value=3, description='Min count word', max=10, min=1)

FloatSlider(value=0.9, description='Max count %', max=1.0, min=0.1)

Button(description='Save file and params', style=ButtonStyle())

# **STEP 3. Run the following code to display the Archetype in the form of topics extracted as clusters**

In [None]:
#@title
# Silence library warnings so that only the visualization is shown
warnings.filterwarnings("ignore")

uploaded = file_upload.value
if not uploaded:
    raise ValueError("No file uploaded: run STEP 2 and upload an .xlsx file first.")

# ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple of file records
content = next(iter(uploaded.values())) if isinstance(uploaded, dict) else uploaded[0]
df = pd.read_excel(io.BytesIO(content['content']))

# NOTE: `lda_model` holds the (model, visualization) tuple returned by generate_lda_model
lda_model = generate_lda_model(df, num_topics.value, passes.value, no_below.value, no_above.value, random_seed.value)

pyLDAvis.display(lda_model[1])

# **STEP 4. Download the Archetype in the form of 100 technical keywords (content) and Bloom Verbs (verbs) of each identified Topic.**
*Note: Content can be retrieved, e.g., from ChatGPT by querying "provide me a content of max 20 words for the topic which include these technical keywords [...]" and listing the keywords.*

In [None]:
#@title
#create functions that I need
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from google.colab import files

def is_not_bloomverb(word, bloom_verbs):
    """Return True when the verb lemma of `word` is absent from `bloom_verbs`.

    Args:
        word: the word to check.
        bloom_verbs: collection of verbs to test membership against.

    Returns:
        False if the WordNet verb lemma of `word` is in `bloom_verbs`, True otherwise.
    """
    # Reduce the word to its verb lemma before the membership test
    verb_lemma = WordNetLemmatizer().lemmatize(word, 'v')
    if verb_lemma in bloom_verbs:
        return False
    return True

def remove_verb_forms(word_list):
    """Filter WordNet verbs out of every sub-list of `word_list`.

    A word is dropped when it is a WordNet verb lemma name and none of its
    verb lemmas is free of derivationally related forms.

    Args:
        word_list: list of lists of words.

    Returns:
        A new list of lists with the same outer length, each sub-list keeping
        its original order minus the removed words.
    """
    # Every lemma name that appears in any verb synset
    all_verbs = {
        name
        for synset in wordnet.all_synsets('v')
        for name in synset.lemma_names()
    }

    # Verb lemma names having at least one verb lemma without derivationally related forms
    kept_verbs = set()
    for candidate in all_verbs:
        for lemma in wordnet.lemmas(candidate, 'v'):
            if lemma.synset().pos() == 'v' and not lemma.derivationally_related_forms():
                kept_verbs.add(lemma.name())

    blocked = all_verbs - kept_verbs

    cleaned = []
    for sub_list in word_list:
        cleaned.append([entry for entry in sub_list if entry not in blocked])
    return cleaned

# Grab up to the first 100 keywords of every topic
TOP_N_KEYWORDS = 100
KW = []
verb = []
bloomverbs = ["understand", "define", "identify", "describe", "label", "list", "name", "state", "match", "recognize", "select", "examine", "locate", "memorize", "quote", "recall", "reproduce", "tabulate", "tell", "copy", "discover", "duplicate", "enumerate", "listen", "observe", "omit", "read", "recite", "record", "repeat", "retell", "visualize", "explain", "interpret", "paraphrase", "summarize", "classify", "compare", "differentiate", "discuss", "distinguish", "extend", "predict", "associate", "contrast", "convert", "demonstrate", "estimate", "express", "infer", "relate", "restate", "translate", "ask", "cite", "discover", "generalize", "group", "illustrate", "judge", "observe", "order", "report", "represent", "research", "review", "rewrite", "show", "trace", "solve", "apply", "modify", "use", "calculate", "change", "choose", "discover", "relate", "sketch", "complete", "construct", "interpret", "manipulate", "paint", "prepare", "teach", "act", "compute", "list", "practice", "simulate", "write", "analyze", "classify", "contrast", "infer", "select", "categorize", "connect", "differentiate", "estimate", "evaluate", "focus", "organize", "plan", "question", "test", "reframe", "criticize", "appraise", "support", "decide", "recommend", "assess", "convince", "defend", "grade", "predict", "select", "argue", "conclude", "critique", "debate", "justify", "persuade", "weigh", "design", "compose", "plan", "combine", "formulate", "invent", "substitute", "compile", "develop", "integrate", "modify", "prepare", "rearrange", "adapt", "arrange", "collaborate", "facilitate", "make", "manage", "propose", "solve", "support", "test", "validate"]

# `lda_model` is the (model, visualization) tuple produced in STEP 3
trained_lda = lda_model[0]

# Use the topic count of the trained model: the slider may have moved since training
for nTopic in range(trained_lda.num_topics):
    # One call per topic; the vocabulary may hold fewer than TOP_N_KEYWORDS terms,
    # so iterate over what is actually returned instead of indexing a fixed range
    top_terms = trained_lda.show_topic(nTopic, topn=TOP_N_KEYWORDS)
    # Bloom verbs are wrapped in asterisks, every other keyword is kept as is
    KW.append([
        word if is_not_bloomverb(word, bloomverbs) else '*' + word + '*'
        for word, _weight in top_terms
    ])

KW_no_verb = remove_verb_forms(KW)
KW_no_verb_df = pd.DataFrame({'Technical content and bloom verb in bolded font': KW_no_verb})

# Save the table (one row per topic) and download it
KW_no_verb_df.to_excel(r'ArchetypeFinal.xlsx', index=False)
files.download('ArchetypeFinal.xlsx')