<a href="https://colab.research.google.com/github/ekrombouts/GenCareAI/blob/main/drafts/300_ClassicNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import os
# Detect where the notebook is running (Google Colab or a local machine)
def check_environment():
    """Report the runtime environment of the notebook.

    Returns:
        "Google Colab" when the ``google.colab`` package can be imported,
        otherwise "Local Environment".
    """
    try:
        import google.colab  # noqa: F401  (imported only to probe for Colab)
    except ImportError:
        return "Local Environment"
    return "Google Colab"

In [31]:
# Configure data location and Hugging Face token for the detected environment.
# Both branches define the same two globals: DATA_DIR and HF_TOKEN.
env = check_environment()

if env == "Google Colab":
    print("Running in Google Colab")
    # Disabled install of the `datasets` package, kept for reference:
    # !pip install -q datasets
    # IPython shell escape: download the Dutch spaCy model on every run of this cell
    !python -m spacy download nl_core_news_md
    from google.colab import drive, userdata
    # Mount Google Drive so the project data folder below is reachable
    drive.mount('/content/drive')
    DATA_DIR = '/content/drive/My Drive/Colab Notebooks/GenCareAI/data'
    # Read the token from Colab's userdata store under the key 'HF_TOKEN'
    HF_TOKEN = userdata.get('HF_TOKEN')
else:
    print("Running in Local Environment")
    DATA_DIR = '../data'
    # Model download is disabled locally; nl_core_news_md must already be installed
    # !python -m spacy download nl_core_news_md
    from dotenv import load_dotenv
    # Load variables from a .env file (if present) before reading the token
    load_dotenv()
    # os.getenv returns None when HF_TOKEN is not set
    HF_TOKEN = os.getenv('HF_TOKEN')

Running in Google Colab
Collecting nl-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/nl_core_news_md-3.7.0/nl_core_news_md-3.7.0-py3-none-any.whl (42.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.7/42.7 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('nl_core_news_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import pandas as pd
from datasets import load_dataset
import spacy
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed


In [33]:
# Fetch the notes dataset and keep its 'train' split as a pandas DataFrame
df_notes = pd.DataFrame(
    load_dataset('ekrombouts/dutch_nursing_home_notes', token=HF_TOKEN)['train']
)


In [34]:
# Load the Dutch spaCy pipeline used for PoS tagging and lemmatisation
nlp = spacy.load("nl_core_news_md")

In [35]:
# Text preprocessing helper
def preprocess_text(text):
    """Reduce a note to the lemmas of its content words.

    Null-like input (as judged by ``pd.isnull``) yields an empty string.
    Any other input is lowercased and run through the module-level spaCy
    pipeline ``nlp``; only tokens tagged NOUN, VERB, ADJ or ADV are kept.

    Args:
        text: The raw note, or a null-like value.

    Returns:
        The lemmas of the kept tokens, joined by single spaces.
    """
    # Guard against NaN / None values
    if pd.isnull(text):
        return ''

    content_tags = ('NOUN', 'VERB', 'ADJ', 'ADV')
    kept_lemmas = []
    for tok in nlp(text.lower()):
        if tok.pos_ in content_tags:
            kept_lemmas.append(tok.lemma_)

    return ' '.join(kept_lemmas)

In [37]:
# Sanity check: show the first note followed by its preprocessed form
aap = df_notes.loc[0, 'note']
print(aap)
print(preprocess_text(text=aap))

Mevrouw heeft vanmorgen hulp gekregen bij het aankleden en klaarmaken voor de dag.
mevrouw vanmorgen hulp krijgen aankleden klaarmaken dag


In [36]:
# Parallelise the text preprocessing, with a progress bar
def parallel_preprocess_texts(texts, max_workers=4, chunksize=256):
    """Run ``preprocess_text`` over ``texts`` in a pool of worker processes.

    The texts are handed to the workers in chunks instead of as one future
    per text, which avoids per-item scheduling and pickling overhead on large
    collections. ``executor.map`` yields results in input order.

    Args:
        texts: Sized collection of raw texts (``len()`` is used for the
            progress-bar total).
        max_workers: Number of worker processes.
        chunksize: Number of texts sent to a worker per task.

    Returns:
        A list with the preprocessed text for each input, in input order.

    Raises:
        Any exception raised by ``preprocess_text`` in a worker is re-raised
        here when its result is retrieved.
    """
    # Nothing to do: skip starting a process pool altogether
    if len(texts) == 0:
        return []

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        processed = executor.map(preprocess_text, texts, chunksize=chunksize)
        # Consume inside the `with` block so the pool stays alive while results arrive
        return list(tqdm(processed, total=len(texts)))


In [38]:
# Preprocess every note in parallel and store the result in a new column.
# Single-process alternative, kept for reference:
# tqdm.pandas()
# df_notes['processed_note'] = df_notes['note'].progress_apply(preprocess_text)
df_notes['processed_note'] = parallel_preprocess_texts(texts=df_notes['note'].to_list())

100%|██████████| 43283/43283 [07:36<00:00, 94.89it/s] 


In [39]:
# Inspect the first rows of the raw and the processed notes side by side
print(df_notes.loc[:, ['note', 'processed_note']].head(5))

                                                note  \
0  Mevrouw heeft vanmorgen hulp gekregen bij het ...   
1  Meneer is gedoucht en zijn gebitsprothese is s...   
2  Tijdens het ochtendritueel is mw. geholpen met...   
3  Dhr. is vanochtend gewassen en geholpen met aa...   
4  Na het ongelukje is mw. verschoond en is haar ...   

                                      processed_note  
0  mevrouw vanmorgen hulp krijgen aankleden klaar...  
1         meneer douchten gebitsprothese schoonmaken  
2  ochtendritueel helpen tand poetsen klaargemaak...  
3                  vanochtend wassen helpen aanklead  
4                      ongelukje verschoond gedoucht  


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# scikit-learn only ships a built-in stop-word list for English, so
# stop_words='dutch' is rejected when the vectorizer is fitted.
# Use the Dutch stop words of the loaded spaCy pipeline instead;
# CountVectorizer expects a list (not a set), sorted here for a stable order.
dutch_stop_words = sorted(nlp.Defaults.stop_words)

# Bag-of-words vectorizer: ignore terms that occur in more than 95% of the
# notes or in fewer than 2 notes
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=dutch_stop_words)

# Build the document-term matrix from the processed notes
X = vectorizer.fit_transform(df_notes['processed_note'])

# LDA topic model with 10 topics; fixed random_state for reproducible topics
lda = LatentDirichletAllocation(n_components=10, random_state=42)

# Fit the topic model on the document-term matrix
lda.fit(X)

# Helper that prints the top words of every topic
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of each topic in ``model``.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of
            per-feature weights per topic).
        feature_names: Sequence mapping a feature index to its word.
        n_top_words: Number of words to show per topic.

    Prints a "Topic #k:" header (1-based) and a space-separated word line
    for every topic, followed by one blank line after the last topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # Indices sorted by descending weight, truncated to the requested count
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[idx] for idx in ranked]
        print(f"Topic #{topic_number}:")
        print(" ".join(words))
    print()

# Show the ten highest-weighted words of every fitted topic
n_top_words = 10
tf_feature_names = vectorizer.get_feature_names_out()
print_top_words(model=lda, feature_names=tf_feature_names, n_top_words=n_top_words)