# Agitation prediction: Feature extraction

**Author:** Eva Rombouts  
**Date:** 2024-07-19  
**Version:** 1.0

### Description
This notebook performs text preprocessing, topic modeling, and word embedding training on a dataset of care reports. It splits the data into training, validation, and test sets and writes three output files to the data folder: `agitation_train.csv`, `agitation_valid.csv`, and `agitation_test.csv`, each containing the extracted features. These features are prepared for subsequent modeling to predict agitation based on the text data.

In [None]:
# Environment setup
import os

def check_environment():
    """Tell whether the notebook runs in Google Colab or on a local machine.

    Returns:
        "Google Colab" when the ``google.colab`` package can be imported,
        otherwise "Local Environment".
    """
    try:
        # The package only needs to be importable; nothing from it is used here.
        import google.colab
    except ImportError:
        return "Local Environment"
    return "Google Colab"

# Detect the runtime once and configure it accordingly.
env = check_environment()
if env == "Google Colab":
    print("Running in Google Colab")
    # IPython shell escapes ("!"): download the Dutch spaCy model and install pyLDAvis.
    !python -m spacy download nl_core_news_sm -q
    !pip install -q pyLDAvis
    # NOTE(review): userdata is imported but not used in the visible cells.
    from google.colab import drive, userdata
    # Mount Google Drive and make the scripts folder the working directory, so the
    # relative '../data/...' paths used further down resolve.
    drive.mount('/content/drive')
    os.chdir('/content/drive/My Drive/Colab Notebooks/GenCareAI/scripts')
else:
    print("Running in Local Environment")
    # Unable to get it working locally due to persistent ImportError with SciPy's 'triu' function.
    # Therefore, switching to Colab where the environment is pre-configured and works seamlessly.
    # !pip install -q gensim spacy wordcloud pyLDAvis
    # !python -m spacy download nl_core_news_sm -q

In [None]:
# Libraries for data manipulation 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
tqdm.pandas()

# Visualisation libraries
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# NLP & Machine Learning libraries
import spacy
import gensim
from gensim import corpora
from gensim.models import Word2Vec

# Parameters
# Number of topics for the LDA model
num_topics = 10
# Row count for the optional random sub-sample (only referenced in a commented-out line of the data-loading cell)
sample_size = 1000
# Random seed shared by the train/validation/test splits and the LDA model
seed = 6
no_below = 2   # Lower frequency limit for Dictionary.filter_extremes: tokens found in fewer than this many documents are dropped
no_above = 0.5 # Upper frequency limit for Dictionary.filter_extremes: tokens found in more than this fraction of the documents are dropped

# Dutch SpaCy model, loaded with the parser, tagger and ner components switched off
# NOTE(review): preprocess_text relies on token.pos_ and token.lemma_ — confirm that, for the
# installed spaCy/model version, neither depends on the disabled 'tagger' component.
nlp = spacy.load('nl_core_news_sm', disable=['parser', 'tagger', 'ner'])

In [None]:
# Load the dataset; the path is relative to the current working directory
df = pd.read_csv('../data/agitation.csv')
# Optional: uncomment to continue with a random sub-sample of sample_size rows
# df = df.sample(sample_size, random_state=seed)
# Print a summary of the loaded DataFrame as a quick sanity check
df.info()

## Text preprocessing

In [None]:
# Function for text preprocessing: lowercasing, tokenization, lemmatization, stop word removal, and word selection based on part-of-speech
def preprocess_text(text, nlp_model):
    """Reduce a raw report to a space-separated string of content-word lemmas.

    The text is run through *nlp_model*. A token is kept when it is purely
    alphabetic, is not a stop word and has one of the part-of-speech tags
    VERB, NOUN, ADJ, ADV or INTJ. Every kept token is replaced by its
    lower-cased lemma.

    Args:
        text: Raw report text. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as an empty report.
        nlp_model: Callable that turns a string into an iterable of tokens
            exposing ``lemma_``, ``is_alpha``, ``is_stop`` and ``pos_``,
            e.g. a loaded spaCy pipeline.

    Returns:
        The cleaned text, or an empty string when no token is kept.
    """
    # Missing values would make the NLP model raise on non-text input.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    content_pos = {'VERB', 'NOUN', 'ADJ', 'ADV', 'INTJ'}
    cleaned_tokens = [
        token.lemma_.lower()
        for token in nlp_model(text)
        if token.is_alpha and not token.is_stop and token.pos_ in content_pos
    ]
    return " ".join(cleaned_tokens)

# Apply preprocessing to every report and store the result in a new 'text_clean' column.
# progress_apply is the tqdm-enabled variant of apply (registered by tqdm.pandas()) and shows a progress bar.
df['text_clean'] = df['text'].progress_apply(lambda x: preprocess_text(x, nlp))

## Splitting into train/validation/test sets

In [None]:
# Split into train (60%), validation (20%) and test (20%) sets:
# 40% is held out first and then divided in half.
train_df, temp = train_test_split(df, test_size=0.4, random_state=seed)
valid_df, test_df = train_test_split(temp, test_size=0.5, random_state=seed)

# The splits are row selections of df. Explicit copies make them independent
# DataFrames, so the feature columns assigned to them later are written without
# pandas' SettingWithCopyWarning.
train_df, valid_df, test_df = train_df.copy(), valid_df.copy(), test_df.copy()

train_df.info()

## Topic modelling

In [None]:
# Function to plot word clouds for topics
def plot_wordclouds(lda_model, dictionary):
    """Draw one word cloud per topic of *lda_model* in a five-column grid.

    Args:
        lda_model: Trained topic model exposing ``num_topics`` and
            ``show_topic(topic_id, topn)``.
        dictionary: Not used by this function; kept in the signature so
            existing calls keep working.
    """
    n_topics = lda_model.num_topics
    n_cols = 5
    # Ceiling division: a partially filled last row still needs a row of its own.
    n_rows = -(-n_topics // n_cols)

    # Fixed overall figure size (width, height).
    plt.figure(figsize=(20, 10))
    for topic_id in range(n_topics):
        # The 30 entries returned for the topic become a word -> weight mapping.
        weights = dict(lda_model.show_topic(topic_id, 30))
        image = WordCloud(background_color='white').generate_from_frequencies(weights)

        plt.subplot(n_rows, n_cols, topic_id + 1)
        plt.imshow(image, interpolation='bilinear')
        plt.axis('off')
        # Titles are numbered from 1, the model's topic ids from 0.
        plt.title(f'Topic {topic_id + 1}')

    plt.show()

In [None]:
# The topic model is built on the training set only and then applied to all splits.

# Tokenize cleaned texts. Only the token strings are needed here, so run just the
# tokenizer (nlp.make_doc) instead of the full pipeline: the tokens are the same,
# but the remaining pipeline components are not executed for every document.
tokenized_docs_train = [[token.text for token in nlp.make_doc(doc)] for doc in train_df['text_clean']]
tokenized_docs_test = [[token.text for token in nlp.make_doc(doc)] for doc in test_df['text_clean']]
tokenized_docs_valid = [[token.text for token in nlp.make_doc(doc)] for doc in valid_df['text_clean']]

In [None]:
# Create a dictionary (token -> integer id mapping) from the training documents only
dictionary = corpora.Dictionary(tokenized_docs_train)
# Prune the vocabulary: drop tokens found in fewer than no_below documents or in
# more than a no_above fraction of the documents
dictionary.filter_extremes(no_below=no_below, no_above=no_above)

In [None]:
# Turn every tokenized document of each split into its bag-of-words form,
# always using the dictionary that was built from the training documents
corpus_train = list(map(dictionary.doc2bow, tokenized_docs_train))
corpus_test = list(map(dictionary.doc2bow, tokenized_docs_test))
corpus_valid = list(map(dictionary.doc2bow, tokenized_docs_valid))

In [None]:
# Fit the LDA topic model on the training corpus only; the validation and
# test corpora are never shown to the model during training
lda_model = gensim.models.LdaMulticore(
    corpus=corpus_train,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    workers=2,
    random_state=seed,
)

In [None]:
# Plot one word cloud per topic of the trained LDA model
plot_wordclouds(lda_model, dictionary)

In [None]:
# Visualize topics with pyLDAvis, based on the trained model, the training corpus and the dictionary.
# NOTE(review): sort_topics=False presumably keeps pyLDAvis' topic order aligned with the
# model's own topic ids — confirm against the pyLDAvis documentation.
lda_display = gensimvis.prepare(lda_model, corpus_train, dictionary, sort_topics=False)
# Last expression of the cell, so the notebook renders the returned display object.
pyLDAvis.display(lda_display)

In [None]:
# Function to calculate topic distributions
def get_topic_distributions(lda_model, corpus):
    """Return one ``{'topic_<i>': probability}`` dict per document in *corpus*.

    Args:
        lda_model: Trained topic model exposing ``num_topics`` and
            ``get_document_topics(bow, minimum_probability=...)``.
        corpus: Iterable of bag-of-words documents.

    Returns:
        A list with one dict per document. Each dict has a key for every
        topic of the model; topics the model does not report for a document
        keep the value 0.
    """
    # Take the topic count from the model itself rather than from a notebook
    # global, so the keys always match the model that is passed in.
    topic_keys = [f'topic_{i}' for i in range(lda_model.num_topics)]

    topic_distributions = []
    for doc_bow in corpus:
        # Start from zero for every topic, then overwrite the reported ones.
        topic_distribution = dict.fromkeys(topic_keys, 0)
        for topic, prob in lda_model.get_document_topics(doc_bow, minimum_probability=0):
            topic_distribution[f'topic_{topic}'] = prob
        topic_distributions.append(topic_distribution)

    return topic_distributions


In [None]:
# Infer the per-document topic distribution for each split with the trained model
train_topic_dists = get_topic_distributions(lda_model=lda_model, corpus=corpus_train)
test_topic_dists = get_topic_distributions(lda_model=lda_model, corpus=corpus_test)
valid_topic_dists = get_topic_distributions(lda_model=lda_model, corpus=corpus_valid)

In [None]:
# Attach one 'topic_<i>' column per topic to each split, filled from that
# split's list of topic distributions
for split_df, split_dists in (
    (train_df, train_topic_dists),
    (test_df, test_topic_dists),
    (valid_df, valid_topic_dists),
):
    for topic in range(num_topics):
        column = f'topic_{topic}'
        split_df[column] = [dist[column] for dist in split_dists]

## Word embeddings

In [None]:
# Whitespace-split every cleaned training text into its list of tokens
train_texts = list(map(str.split, train_df['text_clean']))
# Fit the Word2Vec model on the training texts only
word2vec_model = Word2Vec(
    sentences=train_texts,
    vector_size=50,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Function to calculate document embeddings
def calculate_document_embedding(text, model):
    """Average the word vectors of all known words in *text*.

    Args:
        text: Whitespace-separated tokens.
        model: Object exposing ``wv`` (supports ``word in wv`` and
            ``wv[word]``) and ``vector_size``.

    Returns:
        A ``pd.Series`` holding the element-wise mean of the vectors of the
        words found in ``model.wv``, or ``model.vector_size`` zeros when
        none of the words is known.
    """
    vectors = model.wv
    known = [vectors[token] for token in text.split() if token in vectors]

    if known:
        return pd.Series(np.mean(known, axis=0))

    # No token of the document is in the vocabulary: fall back to a zero vector.
    return pd.Series(np.zeros(model.vector_size))

In [None]:
# One column per embedding dimension; every split gets the mean word vector of
# each cleaned text, computed with the Word2Vec model trained on the training set
embedding_columns = [f'embedding_{i}' for i in range(word2vec_model.vector_size)]
for split_df in (train_df, test_df, valid_df):
    split_df[embedding_columns] = split_df['text_clean'].apply(
        calculate_document_embedding, model=word2vec_model
    )


In [None]:
# Example usage of the trained model: look up and print the vector of a single word.
# NOTE(review): the lookup raises KeyError when 'moeder' is not in the model's
# vocabulary — presumably it occurs in the training texts; confirm on the real data.
word_embedding = word2vec_model.wv['moeder']
print(word_embedding)

In [None]:
# Find the 10 words whose vectors are most similar to that of 'toilet'
similar_words = word2vec_model.wv.most_similar('toilet', topn=10)

# Print each of these words together with its similarity score
for word, similarity in similar_words:
    print(f"Word: {word}, Similarity: {similarity}")


In [None]:
# Write each split, including its extracted features, to its own CSV file
# (the DataFrame index is not written)
for split_df, csv_path in (
    (train_df, '../data/agitation_train.csv'),
    (test_df, '../data/agitation_test.csv'),
    (valid_df, '../data/agitation_valid.csv'),
):
    split_df.to_csv(csv_path, index=False)