In [None]:
# Install the Spanish language model if not already installed
%pip install spacy nltk pandas
!python -m spacy download es_core_news_lg

In [None]:
import pandas as pd
from collections import Counter
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK stop-word corpus so that stopwords.words() can find it
nltk.download('stopwords')

# Spanish stop words, kept as a set for fast membership tests in the word loop
spanish_stop_words = set(stopwords.words('spanish'))

# Read the diagnoses table. The two columns used below are forced to `str` so
# that pandas never infers a numeric dtype for a numeric-looking code; the
# later cells index and sort codes as strings.
df = pd.read_csv(
    '../csv_import_scripts/cie10-es-diagnoses.csv',
    dtype={'code': str, 'description': str},
)

# Map every non-stop word to its number of occurrences and to the set of codes
# whose description contains it:
#     {word: {'count': int, 'codes': set of str}}
word_code_dict = {}

# Rows without a description or a code are skipped: str(NaN) would otherwise be
# counted as the literal word "nan", and a NaN code would break the string
# sorting/indexing applied to the code sets further down.
complete_rows = df.dropna(subset=['description', 'code'])

# Iterate over the two needed columns directly (much cheaper than iterrows)
for description, raw_code in zip(complete_rows['description'], complete_rows['code']):
    text = str(description).lower()
    code = str(raw_code)  # keep every stored code a string

    # Tokens are runs of word characters and hyphens, bounded by word boundaries
    tokens = re.findall(r'\b[\w-]+\b', text)

    for token in tokens:
        if token in spanish_stop_words:
            continue  # stop words carry no information for this analysis
        entry = word_code_dict.setdefault(token, {'count': 0, 'codes': set()})
        entry['count'] += 1
        entry['codes'].add(code)

# Convert the word dictionary to a DataFrame, one row per word.
# The column list is passed explicitly so that an empty dictionary still yields
# a frame with the expected columns (otherwise sort_values('count') would raise
# a KeyError on a column-less frame).
result_df = pd.DataFrame(
    [
        {
            'word': word,
            'count': data['count'],
            'codes': ', '.join(codes),
            'num_codes': len(codes),
            # Distinct first characters of the codes, used here as the
            # "main category" of each code
            'main_categories': ', '.join(sorted({code[0] for code in codes})),
        }
        # Sort each code set once and reuse the sorted list for all columns
        for word, data, codes in (
            (word, data, sorted(data['codes']))
            for word, data in word_code_dict.items()
        )
    ],
    columns=['word', 'count', 'codes', 'num_codes', 'main_categories'],
)

# Most frequent words first
result_df = result_df.sort_values('count', ascending=False)

# Report the vocabulary size and show the top rows
print(f"Total unique words found: {len(result_df)}")
result_df.head(10)

In [None]:

import spacy

# Load the large Spanish spaCy pipeline
nlp = spacy.load("es_core_news_lg")

# Run every word through the pipeline once (batched via nlp.pipe) and read both
# attributes from the same Doc, instead of parsing each word twice.
# Only the first token is used, so a word that spaCy may split into several
# tokens (e.g. a hyphenated word) is described by its first piece. An empty Doc
# yields an empty string rather than an IndexError.
word_docs = list(nlp.pipe(result_df['word'].tolist()))
result_df['pos_tag'] = [doc[0].pos_ if len(doc) else '' for doc in word_docs]
result_df['spacy_lemma'] = [doc[0].lemma_ if len(doc) else '' for doc in word_docs]

# NLTK resources used by the WordNet lemmatizer (and tokenizer data)
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')

# Create lemmatizer
lemmatizer = WordNetLemmatizer()

# Add the WordNet lemma of each word.
# NOTE(review): WordNetLemmatizer looks words up in (English) WordNet and
# returns unknown words unchanged, so for Spanish vocabulary this column is
# presumably almost identical to 'word' -- confirm whether 'spacy_lemma'
# should be used instead. Kept to preserve the exported CSV schema.
result_df['root_word'] = result_df['word'].apply(lemmatizer.lemmatize)

# Persist the enriched table and show the top rows
result_df.to_csv('cie10_word_analysis.csv', index=False, encoding='utf-8')
result_df.head(10)