# Creación de features

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from textblob import TextBlob
from gensim import corpora, models
from gensim.models import Word2Vec
import spacy
from datetime import datetime, timedelta
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score

In [2]:
# Extraemos la data preprocesada

# Defino los paths de los archivos Excel
pipomessage_path = "Preprocesado/pipomessage.xlsx"
user_path = "Preprocesado/user.xlsx"
usermessage_path = "Preprocesado/usermessage.xlsx"
symptoms_path = "Preprocesado/symptoms.xlsx"

# Cargar los datos en pandas DataFrames
pipomessage_df = pd.read_excel(pipomessage_path)
user_df = pd.read_excel(user_path)
usermessage_df = pd.read_excel(usermessage_path)
symptoms_df = pd.read_excel(symptoms_path)

### Sintomas, intensidad y tipo de cancer

In [3]:
train_user_df = user_df[user_df['cancer_type'] != 'desconocido']
merged_df = pd.merge(train_user_df, symptoms_df, left_on='phone_number', right_on='user')

# Check 
merged_df.head()

Unnamed: 0,phone_number,creation_date_x,cancer_type,id,user,creation_date_y,description,intensity
0,883434410131,2023-01-31 13:21:22.722753+00:00,cancer de mama,3657,883434410131,2023-02-01 13:55:26.490979+00:00,dolor corporal,6
1,883434410131,2023-01-31 13:21:22.722753+00:00,cancer de mama,3832,883434410131,2023-02-06 19:46:54.420442+00:00,dolor de vientre,6
2,883434410131,2023-01-31 13:21:22.722753+00:00,cancer de mama,3913,883434410131,2023-02-08 18:32:38.076702+00:00,sangrado,6
3,883811578438,2023-04-12 23:46:23.650111+00:00,cancer de mama,7157,883811578438,2023-04-18 17:10:31.079410+00:00,dolor de seno,6
4,9742951077857,2023-02-18 16:21:09.167504+00:00,cancer de ovario,4553,9742951077857,2023-02-24 15:18:12.711123+00:00,dolor corporal,6


In [4]:
columnas_a_eliminar = ['creation_date_x', 'id','user','creation_date_y']
merged_df = merged_df.drop(columnas_a_eliminar, axis=1)
# Check 
merged_df.head()

Unnamed: 0,phone_number,cancer_type,description,intensity
0,883434410131,cancer de mama,dolor corporal,6
1,883434410131,cancer de mama,dolor de vientre,6
2,883434410131,cancer de mama,sangrado,6
3,883811578438,cancer de mama,dolor de seno,6
4,9742951077857,cancer de ovario,dolor corporal,6


### Tratamientos

In [5]:
# Creamos nuestra lista de tratamientos de cancer mas conocidos
treatment_keywords = ['quimioterapia', 'radioterapia', 'cirugía', 'inmunoterapia', 'terapia hormonal',
                      'terapia dirigida', 'radiación', 'transplante de médula ósea', 'terapia de protones',
                      'terapia de células T', 'biopsia', 'mastectomía', 'lumpectomía', 'resección', 'ablación',
                      'embolización', 'estereotáctica', 'radiocirugía', 'braquiterapia', 'crioablación']

# Creamos una dataframe que cuente con los tratamiento.  Si se menciona en el mensaje del usuario y con un 0 si no se menciona.
vectorizer = CountVectorizer(vocabulary=treatment_keywords, binary=True)
treatments = vectorizer.transform(usermessage_df['body'].values.astype('U'))
treatments_df = pd.DataFrame(treatments.toarray(), columns=vectorizer.get_feature_names_out())
#Juntar ambos dataframes
usermessage_df = pd.concat([usermessage_df, treatments_df], axis=1)

usermessage_df.head()

Unnamed: 0,id,patient,creation_date,body,local_time,contains_emoji,quimioterapia,radioterapia,cirugía,inmunoterapia,...,biopsia,mastectomía,lumpectomía,resección,ablación,embolización,estereotáctica,radiocirugía,braquiterapia,crioablación
0,142022,883444400000.0,2023-03-23 14:55:52.409996+00:00,hoy le hicieron sus análisis y mañana pasa...,2023-03-23 09:55:52.409996-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1657,882674600000.0,2023-01-13 13:38:24.867173+00:00,"¡oh, entiendo! 🤗 ¿qué puedo hacer para ayudarte?",2023-01-13 08:38:24.867173-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1658,882780500000.0,2023-01-13 13:38:31.621684+00:00,¿cuál sería entonces tu nivel de dolor en la e...,2023-01-13 08:38:31.621684-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1664,882791100000.0,2023-01-13 13:38:46.718517+00:00,"entiendo, ¡es bueno que estés tranquila! 💙¿est...",2023-01-13 08:38:46.718517-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1665,883724800000.0,2023-01-13 13:38:54.380169+00:00,¡genial! ¿qué tal te sientes hoy en general?,2023-01-13 08:38:54.380169-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Target Variable

In [6]:
# Identifica si menciona la palabra 'hospital'
usermessage_df['body'] = usermessage_df['body'].astype('str')
usermessage_df['hospital_visit'] = usermessage_df['body'].str.contains('hospital|clinica|hospitalización').astype(int)
usermessage_df.head()

Unnamed: 0,id,patient,creation_date,body,local_time,contains_emoji,quimioterapia,radioterapia,cirugía,inmunoterapia,...,mastectomía,lumpectomía,resección,ablación,embolización,estereotáctica,radiocirugía,braquiterapia,crioablación,hospital_visit
0,142022,883444400000.0,2023-03-23 14:55:52.409996+00:00,hoy le hicieron sus análisis y mañana pasa...,2023-03-23 09:55:52.409996-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1657,882674600000.0,2023-01-13 13:38:24.867173+00:00,"¡oh, entiendo! 🤗 ¿qué puedo hacer para ayudarte?",2023-01-13 08:38:24.867173-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1658,882780500000.0,2023-01-13 13:38:31.621684+00:00,¿cuál sería entonces tu nivel de dolor en la e...,2023-01-13 08:38:31.621684-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1664,882791100000.0,2023-01-13 13:38:46.718517+00:00,"entiendo, ¡es bueno que estés tranquila! 💙¿est...",2023-01-13 08:38:46.718517-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1665,883724800000.0,2023-01-13 13:38:54.380169+00:00,¡genial! ¿qué tal te sientes hoy en general?,2023-01-13 08:38:54.380169-05:00,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Aclaracion:
Usaremos la columna 'hospital_visit' que hemos creado como nuestra variable objetivo, asumiendo que una mención de una visita al hospital en los mensajes del usuario indica una hospitalización

Esta suposición no es muy precisa, ya que los usuarios podrían mencionar visitas al hospital por otras razones que no sean una hospitalización (por ejemplo, citas de seguimiento, pruebas, etc.).

### NLP feature extraction

Analisis de sentimientos

In [7]:
def calculate_sentiment(text):
    return TextBlob(text).sentiment.polarity
usermessage_df['sentiment'] = usermessage_df['body'].apply(calculate_sentiment)

Analisis de temas

In [8]:
texts = usermessage_df['body'].apply(lambda x: x.split())
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10) # Entramos el modelo que identifica los temas
usermessage_df['topics'] = [max(lda[c], key=lambda x: x[1])[0] for c in corpus]

Vectorizacion de palabras

In [9]:
model = Word2Vec(texts, vector_size=100, window=5, min_count=1, workers=4)
usermessage_df['word_vectors'] = texts.apply(lambda x: np.mean([model.wv[word] for word in x], axis=0))

Identificar entidades nombradas:
personas, lugares, organizaciones

In [10]:
# Cargamos el modelo de lenguaje para el español en Spacy
nlp = spacy.load('es_core_news_sm')

def extract_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents]
usermessage_df['entities'] = usermessage_df['body'].apply(extract_entities)

In [11]:
columnas_a_eliminar = ['id','creation_date','local_time']
usermessage_df = usermessage_df.drop(columnas_a_eliminar, axis=1)

In [12]:
usermessage_df.head()

Unnamed: 0,patient,body,contains_emoji,quimioterapia,radioterapia,cirugía,inmunoterapia,terapia hormonal,terapia dirigida,radiación,...,embolización,estereotáctica,radiocirugía,braquiterapia,crioablación,hospital_visit,sentiment,topics,word_vectors,entities
0,883444400000.0,hoy le hicieron sus análisis y mañana pasa...,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,5,"[0.26801333, 0.06241709, 0.62745184, -0.774430...",[]
1,882674600000.0,"¡oh, entiendo! 🤗 ¿qué puedo hacer para ayudarte?",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,7,"[0.4362003, 0.13430993, 0.17844854, -0.1499801...","[¡oh, ! 🤗 ¿qué]"
2,882780500000.0,¿cuál sería entonces tu nivel de dolor en la e...,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,8,"[-0.085916124, 0.25686482, -0.009688669, -1.69...",[]
3,882791100000.0,"entiendo, ¡es bueno que estés tranquila! 💙¿est...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,7,"[-0.21102029, 0.09418106, 0.1307139, -0.597450...",[💙]
4,883724800000.0,¡genial! ¿qué tal te sientes hoy en general?,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.05,4,"[0.33298653, 0.04729259, 0.2622626, -0.9776957...",[¡genial!]


### Unir Datasets

In [13]:
final_merged_df = pd.merge(merged_df, usermessage_df, left_on='phone_number', right_on='patient')
# Check 
final_merged_df.head()

Unnamed: 0,phone_number,cancer_type,description,intensity,patient,body,contains_emoji,quimioterapia,radioterapia,cirugía,...,embolización,estereotáctica,radiocirugía,braquiterapia,crioablación,hospital_visit,sentiment,topics,word_vectors,entities
0,883434410131,cancer de mama,dolor corporal,6,883434400000.0,ok,False,0,0,0,...,0,0,0,0,0,0,0.5,2,"[-0.35875577, -0.16399889, 0.12710331, -0.2765...",[]
1,883434410131,cancer de mama,dolor corporal,6,883434400000.0,normal🙂,False,0,0,0,...,0,0,0,0,0,0,0.0,3,"[-0.5366856, 0.16400602, 0.24802597, -0.372481...",[🙂]
2,883434410131,cancer de mama,dolor corporal,6,883434400000.0,ok,False,0,0,0,...,0,0,0,0,0,0,0.5,2,"[-0.35875577, -0.16399889, 0.12710331, -0.2765...",[]
3,883434410131,cancer de mama,dolor corporal,6,883434400000.0,☹,True,0,0,0,...,0,0,0,0,0,0,0.0,4,"[-0.030986605, 0.04377968, 0.052830763, -0.087...",[]
4,883434410131,cancer de mama,dolor corporal,6,883434400000.0,tube dolores por un tapon que me pusieron,False,0,0,0,...,0,0,0,0,0,0,0.0,5,"[0.075512595, 0.9367525, 0.07870893, -1.668360...",[tapon]


# Modelado

In [15]:
# Seleccionamos nuestro target
target_column = 'hospital_visit'
# Seleccionamos nuestros features
feature_columns = ['cancer_type','description','intensity','contains_emoji'] + treatment_keywords + ['sentiment','topics','word_vectors','entities']

# Split
X_train, X_test, y_train, y_test = train_test_split(final_merged_df[feature_columns], final_merged_df[target_column], test_size=0.2, random_state=42)

# Normalizaremos las características numéricas utilizando la normalización Min-Max
numeric_features = treatment_keywords
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())])

# Codificaremos las características categóricas utilizando la codificación one-hot
categorical_features = ['cancer_type','description']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000))])

# Entrenamiento
clf.fit(X_train, y_train)

In [16]:
# Predecimos
y_pred = clf.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9924651324597561

### Cross-validation y Recall

In [17]:
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)

# Calculamos Cross-Validation
mean_cv_score = np.mean(cv_scores)

# Calculamos Recall
recall = recall_score(y_test, y_pred)

mean_cv_score, recall

(0.9924308611076924, 0.0)