## Load data (from files, generated from a database)

In [None]:
# import libraries
import re
from re import split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
%matplotlib inline


In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive so the data files can be read by path.
drive.mount('/content/gdrive') # where gdrive = "My Drive" in Google Drive

In [None]:
# Base folder that holds the input CSV files.
# Previous local (Windows) location, kept for reference:
#route ='C:/Users/diabetes.ml1/Downloads'
# Google Drive route (relative to the mount point):
ruta = 'gdrive/My Drive/Colab Notebooks/diabetes_codigoCristhianPatino/Data_RiesgoHipoglicemia'

In [None]:
# Full paths of the two source CSV files inside the data folder.
archivo1 = f"{ruta}/analisis_hemog_unbalanced.csv"
archivo2 = f"{ruta}/hemog_text_panel_complete.csv"

In [None]:
# Load the first CSV (comma separated), report its size and preview the first rows.
df1 = pd.read_csv(archivo1, sep=",", low_memory=False)
print(df1.shape)
df1.head()

In [None]:
# Load the second CSV (comma separated), report its size and preview the first rows.
df2 = pd.read_csv(archivo2, sep=",", low_memory=False)
print(df2.shape)
df2.head()

In [None]:
# colnames
df2.columns

In [None]:
# Merge both files on the anonymised patient key (inner join) and drop exact duplicate rows.
columnas_df1 = ['KeyAnonimo', 'fecha_consulta', 'HbA1c', 'analisis']
columnas_df2 = ['KeyAnonimo', 'edad', 'genero']
df3 = df1[columnas_df1].merge(df2[columnas_df2], on='KeyAnonimo', how='inner')
df3 = df3.drop_duplicates()
df3.shape

In [None]:
# Renumber the rows 0..n-1 after the de-duplication, discarding the old index.
df3 = df3.reset_index(drop=True)
df3

# Preprocessing

In [None]:
# Parse the visit date (ISO year-month-day text) into a datetime column.
df3['fecha_consulta'] = pd.to_datetime(df3.fecha_consulta, format='%Y-%m-%d')

In [None]:
# Adjust the patient's age: subtract 3, 2 or 1 year(s) for visits dated in 2018, 2019 or 2020.
# NOTE(review): presumably 'edad' is the age at a later reference date - confirm.
anio_consulta = df3['fecha_consulta'].dt.year
for anio, anios_a_restar in ((2018, 3), (2019, 2), (2020, 1)):
    df3.loc[anio_consulta == anio, 'edad'] = df3['edad'] - anios_a_restar

In [None]:
df3

In [None]:
# number of unique patients
len(pd.unique(df3['KeyAnonimo']))

23802

In [None]:
# Count patients by gender, using the first non-null record of each patient.
patients_grouped = df3.groupby('KeyAnonimo').first()
es_masculino = patients_grouped['genero'] == 'M'
es_femenino = patients_grouped['genero'] == 'F'
male_count = es_masculino.sum()
female_count = es_femenino.sum()

In [None]:
male_count

In [None]:
female_count

In [None]:
# analyze age
grouped = df3.groupby('KeyAnonimo')

In [None]:
# Average age and HbA1c per patient over the 3 years of observation.
# numeric_only=True restricts the mean to the numerical fields; without it, recent pandas
# versions raise a TypeError when they reach the text columns (analisis, genero).
averages = grouped.mean(numeric_only=True)
averages

In [None]:
# average age of all patients
averages.edad.mean()

In [None]:
# age: standard deviation of all patients
averages.edad.std()

In [None]:
# Per-patient averages as a flat frame: the patient key moves from the index to a column.
edades = averages.reset_index()
edades = edades.rename(columns={'index': 'KeyAnonimo'})
list(edades.columns)

In [None]:
# Age by range (groups).
bins = [18, 30, 40, 50, 60, 70, 120]
labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']
# right=False makes every bin left-closed / right-open ([18, 30), [30, 40), ...), which is
# what the labels describe; with the default right-closed bins an age of exactly 30 was
# filed under '18-29'.
rango_edad = pd.cut(edades.edad, bins, labels=labels, right=False)
# Ages outside the bins (or missing) become 'unknown'. The result is assigned back so the
# extra category is actually stored in the frame instead of only being displayed.
edades['rango_edad'] = rango_edad.cat.add_categories('unknown').fillna('unknown')
edades['rango_edad']

In [None]:
# Number of patients in each age range.
columna_rango = edades['rango_edad']
age_range_counts = columna_rango.groupby(columna_rango).count()
print(age_range_counts)

In [None]:
# Per-patient averages as a flat frame, used below for the HbA1c statistics.
hemoglobina = averages.reset_index()
hemoglobina = hemoglobina.rename(columns={'index': 'KeyAnonimo'})
list(hemoglobina.columns)

In [None]:
# descriptive statistics for HbA1c:
hemoglobina.HbA1c.describe()

In [None]:
# HbA1c by range.
bins = [4.0, 4.8, 5.7, 6.4, 7.0, 8.0, 17]
labels = ['4.0-4.799', '4.8-5.699', '5.7-6.399', '6.4-6.999', '7.0-7.999', '8.0+']
# right=False makes every bin left-closed / right-open, matching the labels
# (e.g. 4.8 belongs to '4.8-5.699'); the default right-closed bins put it in '4.0-4.799'.
rango_hemoglobina = pd.cut(hemoglobina.HbA1c, bins, labels=labels, right=False)
# Values outside the bins (or missing) become 'unknown'. The result is assigned back so the
# extra category is actually stored in the frame instead of only being displayed.
hemoglobina['rango_hemoglobina'] = rango_hemoglobina.cat.add_categories('unknown').fillna('unknown')
hemoglobina['rango_hemoglobina']

In [None]:
# Number of patients in each HbA1c range / group.
columna_rango_hb = hemoglobina['rango_hemoglobina']
hemoglobina_range_counts = columna_rango_hb.groupby(columna_rango_hb).count()
print(hemoglobina_range_counts)

In [None]:
# Force the physician's notes to text so the string operations below work on every row
# (missing values become the literal string 'nan').
df3['analisis'] = df3['analisis'].astype(str)
# Optional: work on the first 30000 rows only while testing.
#df3 = df3.iloc[:30000,:]
#df3

In [None]:
# To lower case. Vectorised: the previous row-by-row chained assignment
# (df3['analisis'][i] = ...) was slow, raised SettingWithCopyWarning and does not write
# back to the frame under pandas copy-on-write.
df3['analisis'] = df3['analisis'].str.lower()

In [None]:
# Basic cleaning: every punctuation character listed in `quitar` is replaced by a space.
quitar = ",;:.+-()/?´*%"

# One translation table applied to the whole column in a single pass, instead of one
# chained-assignment write per row and per character.
tabla_quitar = str.maketrans({caracter: " " for caracter in quitar})
df3['analisis'] = df3['analisis'].str.translate(tabla_quitar)

In [None]:
# Replace the accented Spanish vowels by their plain equivalents (lower case only; the
# text was lower-cased before). Other characters such as "ñ" are left untouched.
quitar    = ["á", "é", "í", "ó", "ú"]
remplazar = ["a", "e", "i", "o", "u"]

# One translation table applied to the whole column, instead of row-by-row chained
# assignment (slow, and not written back under pandas copy-on-write).
tabla_acentos = str.maketrans(dict(zip(quitar, remplazar)))
df3['analisis'] = df3['analisis'].str.translate(tabla_acentos)

df3

In [None]:
# The preprocessed frame was saved once with:
# df3.to_csv('DF_RiesgoHip.csv', sep=';')
# From here on, the already preprocessed data is read back (from gdrive).
archivo_final = f"{ruta}/DF_RiesgoHip.csv"

In [None]:
# Read the already "curated" data (semicolon separated) from Google Drive and make sure
# the notes column is text.
df3 = pd.read_csv(archivo_final, sep=";", low_memory=False)
df3 = df3.astype({'analisis': str})
print(df3.shape)
df3

# Identification "models"

## 1. Hypoglycemia - (model for identifying hypoglycemic events)

In [None]:
# Spanish text to analyse (analisis: physician's notes) plus the patient key.
# .copy() gives df_hipo its own data, so the columns added later are written to this frame
# and not to a slice of df3 (avoids SettingWithCopyWarning / lost writes).
df_hipo = df3[["KeyAnonimo", 'analisis']].copy()
df_hipo

In [None]:
# number of unique patients
# df_hipo.KeyAnonimo.value_counts()
df_hipo.KeyAnonimo.nunique()

In [None]:
# Search terms - key words (list of words - "dictionary") for hypoglycemia.
# The two result columns are created empty here and filled by the search cells below.
df_hipo['Indice_palabra'] = ''
df_hipo['Frase_extraida'] = ''
# Both spellings of the term are searched.
palabra1 = "hipoglicemia"
palabra2 = "hipoglucemia"
# added: 31/05/2023, based on meeting with Doc:
# (written without accents because the notes had their accents removed in preprocessing)
palabra3 = "baja azucar"
palabra4 = "azucar bajo"
palabra5 = "azucar con tendencia a la baja"

In [None]:
arraym = df_hipo['analisis']
arraym
#type(arraym)

In [None]:
# Search each note for the key words and store in "Indice_palabra" the position ("start")
# at which the first matching key word begins.
# Key words are tried in priority order (palabra1 ... palabra5; the last three were added
# on 31/05/2023) and the first one present in the text wins.
# Note: str.find returns the index where the search item starts, or -1 if it is not found,
# so -1 marks the notes that contain none of the key words.
def _primer_indice(texto, palabras):
    """Return the start index in *texto* of the first key word found, or -1 if none is."""
    for palabra in palabras:
        indice = texto.find(palabra)
        if indice != -1:
            return indice
    return -1


palabras_hipo = (palabra1, palabra2, palabra3, palabra4, palabra5)
# The column is built in one assignment. The previous cell-by-cell chained assignment
# (df_hipo.Indice_palabra[i] = ...) was slow, raised SettingWithCopyWarning, is not written
# back under pandas copy-on-write and only worked with a 0..n-1 index.
df_hipo['Indice_palabra'] = [_primer_indice(texto, palabras_hipo) for texto in df_hipo['analisis']]

In [None]:
# show the data with the new column that shows this index
df_hipo

In [None]:
# Create the column "Frase_extraida": up to 150 characters of text on each side of the
# position where the key word was found. If no key word was found (index -1) the text
# "Sin información" is stored instead.
def _extraer_frase(texto, indice, ventana=150):
    """Return the window of *texto* around *indice*, or "Sin información" when indice is -1."""
    if indice == -1:
        return "Sin información"
    # Clamp the start at 0 so a match near the beginning keeps the text from the start.
    inicio = max(indice - ventana, 0)
    return texto[inicio:indice + ventana]


# Built in one assignment instead of cell-by-cell chained assignment (slow, and not
# written back to the frame under pandas copy-on-write).
df_hipo['Frase_extraida'] = [
    _extraer_frase(texto, indice)
    for texto, indice in zip(df_hipo['analisis'], df_hipo['Indice_palabra'])
]

df_hipo

In [None]:
# Count the records that contain search phrase 3, 4 or 5.
# The alternatives are written without spaces around "|": inside a regular expression
# those spaces belong to the alternatives, so the previous pattern only matched e.g.
# "baja azucar" when followed by a space and disagreed with the str.find keyword search.
patron_frases = 'baja azucar|azucar bajo|azucar con tendencia a la baja'
df_hipo[df_hipo.analisis.str.contains(patron_frases)].shape[0]
# Counts noted from an earlier run (obtained with the space-padded pattern):
# baja azucar: 1 record
# azucar bajo: 8 records
# azucar con tendencia a la baja: 0 records
#df_hipo[df_hipo.analisis.str.contains(patron_frases)].to_excel("RH_hipo_conNuevasPalabr.xlsx")

In [None]:
# we download a sample of the data for manual labeling by practicing physicians
# data_train_hipo = df_hipo['Frase_extraida'].sample(n=20000, random_state=7)
# a sample of 20,000 records, regardless of whether there is text in the "Frase_extraida" column or not ("Sin información")
# data_train_hipo.to_excel("RH_hipo.xlsx")

In [None]:
# ML MODEL
# Read the tagged (labeled) phrases from Google Drive; only the phrase and its label are kept.
# (An earlier version read the same file from a local Windows folder.)
ruta2 = 'gdrive/My Drive/Colab Notebooks/diabetes_codigoCristhianPatino/Data_RiesgoHipoglicemia/Oraciones para entrenamiento'
archivo_RH_hipo_Ni = f"{ruta2}/1. RH_hipo_Ni_CSV.csv"
columnas_etiquetas = ['Frase_extraida', 'Etiqueta']
df_etiquetas = pd.read_csv(archivo_RH_hipo_Ni, sep=";", low_memory=False)
df_etiquetas = df_etiquetas[columnas_etiquetas]
df_etiquetas

In [None]:
# count labels (classes)
df_etiquetas.value_counts('Etiqueta')

In [None]:
# Graph the proportion of classes as a pie chart.
# value_counts() orders classes by frequency, so hard-coded labels ['0', '1'] are wrong
# whenever class 1 is the most frequent. The counts are sorted by class and the wedge
# labels are taken from the data itself.
conteo_clases = df_etiquetas['Etiqueta'].value_counts().sort_index()
plt.pie(conteo_clases,
        labels = conteo_clases.index.astype(str), autopct='%.2f%%');

In [None]:
# Labeled frame as a raw array; Y is the label column as an (n, 1) integer array.
df = df_etiquetas.to_numpy()
Y = df[:, 1:].astype(int)

In [None]:
# Create the training and test partition: 80/20, stratified by label.
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_etiquetas,
    test_size=0.2,
    random_state=10,
    stratify=df_etiquetas['Etiqueta'],
)

X_train = train['Frase_extraida']
y_train = train['Etiqueta']
X_test = test['Frase_extraida']
y_test = test['Etiqueta']
# all the labeled records
X_total = df_etiquetas['Frase_extraida']
y_total = df_etiquetas['Etiqueta']

# Class counts: full label array, training split, test split and all records.
for etiquetas in (Y, y_train, y_test, y_total):
    print(np.unique(etiquetas, return_counts=True))

In [None]:
# Vectorization to convert words to numbers for machine learning.
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation vectorizer: vocabulary and IDF weights are learned from the training split
# only. Fitting on all labeled phrases (as before) lets the held-out test phrases leak
# into model selection and into the reported test metrics.
tfidf_eval = TfidfVectorizer()
train_x_vector = tfidf_eval.fit_transform(X_train)
test_x_vector = tfidf_eval.transform(X_test)

# Final vectorizer: fitted on every labeled phrase. It produces total_x_vector and is the
# one to use, together with a model trained on total_x_vector, to label new phrases.
# Its number of columns can differ from train_x_vector / test_x_vector.
tfidf = TfidfVectorizer()
total_x_vector = tfidf.fit_transform(X_total)

In [None]:
total_x_vector.shape

In [None]:
# run the different classification models, using grid search to find the optimal hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Model selection: each candidate is tuned by exhaustive grid search with 10-fold
# cross-validation on the training vectors. GaussianNB has no grid and is only scored.

# Decision tree
dt_param_grid = {'criterion': ['gini', 'entropy']}
dt = DecisionTreeClassifier(random_state=11)
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=10)
dt_cv.fit(train_x_vector, y_train)

# Support vector machine
sv_param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.5, 1, 10, 50, 100],
    'gamma': [0.1, 'auto'],
}
sv = SVC(random_state=11)
sv_cv = GridSearchCV(estimator=sv, param_grid=sv_param_grid, cv=10)
sv_cv.fit(train_x_vector, y_train)

# Gaussian naive Bayes (the vectors are converted to a dense array for it)
nb = GaussianNB()
nb_fold_scores = cross_val_score(nb, train_x_vector.toarray(), y_train, cv=10)
nb_accuracy_train = nb_fold_scores.mean()

# Logistic regression
lg_param_grid = {
    'C': [1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [7000],
}
lg = LogisticRegression(random_state=11)
lg_cv = GridSearchCV(estimator=lg, param_grid=lg_param_grid, cv=10)
lg_cv.fit(train_x_vector, y_train)

# K nearest neighbours
knn_param_grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
knn = KNeighborsClassifier(n_jobs=-1)
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=10)
knn_cv.fit(train_x_vector, y_train)

# Multi-layer perceptron
mlp_param_grid = {
    'hidden_layer_sizes': [10, 50],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
}
mlp = MLPClassifier(alpha=1e-5, random_state=11, max_iter=2000)
mlp_cv = GridSearchCV(estimator=mlp, param_grid=mlp_param_grid, cv=10)
mlp_cv.fit(train_x_vector, y_train)


In [None]:
# Print the best cross-validated score and parameters of every candidate, to choose the
# best model.
def _resumen(nombre, busqueda):
    """Format the best score and best parameters of a fitted grid search."""
    return f'{nombre}: {busqueda.best_score_}, {busqueda.best_params_}'


print(_resumen('DT', dt_cv))
print(_resumen('SVM', sv_cv))
print(f'GNB: {nb_accuracy_train}')
print(_resumen('LG', lg_cv))
print(_resumen('KNN', knn_cv))
print(_resumen('MLP', mlp_cv))

In [None]:
# Fit the chosen ("best") model configuration on the training split.
parametros_elegidos = dict(activation='tanh', hidden_layer_sizes=50, solver='lbfgs')
modelo = MLPClassifier(alpha=1e-5, random_state=11, max_iter=2000, **parametros_elegidos)
modelo.fit(train_x_vector, y_train)

In [None]:
classes = np.unique(Y)
classes

In [None]:
# Confusion matrix of the fitted model on the held-out test split.
y_pred_test = modelo.predict(test_x_vector)
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap=plt.cm.Reds)

In [None]:
# Main performance indicators on the test split: precision, recall, f1-score, support, accuracy.
y_pred_reporte = modelo.predict(test_x_vector)
reporte = classification_report(y_test, y_pred_reporte, labels=classes)
print(reporte)

In [None]:
# Train a fresh model with 100% of the labeled data (train + test = total_x_vector).
parametros_elegidos = dict(activation='tanh', hidden_layer_sizes=50, solver='lbfgs')
modelo = MLPClassifier(alpha=1e-5, random_state=11, max_iter=2000, **parametros_elegidos)
modelo.fit(total_x_vector, y_total)

In [None]:
# Label the rest of the dataset (performing the predictions).
# The column "Modelo_Hip" gets 2 when no key word was found in the note (index -1);
# otherwise it gets the label predicted by the model for the extracted phrase.
SIN_PALABRA_CLAVE = 2
encontrado = df_hipo['Indice_palabra'] != -1

etiquetas_modelo = pd.Series(SIN_PALABRA_CLAVE, index=df_hipo.index)
if encontrado.any():
    # One batched transform/predict call for all phrases. The previous loop predicted row
    # by row, wrote through chained assignment (not written back under pandas
    # copy-on-write) and stored each result as a 1-element array instead of a scalar label.
    frases = df_hipo.loc[encontrado, 'Frase_extraida']
    etiquetas_modelo.loc[encontrado] = modelo.predict(tfidf.transform(frases))
df_hipo['Modelo_Hip'] = etiquetas_modelo

df_hipo

In [None]:
# Count how many records carry each label:
#   2 - the patient's text does not include any search term / key word;
#   0 - at least one key word is present, but not directly related to a positive case
#       (symptom or occurrence) of hypoglycemia;
#   1 - a key word is present and is directly related to hypoglycemia / its symptoms.
df_hipo.Modelo_Hip.value_counts()

In [None]:
# Save the labeled frame to a semicolon-separated CSV in the working directory.
df_hipo.to_csv('Modelo_Hip.csv', sep=';')

## 2. Identification of symptoms of hypoglycemia  - "cognitive symptom" model

In [None]:
# Patient key and physician's notes for the cognitive-symptom model.
# .copy() gives df_cog its own data, so the columns added later are written to this frame
# and not to a slice of df3 (avoids SettingWithCopyWarning / lost writes).
df_cog = df3[["KeyAnonimo", 'analisis']].copy()
df_cog

In [None]:
# Search terms - key words (list of words - "dictionary") for cognitive symptoms.
# The two result columns are created empty here and filled by the search cells below.
df_cog['Indice_palabra'] = ''
df_cog['Frase_extraida'] = ''
# Several terms are word stems, so they match any ending (e.g. "cognitiv", "desorient").
palabra1 = "cognitiv"
palabra2 = "desmayo"
palabra3 = "problemas de concentracion"
palabra4 = "desorient"
# NOTE(review): any text containing this term also contains palabra1 ("cognitiv"), which
# is searched first, so this entry can never be the one that matches.
palabra5 = "uncion cognitiva"
palabra6 = "alteracion de la atencion"
palabra7 = "deterioro de la concentracion"
palabra8 = "concentrarse"
# The leading space is part of the search term.
palabra9 = " conciencia"
palabra10 = "confus"

In [None]:
arraym = df_cog['analisis']
arraym

In [None]:
# Perform the search: store in "Indice_palabra" the position at which the first matching
# key word begins. Key words are tried in priority order (palabra1 ... palabra10) and the
# first one present in the text wins; -1 means that none of them was found.
def _primer_indice(texto, palabras):
    """Return the start index in *texto* of the first key word found, or -1 if none is."""
    for palabra in palabras:
        indice = texto.find(palabra)
        if indice != -1:
            return indice
    return -1


palabras_cog = (palabra1, palabra2, palabra3, palabra4, palabra5,
                palabra6, palabra7, palabra8, palabra9, palabra10)
# The column is built in one assignment. The previous cell-by-cell chained assignment
# (df_cog.Indice_palabra[i] = ...) was slow, raised SettingWithCopyWarning, is not written
# back under pandas copy-on-write and only worked with a 0..n-1 index.
df_cog['Indice_palabra'] = [_primer_indice(texto, palabras_cog) for texto in df_cog['analisis']]

In [None]:
# Create the column "Frase_extraida": up to 150 characters of text on each side of the
# position where the key word was found, or "Sin información" when no key word was found.
def _extraer_frase(texto, indice, ventana=150):
    """Return the window of *texto* around *indice*, or "Sin información" when indice is -1."""
    if indice == -1:
        return "Sin información"
    # Clamp the start at 0 so a match near the beginning keeps the text from the start.
    inicio = max(indice - ventana, 0)
    return texto[inicio:indice + ventana]


# Built in one assignment instead of cell-by-cell chained assignment (slow, and not
# written back to the frame under pandas copy-on-write).
df_cog['Frase_extraida'] = [
    _extraer_frase(texto, indice)
    for texto, indice in zip(df_cog['analisis'], df_cog['Indice_palabra'])
]

df_cog

In [None]:
# use a sample of the data for manual labeling by practicing physicians
#data_train_cog = df_cog['Frase_extraida'].sample(n=30000, random_state=7)
#data_train_cog.to_excel("RH_cog.xlsx")
#data_train_cog

In [None]:
# ML MODEL
# Read the labeled data; only the phrase and its label are kept.
# NOTE(review): this is a local Windows path, while the hypoglycemia model reads its labeled
# file from Google Drive (ruta2) - confirm where this file lives before running on Colab.
df_etiquetas = pd.read_csv('C:/Users/diabetes.ml1/Downloads/Data_RiesgoHipoglicemia/Oraciones para etiquetar/2. RH_cog_CP_CSV.csv' ,sep=";", low_memory=False)
df_etiquetas = df_etiquetas[['Frase_extraida', 'Etiqueta']]
df_etiquetas

In [None]:
df_etiquetas.value_counts('Etiqueta')

In [None]:
# Graph the proportion of classes as a pie chart.
# value_counts() orders classes by frequency, so hard-coded labels ['0', '1'] are wrong
# whenever class 1 is the most frequent. The counts are sorted by class and the wedge
# labels are taken from the data itself.
conteo_clases = df_etiquetas['Etiqueta'].value_counts().sort_index()
plt.pie(conteo_clases,
        labels = conteo_clases.index.astype(str), autopct='%.2f%%');

In [None]:
# Labeled frame as a raw array; Y is the label column as an (n, 1) integer array.
df = df_etiquetas.to_numpy()
Y = df[:, 1:].astype(int)

In [None]:
# Create the training and test partition: 80/20, stratified by label.
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_etiquetas,
    test_size=0.2,
    random_state=10,
    stratify=df_etiquetas['Etiqueta'],
)

X_train = train['Frase_extraida']
y_train = train['Etiqueta']
X_test = test['Frase_extraida']
y_test = test['Etiqueta']
X_total = df_etiquetas['Frase_extraida']
y_total = df_etiquetas['Etiqueta']

# Class counts: full label array, training split, test split and all records.
for etiquetas in (Y, y_train, y_test, y_total):
    print(np.unique(etiquetas, return_counts=True))

In [None]:
# Vectorization to convert words to numbers for machine learning.
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation vectorizer: vocabulary and IDF weights are learned from the training split
# only. Fitting on all labeled phrases (as before) lets the held-out test phrases leak
# into model selection and into the reported test metrics.
tfidf_eval = TfidfVectorizer()
train_x_vector = tfidf_eval.fit_transform(X_train)
test_x_vector = tfidf_eval.transform(X_test)

# Final vectorizer: fitted on every labeled phrase. It produces total_x_vector and is the
# one to use, together with a model trained on total_x_vector, to label new phrases.
# Its number of columns can differ from train_x_vector / test_x_vector.
tfidf = TfidfVectorizer()
total_x_vector = tfidf.fit_transform(X_total)

In [None]:
# run the different classification models, using grid search to find the optimal hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Model selection: each candidate is tuned by exhaustive grid search with 10-fold
# cross-validation on the training vectors. GaussianNB has no grid and is only scored.

# Decision tree
dt_param_grid = {'criterion': ['gini', 'entropy']}
dt = DecisionTreeClassifier(random_state=11)
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=10)
dt_cv.fit(train_x_vector, y_train)

# Support vector machine
sv_param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.5, 1, 10, 50, 100],
    'gamma': [0.1, 'auto'],
}
sv = SVC(random_state=11)
sv_cv = GridSearchCV(estimator=sv, param_grid=sv_param_grid, cv=10)
sv_cv.fit(train_x_vector, y_train)

# Gaussian naive Bayes (the vectors are converted to a dense array for it)
nb = GaussianNB()
nb_fold_scores = cross_val_score(nb, train_x_vector.toarray(), y_train, cv=10)
nb_accuracy_train = nb_fold_scores.mean()

# Logistic regression
lg_param_grid = {
    'C': [1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [7000],
}
lg = LogisticRegression(random_state=11)
lg_cv = GridSearchCV(estimator=lg, param_grid=lg_param_grid, cv=10)
lg_cv.fit(train_x_vector, y_train)

# K nearest neighbours
knn_param_grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
knn = KNeighborsClassifier(n_jobs=-1)
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=10)
knn_cv.fit(train_x_vector, y_train)

# Multi-layer perceptron
mlp_param_grid = {
    'hidden_layer_sizes': [10, 50],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
}
mlp = MLPClassifier(alpha=1e-5, random_state=11, max_iter=2000)
mlp_cv = GridSearchCV(estimator=mlp, param_grid=mlp_param_grid, cv=10)
mlp_cv.fit(train_x_vector, y_train)


In [None]:
# Print the best cross-validated score and parameters of every candidate.
def _resumen(nombre, busqueda):
    """Format the best score and best parameters of a fitted grid search."""
    return f'{nombre}: {busqueda.best_score_}, {busqueda.best_params_}'


print(_resumen('DT', dt_cv))
print(_resumen('SVM', sv_cv))
print(f'GNB: {nb_accuracy_train}')
print(_resumen('LG', lg_cv))
print(_resumen('KNN', knn_cv))
print(_resumen('MLP', mlp_cv))

In [None]:
# Fit the chosen ("best") model configuration on the training split.
parametros_elegidos = dict(C=10, gamma=0.1, kernel='linear')
modelo = SVC(random_state=11, **parametros_elegidos)
modelo.fit(train_x_vector, y_train)

In [None]:
classes = np.unique(Y)
classes

In [None]:
# Confusion matrix of the fitted model on the held-out test split.
y_pred_test = modelo.predict(test_x_vector)
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap=plt.cm.Reds)

In [None]:
# Main performance indicators on the test split: precision, recall, f1-score, support, accuracy.
y_pred_reporte = modelo.predict(test_x_vector)
reporte = classification_report(y_test, y_pred_reporte, labels=classes)
print(reporte)

In [None]:
# Train a fresh model with 100% of the labeled data (total_x_vector).
parametros_elegidos = dict(C=10, gamma=0.1, kernel='linear')
modelo = SVC(random_state=11, **parametros_elegidos)
modelo.fit(total_x_vector, y_total)

In [None]:
# Label the rest of the dataset (performing the predictions).
# The column "Modelo_Cog" gets 2 when no key word was found in the note (index -1);
# otherwise it gets the label predicted by the model for the extracted phrase.
SIN_PALABRA_CLAVE = 2
encontrado = df_cog['Indice_palabra'] != -1

etiquetas_modelo = pd.Series(SIN_PALABRA_CLAVE, index=df_cog.index)
if encontrado.any():
    # One batched transform/predict call for all phrases. The previous loop predicted row
    # by row, wrote through chained assignment (not written back under pandas
    # copy-on-write) and stored each result as a 1-element array instead of a scalar label.
    frases = df_cog.loc[encontrado, 'Frase_extraida']
    etiquetas_modelo.loc[encontrado] = modelo.predict(tfidf.transform(frases))
df_cog['Modelo_Cog'] = etiquetas_modelo

df_cog

In [None]:
# Save the labeled frame to a semicolon-separated CSV in the working directory.
df_cog.to_csv('Modelo_Cog.csv', sep=';')

## 3. Identification of the "tremor symptom"

In [None]:
# Patient key and physician's notes for the tremor-symptom model.
# .copy() gives df_temb its own data, so the columns added later are written to this frame
# and not to a slice of df3 (avoids SettingWithCopyWarning / lost writes).
df_temb = df3[["KeyAnonimo", 'analisis']].copy()
df_temb

In [None]:
# Search terms - key words (list of words - "dictionary") for the tremor symptom.
# The two result columns are created empty here and filled by the search cells below.
df_temb['Indice_palabra'] = ''
df_temb['Frase_extraida'] = ''
palabra1 = "parestesia"
palabra2 = "temblor"
palabra3 = "hormigueo"
palabra4 = "sacudida"

In [None]:
arraym = df_temb['analisis']
arraym

In [None]:
# Perform the search: store in "Indice_palabra" the position at which the first matching
# key word begins. Key words are tried in priority order (palabra1 ... palabra4) and the
# first one present in the text wins; -1 means that none of them was found.
def _primer_indice(texto, palabras):
    """Return the start index in *texto* of the first key word found, or -1 if none is."""
    for palabra in palabras:
        indice = texto.find(palabra)
        if indice != -1:
            return indice
    return -1


palabras_temb = (palabra1, palabra2, palabra3, palabra4)
# The column is built in one assignment. The previous cell-by-cell chained assignment
# (df_temb.Indice_palabra[i] = ...) was slow, raised SettingWithCopyWarning, is not written
# back under pandas copy-on-write and only worked with a 0..n-1 index.
df_temb['Indice_palabra'] = [_primer_indice(texto, palabras_temb) for texto in df_temb['analisis']]

In [None]:
# Create the column "Frase_extraida": up to 150 characters of text on each side of the
# position where the key word was found, or "Sin información" when no key word was found.
def _extraer_frase(texto, indice, ventana=150):
    """Return the window of *texto* around *indice*, or "Sin información" when indice is -1."""
    if indice == -1:
        return "Sin información"
    # Clamp the start at 0 so a match near the beginning keeps the text from the start.
    inicio = max(indice - ventana, 0)
    return texto[inicio:indice + ventana]


# Built in one assignment instead of cell-by-cell chained assignment (slow, and not
# written back to the frame under pandas copy-on-write).
df_temb['Frase_extraida'] = [
    _extraer_frase(texto, indice)
    for texto, indice in zip(df_temb['analisis'], df_temb['Indice_palabra'])
]

df_temb

In [None]:
# use a sample of the data for manual labeling by practicing physicians
#data_train_temb = df_temb['Frase_extraida'].sample(n=30000, random_state=7)
#data_train_temb.to_excel("RH_temb.xlsx")
#data_train_temb

In [None]:
# ML model
# Read the labeled data; only the phrase and its label are kept.
# NOTE(review): this is a local Windows path, while the hypoglycemia model reads its labeled
# file from Google Drive (ruta2) - confirm where this file lives before running on Colab.
df_etiquetas = pd.read_csv('C:/Users/diabetes.ml1/Downloads/Data_RiesgoHipoglicemia/Oraciones para etiquetar/3. RH_temb_CP_CSV.csv' ,sep=";", low_memory=False)
df_etiquetas = df_etiquetas[['Frase_extraida', 'Etiqueta']]
df_etiquetas

In [None]:
df_etiquetas.value_counts('Etiqueta')

In [None]:
# Graph the proportion of classes as a pie chart.
# value_counts() orders classes by frequency, so hard-coded labels ['0', '1'] are wrong
# whenever class 1 is the most frequent. The counts are sorted by class and the wedge
# labels are taken from the data itself.
conteo_clases = df_etiquetas['Etiqueta'].value_counts().sort_index()
plt.pie(conteo_clases,
        labels = conteo_clases.index.astype(str), autopct='%.2f%%');

In [None]:
# Labeled frame as a raw array; Y is the label column as an (n, 1) integer array.
df = df_etiquetas.to_numpy()
Y = df[:, 1:].astype(int)

In [None]:
# Create the training and test partition: 80/20, stratified by label.
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_etiquetas,
    test_size=0.2,
    random_state=10,
    stratify=df_etiquetas['Etiqueta'],
)

X_train = train['Frase_extraida']
y_train = train['Etiqueta']
X_test = test['Frase_extraida']
y_test = test['Etiqueta']
X_total = df_etiquetas['Frase_extraida']
y_total = df_etiquetas['Etiqueta']

# Class counts: full label array, training split, test split and all records.
for etiquetas in (Y, y_train, y_test, y_total):
    print(np.unique(etiquetas, return_counts=True))

In [None]:
# Vectorization to convert words to numbers for machine learning.
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation vectorizer: vocabulary and IDF weights are learned from the training split
# only. Fitting on all labeled phrases (as before) lets the held-out test phrases leak
# into model selection and into the reported test metrics.
tfidf_eval = TfidfVectorizer()
train_x_vector = tfidf_eval.fit_transform(X_train)
test_x_vector = tfidf_eval.transform(X_test)

# Final vectorizer: fitted on every labeled phrase. It produces total_x_vector and is the
# one to use, together with a model trained on total_x_vector, to label new phrases.
# Its number of columns can differ from train_x_vector / test_x_vector.
tfidf = TfidfVectorizer()
total_x_vector = tfidf.fit_transform(X_total)

In [None]:
# run the different classification models, using grid search to find the optimal hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Model selection: each candidate is tuned by exhaustive grid search with 10-fold
# cross-validation on the training vectors. GaussianNB has no grid and is only scored.

# Decision tree
dt_param_grid = {'criterion': ['gini', 'entropy']}
dt = DecisionTreeClassifier(random_state=11)
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=10)
dt_cv.fit(train_x_vector, y_train)

# Support vector machine
sv_param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.5, 1, 10, 50, 100],
    'gamma': [0.1, 'auto'],
}
sv = SVC(random_state=11)
sv_cv = GridSearchCV(estimator=sv, param_grid=sv_param_grid, cv=10)
sv_cv.fit(train_x_vector, y_train)

# Gaussian naive Bayes (the vectors are converted to a dense array for it)
nb = GaussianNB()
nb_fold_scores = cross_val_score(nb, train_x_vector.toarray(), y_train, cv=10)
nb_accuracy_train = nb_fold_scores.mean()

# Logistic regression
lg_param_grid = {
    'C': [1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [7000],
}
lg = LogisticRegression(random_state=11)
lg_cv = GridSearchCV(estimator=lg, param_grid=lg_param_grid, cv=10)
lg_cv.fit(train_x_vector, y_train)

# K nearest neighbours
knn_param_grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
knn = KNeighborsClassifier(n_jobs=-1)
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=10)
knn_cv.fit(train_x_vector, y_train)

# Multi-layer perceptron: disabled for this symptom (kept for reference).
#mlp_param_grid = {'hidden_layer_sizes':[10, 50],
#              'activation':['identity', 'logistic', 'tanh', 'relu'],
#              'solver':['lbfgs', 'sgd', 'adam']}
#mlp = MLPClassifier(alpha=1e-5, random_state = 11, max_iter=2000)
#mlp_cv = GridSearchCV(mlp , mlp_param_grid, cv=10)
#mlp_cv.fit(train_x_vector, y_train)


In [None]:
# Print the best cross-validated score and parameters of every candidate
# (the MLP search is disabled for this symptom).
def _resumen(nombre, busqueda):
    """Format the best score and best parameters of a fitted grid search."""
    return f'{nombre}: {busqueda.best_score_}, {busqueda.best_params_}'


print(_resumen('DT', dt_cv))
print(_resumen('SVM', sv_cv))
print(f'GNB: {nb_accuracy_train}')
print(_resumen('LG', lg_cv))
print(_resumen('KNN', knn_cv))
#print(f'MLP: {mlp_cv.best_score_}, {mlp_cv.best_params_}')

In [None]:
# Fit the chosen ("best") model configuration on the training split.
parametros_elegidos = dict(C=1, gamma=0.1, kernel='linear')
modelo = SVC(random_state=11, **parametros_elegidos)
modelo.fit(train_x_vector, y_train)

In [None]:
classes = np.unique(Y)
classes

In [None]:
# Confusion matrix of the fitted model on the held-out test split.
y_pred_test = modelo.predict(test_x_vector)
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap=plt.cm.Reds)

In [None]:
# Main performance indicators on the test split: precision, recall, f1-score, support, accuracy.
y_pred_reporte = modelo.predict(test_x_vector)
reporte = classification_report(y_test, y_pred_reporte, labels=classes)
print(reporte)

In [None]:
# Train a fresh model with 100% of the labeled data (total_x_vector).
parametros_elegidos = dict(C=1, gamma=0.1, kernel='linear')
modelo = SVC(random_state=11, **parametros_elegidos)
modelo.fit(total_x_vector, y_total)

In [None]:
# Label the rest of the dataset (performing the predictions).
# The column "Modelo_temb" gets 2 when no key word was found in the note (index -1);
# otherwise it gets the label predicted by the model for the extracted phrase.
SIN_PALABRA_CLAVE = 2
encontrado = df_temb['Indice_palabra'] != -1

etiquetas_modelo = pd.Series(SIN_PALABRA_CLAVE, index=df_temb.index)
if encontrado.any():
    # One batched transform/predict call for all phrases. The previous loop predicted row
    # by row, wrote through chained assignment (not written back under pandas
    # copy-on-write) and stored each result as a 1-element array instead of a scalar label.
    frases = df_temb.loc[encontrado, 'Frase_extraida']
    etiquetas_modelo.loc[encontrado] = modelo.predict(tfidf.transform(frases))
df_temb['Modelo_temb'] = etiquetas_modelo

df_temb

In [None]:
# Save the labeled frame to a semicolon-separated CSV in the working directory.
df_temb.to_csv('Modelo_temb.csv', sep=';')

## 4. Identification of the "cardiac symptom"

In [None]:
# Patient key and physician's notes for the cardiac-symptom model.
# .copy() gives df_card its own data, so the columns added later are written to this frame
# and not to a slice of df3 (avoids SettingWithCopyWarning / lost writes).
df_card = df3[["KeyAnonimo", 'analisis']].copy()
df_card

In [None]:
# Search terms - key words (list of words - "dictionary") for the cardiac symptom.
# The two result columns are created empty here and filled by the search cells below.
df_card['Indice_palabra'] = ''
df_card['Frase_extraida'] = ''
# Some terms are word stems, so they match any ending (e.g. "palpit").
palabra1 = "frecuencia cardiac"
palabra2 = "taquicardia"
palabra3 = "palpit"
palabra4 = "latido"
# NOTE(review): any text containing this term also contains palabra4 ("latido"), which is
# searched first, so this entry can never be the one that matches.
palabra5 = "latidos cardiaco"
palabra6 = "ritmo"
# The surrounding spaces are part of the search term (stand-alone abbreviation "fc").
palabra7 = " fc "

In [None]:
# Perform the search: store in "Indice_palabra" the position at which the first matching
# key word begins. Key words are tried in priority order (palabra1 ... palabra7) and the
# first one present in the text wins; -1 means that none of them was found.
def _primer_indice(texto, palabras):
    """Return the start index in *texto* of the first key word found, or -1 if none is."""
    for palabra in palabras:
        indice = texto.find(palabra)
        if indice != -1:
            return indice
    return -1


palabras_card = (palabra1, palabra2, palabra3, palabra4, palabra5, palabra6, palabra7)
# The notes are read from df_card itself: this section never reassigns `arraym`, so the
# previous loop silently depended on the value left over from the tremor section.
# The column is built in one assignment instead of cell-by-cell chained assignment
# (slow, SettingWithCopyWarning, not written back under pandas copy-on-write).
df_card['Indice_palabra'] = [_primer_indice(texto, palabras_card) for texto in df_card['analisis']]

In [None]:
# Create the column "Frase_extraida": up to 150 characters of text on each side of the
# position where the key word was found, or "Sin información" when no key word was found.
def _extraer_frase(texto, indice, ventana=150):
    """Return the window of *texto* around *indice*, or "Sin información" when indice is -1."""
    if indice == -1:
        return "Sin información"
    # Clamp the start at 0 so a match near the beginning keeps the text from the start.
    inicio = max(indice - ventana, 0)
    return texto[inicio:indice + ventana]


# The notes are read from df_card itself rather than from `arraym`, which this section
# never reassigns. Built in one assignment instead of cell-by-cell chained assignment.
df_card['Frase_extraida'] = [
    _extraer_frase(texto, indice)
    for texto, indice in zip(df_card['analisis'], df_card['Indice_palabra'])
]

df_card

In [None]:
# use a sample of the data for manual labeling by practicing physicians
#data_train_card = df_card['Frase_extraida'].sample(n=20000, random_state=7)
#data_train_card.to_excel("RH_card.xlsx")
#data_train_card

In [None]:
# ML Model
# Read the labeled data; only the phrase and its label are kept.
# NOTE(review): this is a local Windows path, while the hypoglycemia model reads its labeled
# file from Google Drive (ruta2) - confirm where this file lives before running on Colab.
df_etiquetas = pd.read_csv('C:/Users/diabetes.ml1/Downloads/Data_RiesgoHipoglicemia/Oraciones para etiquetar/4. RH_card_CP_CSV.csv' ,sep=";", low_memory=False)
df_etiquetas = df_etiquetas[['Frase_extraida', 'Etiqueta']]
df_etiquetas

In [None]:
df_etiquetas.value_counts('Etiqueta')

In [None]:
# Graph the proportion of classes as a pie chart.
# value_counts() orders classes by frequency, so hard-coded labels ['0', '1'] are wrong
# whenever class 1 is the most frequent. The counts are sorted by class and the wedge
# labels are taken from the data itself.
conteo_clases = df_etiquetas['Etiqueta'].value_counts().sort_index()
plt.pie(conteo_clases,
        labels = conteo_clases.index.astype(str), autopct='%.2f%%');

In [None]:
# Labeled frame as a raw array; Y is the label column as an (n, 1) integer array.
df = df_etiquetas.to_numpy()
Y = df[:, 1:].astype(int)

In [None]:
# create training and test partition: 80/20
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the labeled sentences.
train, test = train_test_split(
    df_etiquetas,
    test_size=0.2,
    random_state=10,
    stratify=df_etiquetas['Etiqueta'],
)

# Sentence (feature) and label columns for each partition and for the full set.
X_train, X_test, X_total = (parte['Frase_extraida'] for parte in (train, test, df_etiquetas))
y_train, y_test, y_total = (parte['Etiqueta'] for parte in (train, test, df_etiquetas))

# Class counts: raw label array first, then train / test / total.
for etiquetas in (Y, y_train, y_test, y_total):
    print(np.unique(etiquetas, return_counts=True))

In [None]:
# vectorization to convert words to numbers for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer used only for evaluation: vocabulary and IDF weights are learned
# from the training partition alone, so nothing from the held-out test
# sentences leaks into the features used for model selection and testing.
# (The original fitted a single vectorizer on X_total, test rows included.)
tfidf_eval = TfidfVectorizer()
train_x_vector = tfidf_eval.fit_transform(X_train)
test_x_vector = tfidf_eval.transform(X_test)

# Vectorizer for the final model: fitted on every labeled sentence. `tfidf`
# and `total_x_vector` keep their original meaning because later cells train
# on 100% of the data and label the full dataset with them.
tfidf = TfidfVectorizer()
total_x_vector = tfidf.fit_transform(X_total)

In [None]:
# run the different classification models, using grid search to find the optimal hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Decision tree: compare the two split criteria.
dt_param_grid = {'criterion': ['gini', 'entropy']}
dt = DecisionTreeClassifier(random_state=11)
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=10)
dt_cv.fit(train_x_vector, y_train)

# SVM: kernel type, regularization strength C and kernel coefficient gamma.
sv_param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.5, 1, 10, 50, 100],
    'gamma': [0.1, 'auto'],
}
sv = SVC(random_state=11)
sv_cv = GridSearchCV(estimator=sv, param_grid=sv_param_grid, cv=10)
sv_cv.fit(train_x_vector, y_train)

# Gaussian naive Bayes: nothing to tune, only the mean 10-fold CV score.
# toarray() densifies the sparse TF-IDF matrix before it is passed in.
nb = GaussianNB()
nb_accuracy_train = cross_val_score(
    nb, train_x_vector.toarray(), y_train, cv=10
).mean()

# Logistic regression: regularization strength and solver (fixed iteration cap).
lg_param_grid = {
    'C': [1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [7000],
}
lg = LogisticRegression(random_state=11)
lg_cv = GridSearchCV(estimator=lg, param_grid=lg_param_grid, cv=10)
lg_cv.fit(train_x_vector, y_train)

# k-nearest neighbours: k = 1..20 under four distance metrics.
knn_param_grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
knn = KNeighborsClassifier(n_jobs=-1)
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=10)
knn_cv.fit(train_x_vector, y_train)

#MLP
#mlp_param_grid = {'hidden_layer_sizes':[10, 50],
#              'activation':['identity', 'logistic', 'tanh', 'relu'],
#              'solver':['lbfgs', 'sgd', 'adam']}
#mlp = MLPClassifier(alpha=1e-5, random_state = 11, max_iter=2000)
#mlp_cv = GridSearchCV(mlp , mlp_param_grid, cv=10)
#mlp_cv.fit(train_x_vector, y_train)


In [None]:
# Best mean 10-fold cross-validation score of each grid search together with
# the hyperparameters that achieved it; GNB had no grid, so only its mean
# cross-validation score is shown.
print(f'DT: {dt_cv.best_score_}, {dt_cv.best_params_}')
print(f'SVM: {sv_cv.best_score_}, {sv_cv.best_params_}')
print(f'GNB: {nb_accuracy_train}')
print(f'LG: {lg_cv.best_score_}, {lg_cv.best_params_}')
print(f'KNN: {knn_cv.best_score_}, {knn_cv.best_params_}')
#print(f'MLP: {mlp_cv.best_score_}, {mlp_cv.best_params_}')

In [None]:
# using the best model
# The hyperparameters are written out by hand (not read from the grid-search
# objects). The model is fitted on the training partition only, so the
# following cells can evaluate it on the held-out test partition.
modelo = DecisionTreeClassifier(random_state=11, criterion = 'gini')
modelo.fit(train_x_vector,y_train)

In [None]:
# sorted distinct label values; used as display labels and report rows below
classes = np.unique(Y)
classes

In [None]:
# create a confusion matrix from the predictions on the held-out test
# partition and plot it with a red colormap
cm=confusion_matrix(y_test, modelo.predict(test_x_vector))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap=plt.cm.Reds)

In [None]:
# display the main performance indicators on the test partition:
# precision, recall, f1-score, support, accuracy
print(classification_report(y_test, modelo.predict(test_x_vector),
                      labels = classes))

In [None]:
# train with 100% of the data
# Same estimator and hyperparameters as the evaluated model, refitted on all
# labeled sentences; this overwrites `modelo`.
modelo = DecisionTreeClassifier(random_state=11, criterion = 'gini')
modelo.fit(total_x_vector,y_total)

In [None]:
# label the rest of the dataset (performing the predictions)
# Modelo_card is 2 when no keyword was found (Indice_palabra == -1); otherwise
# it is the class predicted for the extracted sentence.
# All sentences are vectorized and predicted in one batch and the column is
# assigned in one step: the original per-row `df_card.Modelo_card[i] = ...`
# was a chained assignment (SettingWithCopyWarning, no effect under pandas
# copy-on-write) and stored a one-element array per cell instead of the label.
# NOTE(review): cells now hold plain labels (e.g. 1, not [1]); check any
# consumer of the exported CSV that expected the bracketed form.
sin_palabra = df_card['Indice_palabra'] == -1
etiquetas_modelo = pd.Series(2, index=df_card.index)
if not sin_palabra.all():
    frases_con_palabra = df_card.loc[~sin_palabra, 'Frase_extraida']
    etiquetas_modelo.loc[~sin_palabra] = modelo.predict(tfidf.transform(frases_con_palabra))
df_card['Modelo_card'] = etiquetas_modelo

df_card

In [None]:
# save DF to csv (';'-separated, relative path, so it is written to the
# current working directory)
df_card.to_csv('Modelo_card.csv', sep=';')

## Identification of the "vision symptom"

In [None]:
# read the data
# Work on an independent copy of the two columns needed for this symptom.
# Without .copy() the selection stays linked to df3, and adding the
# Indice_palabra / Frase_extraida / Modelo_vis columns later raises
# SettingWithCopyWarning.
df_vis = df3[["KeyAnonimo", 'analisis']].copy()
df_vis

In [None]:
# search terms - key words (list of words - "dictionary")
# The two result columns start as empty strings and are filled in by the
# search and extraction cells that follow.
df_vis['Indice_palabra'] = ''
df_vis['Frase_extraida'] = ''
# Keywords are tried in this order by the search loop; the first one found wins.
# NOTE(review): any text containing "perdida visual" also contains " visual"
# (palabra3), which is tested earlier, so palabra5 never decides a match.
palabra1 = "vision borros"
palabra2 = " vision"
palabra3 = " visual"
palabra4 = "enfocar"
palabra5 = "perdida visual"
palabra6 = " ver "

In [None]:
# the 'analisis' text column that the keyword search scans
arraym = df_vis['analisis']
arraym

In [None]:
# performing the search...
# For each text, store the position of the first keyword that occurs in it,
# trying the keywords in priority order (palabra1 first); -1 means that none
# of them was found.
# The column is assigned in one step: the original
# `df_vis.Indice_palabra[i] = ...` chained assignment raises
# SettingWithCopyWarning and does not update the DataFrame under pandas
# copy-on-write. Each keyword is also searched once instead of twice.
palabras_clave = (palabra1, palabra2, palabra3, palabra4, palabra5, palabra6)
indices_palabra = []
for texto in arraym:
    posicion = -1
    for palabra_clave in palabras_clave:
        posicion = texto.find(palabra_clave)
        if posicion != -1:
            break
    indices_palabra.append(posicion)
df_vis['Indice_palabra'] = indices_palabra

In [None]:
# Build "Frase_extraida": up to 150 characters of context on each side of the
# keyword position stored in "Indice_palabra"; rows without a keyword (-1) get
# the placeholder "Sin información".
# The column is assigned in one step instead of `df_vis.Frase_extraida[i] = ...`:
# that chained assignment raises SettingWithCopyWarning and does not update the
# DataFrame when pandas copy-on-write is active.
# Rows are paired positionally; arraym is df_vis['analisis'], so both share
# the same row order.
ventana = 150
frases = []
for texto, posicion in zip(arraym, df_vis['Indice_palabra']):
    if posicion == -1:
        frases.append("Sin información")
    else:
        # max(..., 0) keeps the window inside the text when the keyword is
        # closer than `ventana` characters to the beginning.
        frases.append(texto[max(posicion - ventana, 0):posicion + ventana])
df_vis['Frase_extraida'] = frases

df_vis

In [None]:
# use a sample of the data for manual labeling by practicing physicians
#data_train_vis = df_vis['Frase_extraida'].sample(n=20000, random_state=7)
#data_train_vis.to_excel("RH_vis.xlsx")
#data_train_vis

In [None]:
# ML Model
# read the labeled data (';'-separated CSV), keeping only the sentence and its label
# NOTE(review): this is an absolute Windows path, while the raw data at the top
# of the notebook is read from Google Drive through `ruta`; confirm the file is
# reachable from the environment where this cell runs.
df_etiquetas = pd.read_csv('C:/Users/diabetes.ml1/Downloads/Data_RiesgoHipoglicemia/Oraciones para etiquetar/5. RH_vis_CP_CSV.csv' ,sep=";", low_memory=False)
df_etiquetas = df_etiquetas[['Frase_extraida', 'Etiqueta']]
df_etiquetas

In [None]:
# number of labeled sentences per class
df_etiquetas.value_counts('Etiqueta')

In [None]:
# graph the proportion of classes as a pie chart
# value_counts() orders classes by frequency, so the wedge labels are taken
# from the counts' own index (sorted by class) instead of being hard-coded as
# ['0', '1'], which mislabels the wedges whenever class 1 is the majority.
conteo_clases = df_etiquetas['Etiqueta'].value_counts().sort_index()
plt.pie(conteo_clases,
        labels = conteo_clases.index.astype(str), autopct='%.2f%%');

In [None]:
# Labeled data as a NumPy array; Y keeps every column after the first (here
# only 'Etiqueta') cast to int, as a 2-D single-column array.
df = df_etiquetas.values
Y = df[:,1:].astype(int)

In [None]:
# create training and test partition: 80/20
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the labeled sentences.
train, test = train_test_split(
    df_etiquetas,
    test_size=0.2,
    random_state=10,
    stratify=df_etiquetas['Etiqueta'],
)

# Sentence (feature) and label columns for each partition and for the full set.
X_train, X_test, X_total = (parte['Frase_extraida'] for parte in (train, test, df_etiquetas))
y_train, y_test, y_total = (parte['Etiqueta'] for parte in (train, test, df_etiquetas))

# Class counts: raw label array first, then train / test / total.
for etiquetas in (Y, y_train, y_test, y_total):
    print(np.unique(etiquetas, return_counts=True))

In [None]:
# vectorization to convert words to numbers for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer used only for evaluation: vocabulary and IDF weights are learned
# from the training partition alone, so nothing from the held-out test
# sentences leaks into the features used for model selection and testing.
# (The original fitted a single vectorizer on X_total, test rows included.)
tfidf_eval = TfidfVectorizer()
train_x_vector = tfidf_eval.fit_transform(X_train)
test_x_vector = tfidf_eval.transform(X_test)

# Vectorizer for the final model: fitted on every labeled sentence. `tfidf`
# and `total_x_vector` keep their original meaning because later cells train
# on 100% of the data and label the full dataset with them.
tfidf = TfidfVectorizer()
total_x_vector = tfidf.fit_transform(X_total)

In [None]:
# run the different classification models, using grid search to find the optimal hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Decision tree: compare the two split criteria.
dt_param_grid = {'criterion': ['gini', 'entropy']}
dt = DecisionTreeClassifier(random_state=11)
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=10)
dt_cv.fit(train_x_vector, y_train)

# SVM: kernel type, regularization strength C and kernel coefficient gamma.
sv_param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.5, 1, 10, 50, 100],
    'gamma': [0.1, 'auto'],
}
sv = SVC(random_state=11)
sv_cv = GridSearchCV(estimator=sv, param_grid=sv_param_grid, cv=10)
sv_cv.fit(train_x_vector, y_train)

# Gaussian naive Bayes: nothing to tune, only the mean 10-fold CV score.
# toarray() densifies the sparse TF-IDF matrix before it is passed in.
nb = GaussianNB()
nb_accuracy_train = cross_val_score(
    nb, train_x_vector.toarray(), y_train, cv=10
).mean()

# Logistic regression: regularization strength and solver (fixed iteration cap).
lg_param_grid = {
    'C': [1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [7000],
}
lg = LogisticRegression(random_state=11)
lg_cv = GridSearchCV(estimator=lg, param_grid=lg_param_grid, cv=10)
lg_cv.fit(train_x_vector, y_train)

# k-nearest neighbours: k = 1..20 under four distance metrics.
knn_param_grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
knn = KNeighborsClassifier(n_jobs=-1)
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=10)
knn_cv.fit(train_x_vector, y_train)

#MLP
#mlp_param_grid = {'hidden_layer_sizes':[10, 50],
#              'activation':['identity', 'logistic', 'tanh', 'relu'],
#              'solver':['lbfgs', 'sgd', 'adam']}
#mlp = MLPClassifier(alpha=1e-5, random_state = 11, max_iter=2000)
#mlp_cv = GridSearchCV(mlp , mlp_param_grid, cv=10)
#mlp_cv.fit(train_x_vector, y_train)


In [None]:
# Best mean 10-fold cross-validation score of each grid search together with
# the hyperparameters that achieved it; GNB had no grid, so only its mean
# cross-validation score is shown.
print(f'DT: {dt_cv.best_score_}, {dt_cv.best_params_}')
print(f'SVM: {sv_cv.best_score_}, {sv_cv.best_params_}')
print(f'GNB: {nb_accuracy_train}')
print(f'LG: {lg_cv.best_score_}, {lg_cv.best_params_}')
print(f'KNN: {knn_cv.best_score_}, {knn_cv.best_params_}')
#print(f'MLP: {mlp_cv.best_score_}, {mlp_cv.best_params_}')

In [None]:
# using the best model
# The hyperparameters are written out by hand (not read from the grid-search
# objects). The model is fitted on the training partition only, so the
# following cells can evaluate it on the held-out test partition.
modelo = SVC(random_state=11, C = 1, gamma = 0.1, kernel = 'linear' )
modelo.fit(train_x_vector,y_train)

In [None]:
# sorted distinct label values; used as display labels and report rows below
classes = np.unique(Y)
classes

In [None]:
# create a confusion matrix from the predictions on the held-out test
# partition and plot it with a red colormap
cm=confusion_matrix(y_test, modelo.predict(test_x_vector))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap=plt.cm.Reds)

In [None]:
# display the main performance indicators on the test partition:
# precision, recall, f1-score, support, accuracy
print(classification_report(y_test, modelo.predict(test_x_vector),
                      labels = classes))

In [None]:
# train with 100% of the data
# Same estimator and hyperparameters as the evaluated model, refitted on all
# labeled sentences; this overwrites `modelo`.
modelo = SVC(random_state=11, C = 1, gamma = 0.1, kernel = 'linear' )
modelo.fit(total_x_vector,y_total)

In [None]:
# label the rest of the dataset (performing the predictions)
# Modelo_vis is 2 when no keyword was found (Indice_palabra == -1); otherwise
# it is the class predicted for the extracted sentence.
# All sentences are vectorized and predicted in one batch and the column is
# assigned in one step: the original per-row `df_vis.Modelo_vis[i] = ...`
# was a chained assignment (SettingWithCopyWarning, no effect under pandas
# copy-on-write) and stored a one-element array per cell instead of the label.
# NOTE(review): cells now hold plain labels (e.g. 1, not [1]); check any
# consumer of the exported CSV that expected the bracketed form.
sin_palabra = df_vis['Indice_palabra'] == -1
etiquetas_modelo = pd.Series(2, index=df_vis.index)
if not sin_palabra.all():
    frases_con_palabra = df_vis.loc[~sin_palabra, 'Frase_extraida']
    etiquetas_modelo.loc[~sin_palabra] = modelo.predict(tfidf.transform(frases_con_palabra))
df_vis['Modelo_vis'] = etiquetas_modelo

df_vis

In [None]:
# Save DF to csv (';'-separated, relative path, so it is written to the
# current working directory)
df_vis.to_csv('Modelo_vis.csv', sep=';')

## Identification of the "irritability symptom"

In [None]:
# read the data
# Work on an independent copy of the two columns needed for this symptom.
# Without .copy() the selection stays linked to df3, and adding the
# Indice_palabra / Frase_extraida / Modelo_irri columns later raises
# SettingWithCopyWarning.
df_irri = df3[["KeyAnonimo", 'analisis']].copy()
df_irri

In [None]:
# search terms - key words (list of words - "dictionary")
# The two result columns start as empty strings and are filled in by the
# search and extraction cells that follow.
df_irri['Indice_palabra'] = ''
df_irri['Frase_extraida'] = ''
# Keywords are tried in this order by the search loop; the first one found wins.
# Most are word stems, so they also match inflected forms through substring search.
palabra1 = "frenetic"
palabra2 = "ansios"
palabra3 = "ansiedad"
palabra4 = "angustia"
palabra5 = "nervios"
palabra6 = "depresi"
palabra7 = "locura"
palabra8 = "triste" #irritable
palabra9 = "estres" #alterad

In [None]:
# the 'analisis' text column that the keyword search scans
arraym = df_irri['analisis']
arraym

In [None]:
# performing the search...
# For each text, store the position of the first keyword that occurs in it,
# trying the keywords in priority order (palabra1 first); -1 means that none
# of them was found.
# The column is assigned in one step: the original
# `df_irri.Indice_palabra[i] = ...` chained assignment raises
# SettingWithCopyWarning and does not update the DataFrame under pandas
# copy-on-write. Each keyword is also searched once instead of twice.
palabras_clave = (palabra1, palabra2, palabra3, palabra4, palabra5,
                  palabra6, palabra7, palabra8, palabra9)
indices_palabra = []
for texto in arraym:
    posicion = -1
    for palabra_clave in palabras_clave:
        posicion = texto.find(palabra_clave)
        if posicion != -1:
            break
    indices_palabra.append(posicion)
df_irri['Indice_palabra'] = indices_palabra

In [None]:
# Build "Frase_extraida": up to 150 characters of context on each side of the
# keyword position stored in "Indice_palabra"; rows without a keyword (-1) get
# the placeholder "Sin información".
# The column is assigned in one step instead of `df_irri.Frase_extraida[i] = ...`:
# that chained assignment raises SettingWithCopyWarning and does not update the
# DataFrame when pandas copy-on-write is active.
# Rows are paired positionally; arraym is df_irri['analisis'], so both share
# the same row order.
ventana = 150
frases = []
for texto, posicion in zip(arraym, df_irri['Indice_palabra']):
    if posicion == -1:
        frases.append("Sin información")
    else:
        # max(..., 0) keeps the window inside the text when the keyword is
        # closer than `ventana` characters to the beginning.
        frases.append(texto[max(posicion - ventana, 0):posicion + ventana])
df_irri['Frase_extraida'] = frases

df_irri

In [None]:
# use a sample of the data for manual labeling by practicing physicians
#data_train_irri = df_irri['Frase_extraida'].sample(n=30000, random_state=7)
#data_train_irri.to_excel("RH_irri.xlsx")
#data_train_irri

In [None]:
# ML Model
# read the labeled data (';'-separated CSV), keeping only the sentence and its label
# NOTE(review): this is an absolute Windows path, while the raw data at the top
# of the notebook is read from Google Drive through `ruta`; confirm the file is
# reachable from the environment where this cell runs.
df_etiquetas = pd.read_csv('C:/Users/diabetes.ml1/Downloads/Data_RiesgoHipoglicemia/Oraciones para etiquetar/6. RH_irri_CP_CSV.csv' ,sep=";", low_memory=False)
df_etiquetas = df_etiquetas[['Frase_extraida', 'Etiqueta']]
df_etiquetas

In [None]:
# number of labeled sentences per class
df_etiquetas.value_counts('Etiqueta')

In [None]:
# graph the proportion of classes as a pie chart
# value_counts() orders classes by frequency, so the wedge labels are taken
# from the counts' own index (sorted by class) instead of being hard-coded as
# ['0', '1'], which mislabels the wedges whenever class 1 is the majority.
conteo_clases = df_etiquetas['Etiqueta'].value_counts().sort_index()
plt.pie(conteo_clases,
        labels = conteo_clases.index.astype(str), autopct='%.2f%%');

In [None]:
# Labeled data as a NumPy array; Y keeps every column after the first (here
# only 'Etiqueta') cast to int, as a 2-D single-column array.
df = df_etiquetas.values
Y = df[:,1:].astype(int)

In [None]:
# create training and test partition: 80/20
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the labeled sentences.
train, test = train_test_split(
    df_etiquetas,
    test_size=0.2,
    random_state=10,
    stratify=df_etiquetas['Etiqueta'],
)

# Sentence (feature) and label columns for each partition and for the full set.
X_train, X_test, X_total = (parte['Frase_extraida'] for parte in (train, test, df_etiquetas))
y_train, y_test, y_total = (parte['Etiqueta'] for parte in (train, test, df_etiquetas))

# Class counts: raw label array first, then train / test / total.
for etiquetas in (Y, y_train, y_test, y_total):
    print(np.unique(etiquetas, return_counts=True))

In [None]:
# vectorization to convert words to numbers for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer used only for evaluation: vocabulary and IDF weights are learned
# from the training partition alone, so nothing from the held-out test
# sentences leaks into the features used for model selection and testing.
# (The original fitted a single vectorizer on X_total, test rows included.)
tfidf_eval = TfidfVectorizer()
train_x_vector = tfidf_eval.fit_transform(X_train)
test_x_vector = tfidf_eval.transform(X_test)

# Vectorizer for the final model: fitted on every labeled sentence. `tfidf`
# and `total_x_vector` keep their original meaning because later cells train
# on 100% of the data and label the full dataset with them.
tfidf = TfidfVectorizer()
total_x_vector = tfidf.fit_transform(X_total)

In [None]:
# run the different classification models, using grid search to find the optimal hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Decision tree: compare the two split criteria.
dt_param_grid = {'criterion': ['gini', 'entropy']}
dt = DecisionTreeClassifier(random_state=11)
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=10)
dt_cv.fit(train_x_vector, y_train)

# SVM: kernel type, regularization strength C and kernel coefficient gamma.
sv_param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.5, 1, 10, 50, 100],
    'gamma': [0.1, 'auto'],
}
sv = SVC(random_state=11)
sv_cv = GridSearchCV(estimator=sv, param_grid=sv_param_grid, cv=10)
sv_cv.fit(train_x_vector, y_train)

# Gaussian naive Bayes: nothing to tune, only the mean 10-fold CV score.
# toarray() densifies the sparse TF-IDF matrix before it is passed in.
nb = GaussianNB()
nb_accuracy_train = cross_val_score(
    nb, train_x_vector.toarray(), y_train, cv=10
).mean()

# Logistic regression: regularization strength and solver (fixed iteration cap).
lg_param_grid = {
    'C': [1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [7000],
}
lg = LogisticRegression(random_state=11)
lg_cv = GridSearchCV(estimator=lg, param_grid=lg_param_grid, cv=10)
lg_cv.fit(train_x_vector, y_train)

# k-nearest neighbours: k = 1..20 under four distance metrics.
knn_param_grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
knn = KNeighborsClassifier(n_jobs=-1)
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=10)
knn_cv.fit(train_x_vector, y_train)

#MLP
#mlp_param_grid = {'hidden_layer_sizes':[10, 50],
#              'activation':['identity', 'logistic', 'tanh', 'relu'],
#              'solver':['lbfgs', 'sgd', 'adam']}
#mlp = MLPClassifier(alpha=1e-5, random_state = 11, max_iter=2000)
#mlp_cv = GridSearchCV(mlp , mlp_param_grid, cv=10)
#mlp_cv.fit(train_x_vector, y_train)


In [None]:
# Best mean 10-fold cross-validation score of each grid search together with
# the hyperparameters that achieved it; GNB had no grid, so only its mean
# cross-validation score is shown.
print(f'DT: {dt_cv.best_score_}, {dt_cv.best_params_}')
print(f'SVM: {sv_cv.best_score_}, {sv_cv.best_params_}')
print(f'GNB: {nb_accuracy_train}')
print(f'LG: {lg_cv.best_score_}, {lg_cv.best_params_}')
print(f'KNN: {knn_cv.best_score_}, {knn_cv.best_params_}')
#print(f'MLP: {mlp_cv.best_score_}, {mlp_cv.best_params_}')

In [None]:
# using the best model
# The hyperparameters are written out by hand (not read from the grid-search
# objects). The model is fitted on the training partition only, so the
# following cells can evaluate it on the held-out test partition.
modelo = SVC(random_state=11, C = 10, gamma = 0.1, kernel = 'rbf')
modelo.fit(train_x_vector,y_train)

In [None]:
# sorted distinct label values; used as display labels and report rows below
classes = np.unique(Y)
classes

In [None]:
# create a confusion matrix from the predictions on the held-out test
# partition and plot it with a red colormap
cm=confusion_matrix(y_test, modelo.predict(test_x_vector))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap=plt.cm.Reds)

In [None]:
# display the main performance indicators on the test partition:
# precision, recall, f1-score, support, accuracy
print(classification_report(y_test, modelo.predict(test_x_vector),
                      labels = classes))

In [None]:
# train with 100% of the data
# Same estimator and hyperparameters as the evaluated model, refitted on all
# labeled sentences; this overwrites `modelo`.
modelo = SVC(random_state=11, C = 10, gamma = 0.1, kernel = 'rbf')
modelo.fit(total_x_vector,y_total)

In [None]:
# label the rest of the dataset (performing the predictions)
# Modelo_irri is 2 when no keyword was found (Indice_palabra == -1); otherwise
# it is the class predicted for the extracted sentence.
# All sentences are vectorized and predicted in one batch and the column is
# assigned in one step: the original per-row `df_irri.Modelo_irri[i] = ...`
# was a chained assignment (SettingWithCopyWarning, no effect under pandas
# copy-on-write) and stored a one-element array per cell instead of the label.
# NOTE(review): cells now hold plain labels (e.g. 1, not [1]); check any
# consumer of the exported CSV that expected the bracketed form.
sin_palabra = df_irri['Indice_palabra'] == -1
etiquetas_modelo = pd.Series(2, index=df_irri.index)
if not sin_palabra.all():
    frases_con_palabra = df_irri.loc[~sin_palabra, 'Frase_extraida']
    etiquetas_modelo.loc[~sin_palabra] = modelo.predict(tfidf.transform(frases_con_palabra))
df_irri['Modelo_irri'] = etiquetas_modelo

df_irri

In [None]:
# save DF to csv (';'-separated, relative path, so it is written to the
# current working directory)
df_irri.to_csv('Modelo_irri.csv', sep=';')

## Identification of the "sweating symptom"

In [None]:
# read the data
# Work on an independent copy of the two columns needed for this symptom.
# Without .copy() the selection stays linked to df3, and adding the
# Indice_palabra / Frase_extraida / Modelo_sud columns later raises
# SettingWithCopyWarning.
df_sud = df3[["KeyAnonimo", 'analisis']].copy()
df_sud

In [None]:
# search terms - key words (list of words - "dictionary")
# The two result columns start as empty strings and are filled in by the
# search and extraction cells that follow.
df_sud['Indice_palabra'] = ''
df_sud['Frase_extraida'] = ''
# Keywords are tried in this order by the search loop; the first one found wins.
# "suda " carries a trailing space, so it only matches when a space follows.
palabra1 = "diaforesis"
palabra2 = "hiperhidrosis"
palabra3 = "sudor"
palabra4 = "transpir"
palabra5 = "suda "

In [None]:
# the 'analisis' text column that the keyword search scans
arraym = df_sud['analisis']
arraym

In [None]:
# performing the search...
# For each text, store the position of the first keyword that occurs in it,
# trying the keywords in priority order (palabra1 first); -1 means that none
# of them was found.
# The column is assigned in one step: the original
# `df_sud.Indice_palabra[i] = ...` chained assignment raises
# SettingWithCopyWarning and does not update the DataFrame under pandas
# copy-on-write. Each keyword is also searched once instead of twice.
palabras_clave = (palabra1, palabra2, palabra3, palabra4, palabra5)
indices_palabra = []
for texto in arraym:
    posicion = -1
    for palabra_clave in palabras_clave:
        posicion = texto.find(palabra_clave)
        if posicion != -1:
            break
    indices_palabra.append(posicion)
df_sud['Indice_palabra'] = indices_palabra

In [None]:
# Build "Frase_extraida": up to 150 characters of context on each side of the
# keyword position stored in "Indice_palabra"; rows without a keyword (-1) get
# the placeholder "Sin información".
# The column is assigned in one step instead of `df_sud.Frase_extraida[i] = ...`:
# that chained assignment raises SettingWithCopyWarning and does not update the
# DataFrame when pandas copy-on-write is active.
# Rows are paired positionally; arraym is df_sud['analisis'], so both share
# the same row order.
ventana = 150
frases = []
for texto, posicion in zip(arraym, df_sud['Indice_palabra']):
    if posicion == -1:
        frases.append("Sin información")
    else:
        # max(..., 0) keeps the window inside the text when the keyword is
        # closer than `ventana` characters to the beginning.
        frases.append(texto[max(posicion - ventana, 0):posicion + ventana])
df_sud['Frase_extraida'] = frases

df_sud

In [None]:
# use a sample of the data for manual labeling by practicing physicians
#data_train_sud = df_sud['Frase_extraida'].sample(n=60000, random_state=7)
#data_train_sud.to_excel("RH_sud.xlsx")
#data_train_sud

In [None]:
# ML Model
# read the labeled data (';'-separated CSV), keeping only the sentence and its label
# NOTE(review): this is an absolute Windows path, while the raw data at the top
# of the notebook is read from Google Drive through `ruta`; confirm the file is
# reachable from the environment where this cell runs.
df_etiquetas = pd.read_csv('C:/Users/diabetes.ml1/Downloads/Data_RiesgoHipoglicemia/Oraciones para etiquetar/7. RH_sud_CP_CSV.csv' ,sep=";", low_memory=False)
df_etiquetas = df_etiquetas[['Frase_extraida', 'Etiqueta']]
df_etiquetas

In [None]:
# number of labeled sentences per class
df_etiquetas.value_counts('Etiqueta')

In [None]:
# graph the proportion of classes as a pie chart
# value_counts() orders classes by frequency, so the wedge labels are taken
# from the counts' own index (sorted by class) instead of being hard-coded as
# ['0', '1'], which mislabels the wedges whenever class 1 is the majority.
conteo_clases = df_etiquetas['Etiqueta'].value_counts().sort_index()
plt.pie(conteo_clases,
        labels = conteo_clases.index.astype(str), autopct='%.2f%%');

In [None]:
# Labeled data as a NumPy array; Y keeps every column after the first (here
# only 'Etiqueta') cast to int, as a 2-D single-column array.
df = df_etiquetas.values
Y = df[:,1:].astype(int)

In [None]:
# create training and test partition: 80/20
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the labeled sentences.
train, test = train_test_split(
    df_etiquetas,
    test_size=0.2,
    random_state=10,
    stratify=df_etiquetas['Etiqueta'],
)

# Sentence (feature) and label columns for each partition and for the full set.
X_train, X_test, X_total = (parte['Frase_extraida'] for parte in (train, test, df_etiquetas))
y_train, y_test, y_total = (parte['Etiqueta'] for parte in (train, test, df_etiquetas))

# Class counts: raw label array first, then train / test / total.
for etiquetas in (Y, y_train, y_test, y_total):
    print(np.unique(etiquetas, return_counts=True))

In [None]:
# vectorization to convert words to numbers for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer used only for evaluation: vocabulary and IDF weights are learned
# from the training partition alone, so nothing from the held-out test
# sentences leaks into the features used for model selection and testing.
# (The original fitted a single vectorizer on X_total, test rows included.)
tfidf_eval = TfidfVectorizer()
train_x_vector = tfidf_eval.fit_transform(X_train)
test_x_vector = tfidf_eval.transform(X_test)

# Vectorizer for the final model: fitted on every labeled sentence. `tfidf`
# and `total_x_vector` keep their original meaning because later cells train
# on 100% of the data and label the full dataset with them.
tfidf = TfidfVectorizer()
total_x_vector = tfidf.fit_transform(X_total)

In [None]:
# run the different classification models, using grid search to find the optimal hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Decision tree: compare the two split criteria.
dt_param_grid = {'criterion': ['gini', 'entropy']}
dt = DecisionTreeClassifier(random_state=11)
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=10)
dt_cv.fit(train_x_vector, y_train)

# SVM: kernel type, regularization strength C and kernel coefficient gamma.
sv_param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.5, 1, 10, 50, 100],
    'gamma': [0.1, 'auto'],
}
sv = SVC(random_state=11)
sv_cv = GridSearchCV(estimator=sv, param_grid=sv_param_grid, cv=10)
sv_cv.fit(train_x_vector, y_train)

# Gaussian naive Bayes: nothing to tune, only the mean 10-fold CV score.
# toarray() densifies the sparse TF-IDF matrix before it is passed in.
nb = GaussianNB()
nb_accuracy_train = cross_val_score(
    nb, train_x_vector.toarray(), y_train, cv=10
).mean()

# Logistic regression: regularization strength and solver (fixed iteration cap).
lg_param_grid = {
    'C': [1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [7000],
}
lg = LogisticRegression(random_state=11)
lg_cv = GridSearchCV(estimator=lg, param_grid=lg_param_grid, cv=10)
lg_cv.fit(train_x_vector, y_train)

# k-nearest neighbours: k = 1..20 under four distance metrics.
knn_param_grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
knn = KNeighborsClassifier(n_jobs=-1)
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=10)
knn_cv.fit(train_x_vector, y_train)

#MLP
#mlp_param_grid = {'hidden_layer_sizes':[10, 50],
#              'activation':['identity', 'logistic', 'tanh', 'relu'],
#              'solver':['lbfgs', 'sgd', 'adam']}
#mlp = MLPClassifier(alpha=1e-5, random_state = 11, max_iter=2000)
#mlp_cv = GridSearchCV(mlp , mlp_param_grid, cv=10)
#mlp_cv.fit(train_x_vector, y_train)


In [None]:
# Best mean 10-fold cross-validation score of each grid search together with
# the hyperparameters that achieved it; GNB had no grid, so only its mean
# cross-validation score is shown.
print(f'DT: {dt_cv.best_score_}, {dt_cv.best_params_}')
print(f'SVM: {sv_cv.best_score_}, {sv_cv.best_params_}')
print(f'GNB: {nb_accuracy_train}')
print(f'LG: {lg_cv.best_score_}, {lg_cv.best_params_}')
print(f'KNN: {knn_cv.best_score_}, {knn_cv.best_params_}')
#print(f'MLP: {mlp_cv.best_score_}, {mlp_cv.best_params_}')

In [None]:
# using the best model
# The hyperparameters are written out by hand (not read from the grid-search
# objects). The model is fitted on the training partition only, so the
# following cells can evaluate it on the held-out test partition.
modelo = KNeighborsClassifier(n_jobs = -1, metric = 'manhattan', n_neighbors = 1)
modelo.fit(train_x_vector,y_train)

KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=1)

In [None]:
# sorted distinct label values; used as display labels and report rows below
classes = np.unique(Y)
classes

In [None]:
# create a confusion matrix from the predictions on the held-out test
# partition and plot it with a red colormap
cm=confusion_matrix(y_test, modelo.predict(test_x_vector))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap=plt.cm.Reds)

In [None]:
# display the main performance indicators on the test partition:
# precision, recall, f1-score, support, accuracy
print(classification_report(y_test, modelo.predict(test_x_vector),
                      labels = classes))

In [None]:
# train with 100% of the data
# Same estimator and hyperparameters as the evaluated model, refitted on all
# labeled sentences; this overwrites `modelo`.
modelo = KNeighborsClassifier(n_jobs = -1, metric = 'manhattan', n_neighbors = 1)
modelo.fit(total_x_vector,y_total)

In [None]:
# label the rest of the dataset (performing the predictions)
# Modelo_sud is 2 when no keyword was found (Indice_palabra == -1); otherwise
# it is the class predicted for the extracted sentence.
# All sentences are vectorized and predicted in one batch and the column is
# assigned in one step: the original per-row `df_sud.Modelo_sud[i] = ...`
# was a chained assignment (SettingWithCopyWarning, no effect under pandas
# copy-on-write) and stored a one-element array per cell instead of the label.
# NOTE(review): cells now hold plain labels (e.g. 1, not [1]); check any
# consumer of the exported CSV that expected the bracketed form.
sin_palabra = df_sud['Indice_palabra'] == -1
etiquetas_modelo = pd.Series(2, index=df_sud.index)
if not sin_palabra.all():
    frases_con_palabra = df_sud.loc[~sin_palabra, 'Frase_extraida']
    etiquetas_modelo.loc[~sin_palabra] = modelo.predict(tfidf.transform(frases_con_palabra))
df_sud['Modelo_sud'] = etiquetas_modelo

df_sud

In [None]:
# save DF to csv (';'-separated, relative path, so it is written to the
# current working directory)
df_sud.to_csv('Modelo_sud.csv', sep=';')

## Identification of the "discomfort symptom"

In [None]:
# read the data
# Work on an independent copy of the two columns needed for this symptom.
# Without .copy() the selection stays linked to df3, and adding the
# Indice_palabra / Frase_extraida / Modelo_vom columns later raises
# SettingWithCopyWarning.
df_vom = df3[["KeyAnonimo", 'analisis']].copy()
df_vom

In [None]:
# search terms - key words (list of words - "dictionary")
# The two result columns start as empty strings and are filled in by the
# search and extraction cells that follow.
df_vom['Indice_palabra'] = ''
df_vom['Frase_extraida'] = ''
# Keywords are tried in this order by the search loop; the first one found wins.
palabra1 = "nausea"
palabra2 = "vomit"
palabra3 = "diarrea"

In [None]:
# the 'analisis' text column that the keyword search scans
arraym = df_vom['analisis']
arraym

In [None]:
# performing the search...
# For each text, store the position of the first keyword that occurs in it,
# trying the keywords in priority order (palabra1 first); -1 means that none
# of them was found.
# The column is assigned in one step: the original
# `df_vom.Indice_palabra[i] = ...` chained assignment raises
# SettingWithCopyWarning and does not update the DataFrame under pandas
# copy-on-write. Each keyword is also searched once instead of twice.
palabras_clave = (palabra1, palabra2, palabra3)
indices_palabra = []
for texto in arraym:
    posicion = -1
    for palabra_clave in palabras_clave:
        posicion = texto.find(palabra_clave)
        if posicion != -1:
            break
    indices_palabra.append(posicion)
df_vom['Indice_palabra'] = indices_palabra

In [None]:
# Build "Frase_extraida": up to 150 characters of context on each side of the
# keyword position stored in "Indice_palabra"; rows without a keyword (-1) get
# the placeholder "Sin información".
# The column is assigned in one step instead of `df_vom.Frase_extraida[i] = ...`:
# that chained assignment raises SettingWithCopyWarning and does not update the
# DataFrame when pandas copy-on-write is active.
# Rows are paired positionally; arraym is df_vom['analisis'], so both share
# the same row order.
ventana = 150
frases = []
for texto, posicion in zip(arraym, df_vom['Indice_palabra']):
    if posicion == -1:
        frases.append("Sin información")
    else:
        # max(..., 0) keeps the window inside the text when the keyword is
        # closer than `ventana` characters to the beginning.
        frases.append(texto[max(posicion - ventana, 0):posicion + ventana])
df_vom['Frase_extraida'] = frases

df_vom

In [None]:
# use a sample of the data for manual labeling by practicing physicians
#data_train_vom = df_vom['Frase_extraida'].sample(n=60000, random_state=7)
#data_train_vom.to_excel("RH_vom.xlsx")
#data_train_vom

In [None]:
# ML Model
# read the labeled data (';'-separated CSV), keeping only the sentence and its label
# NOTE(review): this is an absolute Windows path, while the raw data at the top
# of the notebook is read from Google Drive through `ruta`; confirm the file is
# reachable from the environment where this cell runs.
df_etiquetas = pd.read_csv('C:/Users/diabetes.ml1/Downloads/Data_RiesgoHipoglicemia/Oraciones para etiquetar/8. RH_vom_CP_CSV.csv' ,sep=";", low_memory=False)
df_etiquetas = df_etiquetas[['Frase_extraida', 'Etiqueta']]
df_etiquetas

In [None]:
# number of labeled sentences per class
df_etiquetas.value_counts('Etiqueta')

In [None]:
# graph the proportion of classes as a pie chart
# value_counts() orders classes by frequency, so the wedge labels are taken
# from the counts' own index (sorted by class) instead of being hard-coded as
# ['0', '1'], which mislabels the wedges whenever class 1 is the majority.
conteo_clases = df_etiquetas['Etiqueta'].value_counts().sort_index()
plt.pie(conteo_clases,
        labels = conteo_clases.index.astype(str), autopct='%.2f%%');

In [None]:
# Labeled data as a NumPy array; Y keeps every column after the first (here
# only 'Etiqueta') cast to int, as a 2-D single-column array.
df = df_etiquetas.values
Y = df[:,1:].astype(int)

In [None]:
# create training and test partition: 80/20
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the labeled sentences.
train, test = train_test_split(
    df_etiquetas,
    test_size=0.2,
    random_state=10,
    stratify=df_etiquetas['Etiqueta'],
)

# Sentence (feature) and label columns for each partition and for the full set.
X_train, X_test, X_total = (parte['Frase_extraida'] for parte in (train, test, df_etiquetas))
y_train, y_test, y_total = (parte['Etiqueta'] for parte in (train, test, df_etiquetas))

# Class counts: raw label array first, then train / test / total.
for etiquetas in (Y, y_train, y_test, y_total):
    print(np.unique(etiquetas, return_counts=True))

In [None]:
# vectorization to convert words to numbers for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer used only for evaluation: vocabulary and IDF weights are learned
# from the training partition alone, so nothing from the held-out test
# sentences leaks into the features used for model selection and testing.
# (The original fitted a single vectorizer on X_total, test rows included.)
tfidf_eval = TfidfVectorizer()
train_x_vector = tfidf_eval.fit_transform(X_train)
test_x_vector = tfidf_eval.transform(X_test)

# Vectorizer for the final model: fitted on every labeled sentence. `tfidf`
# and `total_x_vector` keep their original meaning because later cells train
# on 100% of the data and label the full dataset with them.
tfidf = TfidfVectorizer()
total_x_vector = tfidf.fit_transform(X_total)

In [None]:
# run the different classification models, using grid search to find the optimal hyperparameters
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Decision tree: compare the two split criteria.
dt_param_grid = {'criterion': ['gini', 'entropy']}
dt = DecisionTreeClassifier(random_state=11)
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=10)
dt_cv.fit(train_x_vector, y_train)

# SVM: kernel type, regularization strength C and kernel coefficient gamma.
sv_param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.5, 1, 10, 50, 100],
    'gamma': [0.1, 'auto'],
}
sv = SVC(random_state=11)
sv_cv = GridSearchCV(estimator=sv, param_grid=sv_param_grid, cv=10)
sv_cv.fit(train_x_vector, y_train)

# Gaussian naive Bayes: nothing to tune, only the mean 10-fold CV score.
# toarray() densifies the sparse TF-IDF matrix before it is passed in.
nb = GaussianNB()
nb_accuracy_train = cross_val_score(
    nb, train_x_vector.toarray(), y_train, cv=10
).mean()

# Logistic regression: regularization strength and solver (fixed iteration cap).
lg_param_grid = {
    'C': [1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [7000],
}
lg = LogisticRegression(random_state=11)
lg_cv = GridSearchCV(estimator=lg, param_grid=lg_param_grid, cv=10)
lg_cv.fit(train_x_vector, y_train)

# k-nearest neighbours: k = 1..20 under four distance metrics.
knn_param_grid = {
    'n_neighbors': np.arange(1, 21),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
knn = KNeighborsClassifier(n_jobs=-1)
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=10)
knn_cv.fit(train_x_vector, y_train)

#MLP
#mlp_param_grid = {'hidden_layer_sizes':[10, 50],
#              'activation':['identity', 'logistic', 'tanh', 'relu'],
#              'solver':['lbfgs', 'sgd', 'adam']}
#mlp = MLPClassifier(alpha=1e-5, random_state = 11, max_iter=2000)
#mlp_cv = GridSearchCV(mlp , mlp_param_grid, cv=10)
#mlp_cv.fit(train_x_vector, y_train)


In [None]:
# Best mean 10-fold cross-validation score of each grid search together with
# the hyperparameters that achieved it; GNB had no grid, so only its mean
# cross-validation score is shown.
print(f'DT: {dt_cv.best_score_}, {dt_cv.best_params_}')
print(f'SVM: {sv_cv.best_score_}, {sv_cv.best_params_}')
print(f'GNB: {nb_accuracy_train}')
print(f'LG: {lg_cv.best_score_}, {lg_cv.best_params_}')
print(f'KNN: {knn_cv.best_score_}, {knn_cv.best_params_}')
#print(f'MLP: {mlp_cv.best_score_}, {mlp_cv.best_params_}')

In [None]:
# using the best model
# The hyperparameters are written out by hand (not read from the grid-search
# objects). The model is fitted on the training partition only, so the
# following cells can evaluate it on the held-out test partition.
modelo = LogisticRegression(random_state=11, C = 100, max_iter = 7000, solver = 'newton-cg')
modelo.fit(train_x_vector,y_train)

In [None]:
# sorted distinct label values; used as display labels and report rows below
classes = np.unique(Y)
classes

In [None]:
# create a confusion matrix from the predictions on the held-out test
# partition and plot it with a red colormap
cm=confusion_matrix(y_test, modelo.predict(test_x_vector))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap=plt.cm.Reds)

In [None]:
# display the main performance indicators on the test partition:
# precision, recall, f1-score, support, accuracy
print(classification_report(y_test, modelo.predict(test_x_vector),
                      labels = classes))

In [None]:
# train with 100% of the data
# Same estimator and hyperparameters as the evaluated model, refitted on all
# labeled sentences; this overwrites `modelo`.
modelo = LogisticRegression(random_state=11, C = 100, max_iter = 7000, solver = 'newton-cg')
modelo.fit(total_x_vector,y_total)

In [None]:
# label the rest of the dataset (performing the predictions)
# Modelo_vom is 2 when no keyword was found (Indice_palabra == -1); otherwise
# it is the class predicted for the extracted sentence.
# All sentences are vectorized and predicted in one batch and the column is
# assigned in one step: the original per-row `df_vom.Modelo_vom[i] = ...`
# was a chained assignment (SettingWithCopyWarning, no effect under pandas
# copy-on-write) and stored a one-element array per cell instead of the label.
# NOTE(review): cells now hold plain labels (e.g. 1, not [1]); check any
# consumer of the exported CSV that expected the bracketed form.
sin_palabra = df_vom['Indice_palabra'] == -1
etiquetas_modelo = pd.Series(2, index=df_vom.index)
if not sin_palabra.all():
    frases_con_palabra = df_vom.loc[~sin_palabra, 'Frase_extraida']
    etiquetas_modelo.loc[~sin_palabra] = modelo.predict(tfidf.transform(frases_con_palabra))
df_vom['Modelo_vom'] = etiquetas_modelo

df_vom

In [None]:
# save DF to csv (';'-separated, relative path, so it is written to the
# current working directory)
df_vom.to_csv('Modelo_vom.csv', sep=';')