In [None]:
# analysis.ipynb

# Standard library
import os

# Import the helper functions from functions.py
from functions import *

# 1. Load data
# The CSV location can be overridden with the DIA_CSV_PATH environment variable so the
# notebook can run on machines other than the author's; when the variable is not set,
# the original hard-coded path is used unchanged.
filepath = os.environ.get("DIA_CSV_PATH", "C:/Users/Carlos/Downloads/dia.csv")
df = load_data(filepath)

In [None]:
# 2. Data cleaning
# df is rebound to the result of each cleaning helper; the two count_* helpers in
# between are called only for their effect (their return values are discarded).
df = remove_duplicates(df)

for report_fn in (count_distinct_values, count_null_values):
    report_fn(df)

df = remove_unnecessary_values(df)

In [None]:
# 3. Univariate analysis
# Each entry pairs a plotting helper with the column passed to it; the list order is
# the order in which the helpers are called.
univariate_plots = [
    (plot_histogram, 'age'),
    (plot_countplot, 'gender'),
    (plot_distplot, 'bmi'),
    (plot_countplot, 'hypertension'),
    (plot_countplot, 'heart_disease'),
    (plot_countplot, 'diabetes'),
    (plot_countplot, 'smoking_history'),
]

for plot_fn, col in univariate_plots:
    plot_fn(df, col)

In [None]:
# 4. Bivariate analysis
# Each plotting call is issued once; the trailing repeats of the 'bmi' and 'age'
# box plots were exact duplicates of the first two calls and have been dropped.
plot_boxplot(df, 'diabetes', 'bmi')
plot_boxplot(df, 'diabetes', 'age')
# NOTE(review): this is the same single-column call already made in the univariate
# section; presumably a gender-vs-diabetes view was intended — confirm and adjust.
plot_countplot(df, 'gender')
plot_boxplot(df, 'diabetes', 'HbA1c_level')
plot_boxplot(df, 'diabetes', 'blood_glucose_level')
plot_pairplot(df, 'diabetes')
plot_scatterplot(df, 'age', 'bmi', 'diabetes')
plot_violinplot(df, 'diabetes', 'bmi', 'gender')

In [None]:

# 5. Correlations
# NOTE(review): this overwrites df['smoking_history'] in place, so re-running the cell
# applies recategorize_smoking to values it has already processed — TODO confirm the
# helper is idempotent.
df['smoking_history'] = df['smoking_history'].apply(recategorize_smoking)

# Encode on a copy so that `df` is not rebound to the encoded frame.
data = df.copy()
for categorical_col in ('gender', 'smoking_history'):
    data = perform_one_hot_encoding(data, categorical_col)

correlation_matrix = data.corr()
plot_heatmap(correlation_matrix, "Correlation Matrix Heatmap")



In [None]:
# 6. Machine learning models
# KNN — trained on the one-hot encoded frame `data`, with 'diabetes' as the target.
knn_features = data.drop(columns='diabetes')
knn_target = data['diabetes']
X_knn_train, X_knn_test, y_knn_train, y_knn_test = train_test_split(
    knn_features, knn_target, test_size=0.2, random_state=42
)

# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_knn_train_scaled = scaler.fit_transform(X_knn_train)
X_knn_test_scaled = scaler.transform(X_knn_test)

knn = train_knn(X_knn_train_scaled, y_knn_train)
evaluate_model(knn, X_knn_test_scaled, y_knn_test)

# Logistic Regression — uses only the six columns listed below, taken from `df`
# (not from the one-hot encoded `data`), and no scaling step.
logreg_columns = [
    "age",
    "hypertension",
    "heart_disease",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
]
X = df[logreg_columns]
y = df['diabetes']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

log = train_logistic_regression(X_train, y_train)
evaluate_model(log, X_test, y_test)

# Random Forest
# Hyper-parameter grid. The 'classifier__' prefix targets a pipeline step named
# 'classifier' — presumably built inside train_random_forest; confirm in functions.py.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# The feature-importance code below reads a 'cat' transformer fitted on 'gender' and
# 'smoking_history', so the forest has to be trained on a frame that contains those
# columns. Reusing the logistic-regression split (six numeric columns only) would not
# provide them, so this section builds its own split from `df`.
rf_numeric_features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'hypertension', 'heart_disease']
rf_categorical_features = ['gender', 'smoking_history']
X_rf = df[rf_numeric_features + rf_categorical_features]
y_rf = df['diabetes']
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
    X_rf, y_rf, test_size=0.2, random_state=42
)

grid_search = train_random_forest(X_rf_train, y_rf_train, param_grid)
evaluate_model(grid_search, X_rf_test, y_rf_test)

# Feature importances
preprocessor = grid_search.best_estimator_.named_steps['preprocessor']
onehot_columns = list(
    preprocessor.named_transformers_['cat'].get_feature_names_out(rf_categorical_features)
)
# assumes the preprocessor outputs the numeric columns first (in this order) followed
# by the one-hot columns — TODO confirm against the pipeline in functions.py
feature_names = rf_numeric_features + onehot_columns
plot_feature_importances(grid_search, feature_names)