In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# KNN imputation: repairing missing values

In [2]:
# Load the Titanic passenger table from the Data Science Dojo datasets repo.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)

# Encode the categorical variables as numeric codes.
# Series.map leaves missing values as NaN, so missing Embarked entries stay
# missing and remain visible to the imputer.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Select the features of interest.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Keep the NaNs: KNNImputer only fills entries that are still missing.
# Filling them with 0 here would hand the imputer a complete matrix, so every
# missing Age would silently become 0 and every missing Embarked code 0 ('C').
X = data[features]

In [3]:
# Per-column count of missing values before imputation
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Number of nearest rows consulted when filling a missing entry
IMPUTER_NEIGHBORS = 5
imputer = KNNImputer(n_neighbors=IMPUTER_NEIGHBORS)


In [5]:
# Fit the imputer on the feature matrix, then fill it; the result is a NumPy array
X_imputed = imputer.fit(X).transform(X)


In [6]:
# Write the imputed values back into the original frame, column by column
data[features] = pd.DataFrame(X_imputed, columns=features, index=data.index)


In [7]:
# Per-column count of missing values after writing the imputed features back
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

# KNN imputation: choosing n_neighbors by cross-validation

In [8]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [9]:
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Numeric codes for the categorical variables; values absent from a map
# (including missing entries) come out of Series.map as NaN.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

# Read a fresh copy of the data and encode the categorical columns in one chain
data = pd.read_csv(url).assign(
    Sex=lambda frame: frame['Sex'].map(sex_codes),
    Embarked=lambda frame: frame['Embarked'].map(embarked_codes),
)

# Feature matrix (NaNs left in place for the pipeline's imputer) and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'
X, y = data[features], data[target]

In [10]:
# Shuffled 5-fold splitter; the fixed seed makes the fold assignment repeatable
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [11]:
neighbors_list = [3, 5, 7, 9, 11]
mean_accuracies = []  # (n_neighbors, mean CV accuracy) in evaluation order

for neighbors in neighbors_list:
    # Pipeline: KNN imputation -> standardization -> logistic regression
    pipeline = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=neighbors)),
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])

    # Score the whole pipeline with cross-validation on the shared splitter
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
    mean_score = np.mean(scores)
    mean_accuracies.append((neighbors, mean_score))

    print(f"Neighbors: {neighbors}, Mean Accuracy: {mean_score}")

# Keep the first candidate whose mean accuracy strictly beats all earlier ones
best_neighbors = 0
best_score = -np.inf
for candidate, accuracy in mean_accuracies:
    if accuracy > best_score:
        best_neighbors, best_score = candidate, accuracy

print(f"Best number of neighbors: {best_neighbors}, Best accuracy: {best_score}")

Neighbors: 3, Mean Accuracy: 0.800207143305505
Neighbors: 5, Mean Accuracy: 0.7990898248697509
Neighbors: 7, Mean Accuracy: 0.8058313979034585
Neighbors: 9, Mean Accuracy: 0.8058251208335949
Neighbors: 11, Mean Accuracy: 0.8047015253279769
Best number of neighbors: 7, Best accuracy: 0.8058313979034585
