In [25]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

In [7]:
titanic_md = pd.read_csv("titanic_MD.csv")
titanic = pd.read_csv("titanic.csv")

In [19]:
titanic_md.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_mean,Age_median,SibSp_mean,SibSp_median,Parch_mean,Parch_median,Fare_mean,Fare_median
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",?,38.0,1.0,0.0,PC 17599,71.2833,C85,C,38.0,38.0,1.0,1.0,0.0,0.0,71.2833,71.2833
1,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S,35.0,35.0,1.0,1.0,0.0,0.0,53.1,53.1
2,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0.0,0.0,17463,51.8625,E46,S,54.0,54.0,0.0,0.0,0.0,0.0,51.8625,51.8625
3,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,,1.0,,PP 9549,16.7,G6,S,35.692532,35.5,1.0,1.0,0.461988,0.0,16.7,16.7
4,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,,0.0,113783,26.55,C103,S,58.0,58.0,0.461111,0.0,0.0,0.0,26.55,26.55


In [10]:
titanic_md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183 entries, 0 to 182
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  183 non-null    int64  
 1   Survived     183 non-null    int64  
 2   Pclass       183 non-null    int64  
 3   Name         183 non-null    object 
 4   Sex          183 non-null    object 
 5   Age          158 non-null    float64
 6   SibSp        180 non-null    float64
 7   Parch        171 non-null    float64
 8   Ticket       183 non-null    object 
 9   Fare         175 non-null    float64
 10  Cabin        183 non-null    object 
 11  Embarked     171 non-null    object 
dtypes: float64(4), int64(3), object(5)
memory usage: 17.3+ KB


In [11]:
titanic_md.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,183.0,183.0,183.0,158.0,180.0,171.0,175.0
mean,455.36612,0.672131,1.191257,35.692532,0.461111,0.461988,78.959191
std,247.052476,0.470725,0.515187,15.640858,0.646122,0.753435,77.026328
min,2.0,0.0,1.0,0.92,0.0,0.0,0.0
25%,263.5,0.0,1.0,24.0,0.0,0.0,29.7
50%,457.0,1.0,1.0,35.5,0.0,0.0,56.9292
75%,676.0,1.0,1.0,48.0,1.0,1.0,90.5396
max,890.0,1.0,3.0,80.0,3.0,4.0,512.3292


In [12]:
missing_report = titanic_md.isnull().sum().to_frame(name='Missing Values')
print(missing_report)

             Missing Values
PassengerId               0
Survived                  0
Pclass                    0
Name                      0
Sex                       0
Age                      25
SibSp                     3
Parch                    12
Ticket                    0
Fare                      8
Cabin                     0
Embarked                 12


In [18]:
missing_columns = titanic_md.columns[titanic_md.isnull().any()].tolist()
missing_columns

['Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [20]:
predictor_columns = ["Pclass", "Fare", "SibSp", "Parch", "Age"]
for column in missing_columns:
    if column in predictor_columns:
        titanic_md[f"{column}_mean"] = titanic_md[column].fillna(titanic_md[column].mean())
        titanic_md[f"{column}_median"] = titanic_md[column].fillna(titanic_md[column].median())
        train = titanic_md.dropna(subset=[column])
        test = titanic_md[titanic_md[column].isnull()]

        if not train.empty and not test.empty:
            X_train = train[predictor_columns].fillna(titanic_md[predictor_columns].median())
            y_train = train[column]
            X_test = test[predictor_columns].fillna(titanic_md[predictor_columns].median())

            model = LinearRegression()
            model.fit(X_train, y_train)
            imputed_data.loc[test.index, f"{column}_regression"] = model.predict(X_test)

        knn_imputer = KNNImputer(n_neighbors=5)
        knn_result = knn_imputer.fit_transform(titanic_md[[column] + predictor_columns])
        imputed_data[f"{column}_knn"] = knn_result[:, 0]

    elif column == "Embarked":
        imputed_data[f"{column}_mode"] = titanic_md[column].fillna(titanic_md[column].mode()[0])

    print(f"Imputaciones completadas para columna: {column}")

Imputaciones completadas para columna: Age
Imputaciones completadas para columna: SibSp
Imputaciones completadas para columna: Parch
Imputaciones completadas para columna: Fare
Imputaciones completadas para columna: Embarked


In [21]:
numeric_columns = titanic_md.select_dtypes(include=["number"]).columns.tolist()
categorical_columns = [col for col in missing_columns if col not in numeric_columns]

In [22]:
results = {}

for column in numeric_columns:
    if column in titanic.columns:
        real_values = titanic[column]
        valid_indices = titanic[titanic[column].notnull()].index

        for method in ["mean", "median", "regression", "knn"]:
            method_column = f"{column}_{method}"
            if method_column in imputed_data.columns:
                imputed_values = imputed_data.loc[valid_indices, method_column]
                valid_rows = imputed_values.notnull()

                if valid_rows.sum() > 0:
                    mae = mean_absolute_error(
                        real_values.loc[valid_indices][valid_rows].astype(float),
                        imputed_values[valid_rows]
                    )
                    results[method_column] = mae
                    print(f"MAE for {column} ({method}): {mae}")

MAE for Age (mean): 1.58973991837864
MAE for Age (median): 1.592896174863388
MAE for Age (regression): 11.66
MAE for Age (knn): 1.546994535519126
MAE for SibSp (mean): 0.008409228901032179
MAE for SibSp (median): 0.01092896174863388
MAE for SibSp (regression): 0.6666666666666673
MAE for SibSp (knn): 0.008743169398907104
MAE for Parch (mean): 0.04371584699453552
MAE for Parch (median): 0.04371584699453552
MAE for Parch (regression): 0.6666666666666677
MAE for Parch (knn): 0.03825136612021858
MAE for Fare (mean): 1.8085244902419986
MAE for Fare (median): 1.780145355191257
MAE for Fare (regression): 40.72082499999999
MAE for Fare (knn): 1.4236062295081966


In [23]:
for column in categorical_columns:
    if column in titanic.columns:
        real_values = titanic[column]
        valid_indices = titanic[titanic[column].notnull()].index

        method_column = f"{column}_mode"
        if method_column in imputed_data.columns:
            imputed_values = imputed_data.loc[valid_indices, method_column]
            valid_rows = imputed_values.notnull()

            if valid_rows.sum() > 0:
                accuracy = (real_values.loc[valid_indices][valid_rows] == imputed_values[valid_rows]).mean()
                results[method_column] = accuracy
                print(f"Accuracy for {column} (mode): {accuracy}")

best_methods = {}
for col in missing_columns:
    methods_for_col = [k for k in results if k.startswith(col)]
    if methods_for_col:
        best_method = min(methods_for_col, key=results.get if col in numeric_columns else lambda x: -results[x])
        best_methods[col] = best_method
    else:
        best_methods[col] = "No methods available"

print("Mejores métodos para cada columna:")
print(best_methods)

Accuracy for Embarked (mode): 0.9672131147540983
Mejores métodos para cada columna:
{'Age': 'Age_knn', 'SibSp': 'SibSp_mean', 'Parch': 'Parch_knn', 'Fare': 'Fare_knn', 'Embarked': 'Embarked_mode'}


In [24]:
output_file = "results.md"

with open(output_file, "w") as md_file:
    md_file.write("#Results\n")
    md_file.write("\n## Mejor metodo para cada columna\n")
    for col, method in best_methods.items():
        md_file.write(f"- **{col}**: {method}\n")

    md_file.write("\n## Error MAE para cada metodo por columna\n")
    for method, metric in results.items():
        md_file.write(f"- **{method}**: {metric:.4f}\n")

Results exported to results.md
