In [95]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')
# Importar bibliotecas necesarias
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


df = pd.read_csv("titanic.csv")

# Crear una copia del dataframe original
df_original = df.copy()
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


--------------------PREPARAMOS LOS DATOS-----------------------------

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


----TRATAMOS NULLS

In [97]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Cabin'] = df['Cabin'].fillna('Unknown')

In [98]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [99]:
#Tratamos variables nulls de edad
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


---------------------INGENIERIA DE CARACTERISTICAS--------------------

In [100]:
#Crear características
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
df['FarePerPerson'] = df['Fare'] / df['FamilySize']
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')

# Binning de Age y Fare
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 25, 40, 80], labels=['Child', 'Young', 'Adult', 'Senior'], right=False)
df['FareGroup'] = pd.qcut(df['Fare'], q=4, labels=['Low', 'Medium', 'High', 'VeryHigh'])

# Extraer Deck de Cabin
df['Deck'] = df['Cabin'].str[0]

In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   PassengerId    891 non-null    int64   
 1   Survived       891 non-null    int64   
 2   Pclass         891 non-null    int64   
 3   Name           891 non-null    object  
 4   Sex            891 non-null    object  
 5   Age            714 non-null    float64 
 6   SibSp          891 non-null    int64   
 7   Parch          891 non-null    int64   
 8   Ticket         891 non-null    object  
 9   Fare           891 non-null    float64 
 10  Cabin          891 non-null    object  
 11  Embarked       891 non-null    object  
 12  FamilySize     891 non-null    int64   
 13  IsAlone        891 non-null    int64   
 14  FarePerPerson  891 non-null    float64 
 15  Title          891 non-null    object  
 16  AgeGroup       713 non-null    category
 17  FareGroup      891 non-null    cate

------------------------------COMVERTIMOS TEXTO A NUMERO-----------------------

In [102]:
# Primera codificación (todas las columnas, no guardado)
categorical_cols_all = ['Sex', 'Embarked', 'Cabin', 'Title', 'Ticket', 'AgeGroup', 'FareGroup', 'Deck']
encoder_all = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df_all_encoded = df.copy()
df_all_encoded[categorical_cols_all] = encoder_all.fit_transform(df[categorical_cols_all])


df_all_encoded.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,FarePerPerson,Title,AgeGroup,FareGroup,Deck
0,1,0,3,"Braund, Mr. Owen Harris",1.0,22.0,1,0,523.0,7.25,147.0,2.0,2,0,3.625,2.0,3.0,1.0,8.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.0,1,0,596.0,71.2833,81.0,0.0,2,0,35.64165,3.0,0.0,3.0,2.0
2,3,1,3,"Heikkinen, Miss. Laina",0.0,26.0,0,0,669.0,7.925,147.0,2.0,1,1,7.925,1.0,0.0,2.0,8.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.0,1,0,49.0,53.1,55.0,2.0,2,0,26.55,3.0,0.0,3.0,2.0
4,5,0,3,"Allen, Mr. William Henry",1.0,35.0,0,0,472.0,8.05,147.0,2.0,1,1,8.05,2.0,0.0,2.0,8.0


----------------------PREDECIMOS EDAD ----------------------

In [78]:
# Predecir valores faltantes de Age
features_age = ['Pclass','Sex', 'SibSp', 'Parch', 'FamilySize', 'Title']
X_age = df_all_encoded[features_age].copy()
y_age = df_all_encoded['Age'].copy()



X_train_age = X_age[~y_age.isna()]
y_train_age = y_age[~y_age.isna()]
X_test_age = X_age[y_age.isna()]

model_age = LinearRegression()
model_age.fit(X_train_age, y_train_age)
predicted_ages = model_age.predict(X_test_age)
df_all_encoded.loc[y_age.isna(), 'Age'] = predicted_ages
df_all_encoded.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
FamilySize     0
Title          0
dtype: int64

------------------------ESCALAMOS LSO DATOS ---------------

In [79]:
# Escalar todas las características con RobustScaler (no guardado)
X_all = df_all_encoded.drop(['Survived', 'PassengerId', 'Name'], axis=1)
y = df['Survived'].copy()
scaler_all = RobustScaler()
X_all_scaled = pd.DataFrame(scaler_all.fit_transform(X_all), columns=X_all.columns)
X_all_scaled.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,Title
0,0.0,0.0,-0.452594,1.0,0.0,0.515235,-0.312011,0.0,0.0,1.0,0.0
1,-2.0,-1.0,0.563279,1.0,0.0,0.717452,2.461242,-66.0,-2.0,1.0,1.0
2,0.0,-1.0,-0.198626,0.0,0.0,0.919668,-0.282777,0.0,0.0,0.0,-1.0
3,-2.0,-1.0,0.372802,1.0,0.0,-0.797784,1.673732,-92.0,0.0,1.0,1.0
4,0.0,0.0,0.372802,0.0,0.0,0.373961,-0.277363,0.0,0.0,0.0,0.0


---------------------SELECCIONAMOS VARIABLES----------------

-----

In [80]:
# Calcular importancia de características con DecisionTreeClassifier
clfDT = DecisionTreeClassifier(random_state=42)
clfDT.fit(X_all_scaled, y)
feature_importances = pd.DataFrame({'feature': X_all_scaled.columns, 'importance': clfDT.feature_importances_})
sorted_features = feature_importances.sort_values(by='importance', ascending=False)
print("\nImportancia de Características desde DecisionTreeClassifier:")
print(sorted_features)


Importancia de Características desde DecisionTreeClassifier:
       feature  importance
1          Sex    0.296168
5       Ticket    0.202745
2          Age    0.151688
6         Fare    0.117157
0       Pclass    0.074177
7        Cabin    0.070706
3        SibSp    0.037704
10       Title    0.023429
4        Parch    0.017258
9   FamilySize    0.007795
8     Embarked    0.001172


In [81]:
# Definir características finales
features = ['Sex', 'Ticket', 'Age', 'Fare', 'Pclass', 'SibSp', 'Title', 'Parch']
X_selected = X_all_scaled[features].copy()
X_selected.head()

Unnamed: 0,Sex,Ticket,Age,Fare,Pclass,SibSp,Title,Parch
0,0.0,0.515235,-0.452594,-0.312011,0.0,1.0,0.0,0.0
1,-1.0,0.717452,0.563279,2.461242,-2.0,1.0,1.0,0.0
2,-1.0,0.919668,-0.198626,-0.282777,0.0,0.0,-1.0,0.0
3,-1.0,-0.797784,0.372802,1.673732,-2.0,1.0,1.0,0.0
4,0.0,0.373961,0.372802,-0.277363,0.0,0.0,0.0,0.0


--------------------------------------------APLICAMOS PCA-----------------------

In [82]:
# Aplicar PCA
pca = PCA(n_components=0.99)  # Conservar el 99% de la varianza
X_pca = pca.fit_transform(X_selected)

principalDf = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2','PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8'])
principalDf = pd.concat([principalDf, df[['Survived']]], axis=1)


In [83]:
principalDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PC1       891 non-null    float64
 1   PC2       891 non-null    float64
 2   PC3       891 non-null    float64
 3   PC4       891 non-null    float64
 4   PC5       891 non-null    float64
 5   PC6       891 non-null    float64
 6   PC7       891 non-null    float64
 7   PC8       891 non-null    float64
 8   Survived  891 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 62.8 KB


In [84]:
principalDf.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,Survived
0,-1.227033,0.615686,-0.028443,-0.276776,0.764836,-0.09609,0.059947,0.265388,0
1,1.983445,-0.697178,0.997209,-0.435182,0.363331,-0.271419,1.258386,0.098704,1
2,-1.245126,0.213934,-1.058968,0.120301,0.318858,0.609587,0.799688,-0.511428,1
3,1.231595,-0.706479,1.183217,-0.678508,-0.579808,-0.942921,0.363166,-0.290365,1
4,-1.251106,-0.491279,-0.194276,0.122601,0.643176,0.31191,-0.153407,0.014987,0


---------------------------CODIFICAMOS Y ESCALAMOS-----------------

In [85]:
# Segunda codificación y escalado (solo características finales, guardado)
# Codificar solo las columnas categóricas relevantes
categorical_cols_final = ['Sex', 'Title', 'Ticket']
encoder_final = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df_final = df.copy()
df_final[categorical_cols_final] = encoder_final.fit_transform(df[categorical_cols_final])

# Actualizar Age en df_final con los valores predichos
df_final.loc[y_age.isna(), 'Age'] = predicted_ages

# Guardar el codificador final
joblib.dump(encoder_final, 'ordinal_encoder.pkl')
print("Codificador ordinal final guardado como ordinal_encoder.pkl")

# Preparar características finales para escalado
X_final = df_final[features].copy()

# Escalar características finales con RobustScaler
scaler_final = RobustScaler()
X_final_scaled = pd.DataFrame(scaler_final.fit_transform(X_final), columns=X_final.columns)

# Guardar el escalador final
joblib.dump(scaler_final, 'robust_scaler.pkl')
print("Escalador robusto final guardado como robust_scaler.pkl")

Codificador ordinal final guardado como ordinal_encoder.pkl
Escalador robusto final guardado como robust_scaler.pkl


--------------------------------------PCA DEL MODELO GUARDALO--------------

In [86]:
# Guardar el modelo PCA
joblib.dump(pca, 'pca_model.pkl')
print("Modelo PCA guardado como pca_model.pkl")

Modelo PCA guardado como pca_model.pkl


-------------------MODELAMOS LSO REUSLTADOS Y REALIZAMO PRUEBAS----------------

In [87]:
# Evaluar el modelo usando validación cruzada
best_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    max_depth=14,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(best_model, X_pca, y, cv=cv, scoring='accuracy')

print("\nResultados de Validación Cruzada:")
print(f"Precisión Media: {np.mean(cv_scores):.4f}")
print(f"Desviación Estándar: {np.std(cv_scores):.4f}")
print(f"Puntuaciones: {cv_scores}")


Resultados de Validación Cruzada:
Precisión Media: 0.8305
Desviación Estándar: 0.0186
Puntuaciones: [0.82681564 0.80337079 0.82022472 0.85393258 0.84831461]


-----------------ENTRENAMIENTO Y EVALUACION------------

In [88]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42, stratify=y)

# Entrenar el modelo en el conjunto de entrenamiento
best_model.fit(X_train, y_train)

# Evaluar el modelo en el conjunto de prueba
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("\nPrecisión en el conjunto de prueba:", test_accuracy)

# Guardar el modelo RandomForest
joblib.dump(best_model, 'random_forest_model.pkl')
print("Modelo RandomForest guardado como random_forest_model.pkl")

# Imprimir resultados finales
print("\nResultados del Modelo RandomForest:")
print(f"Precisión Media (Validación Cruzada): {np.mean(cv_scores):.4f}")
print(f"Desviación Estándar (Validación Cruzada): {np.std(cv_scores):.4f}")
print(f"Puntuaciones de Validación Cruzada: {cv_scores}")


Precisión en el conjunto de prueba: 0.8324022346368715
Modelo RandomForest guardado como random_forest_model.pkl

Resultados del Modelo RandomForest:
Precisión Media (Validación Cruzada): 0.8305
Desviación Estándar (Validación Cruzada): 0.0186
Puntuaciones de Validación Cruzada: [0.82681564 0.80337079 0.82022472 0.85393258 0.84831461]


In [89]:
# Nuevo train-test split con datos originales (excluyendo Embarked y Cabin)
features_original = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Title', 'FamilySize']
df_original['Title'] = df_original['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
df_original['Title'] = df_original['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df_original['Title'] = df_original['Title'].replace(['Mlle', 'Ms'], 'Miss')
df_original['Title'] = df_original['Title'].replace('Mme', 'Mrs')
df_original['FamilySize'] = df_original['SibSp'] + df_original['Parch'] + 1
df_original.loc[y_age.isna(), 'Age'] = predicted_ages

# Codificar variables categóricas en df_original
categorical_cols_orig = ['Sex', 'Title', 'Ticket']
df_original[categorical_cols_orig] = encoder_final.transform(df_original[categorical_cols_orig])

# Escalar características
X_original = df_final[features].copy()  # Use same features as scaler_final was fitted with
X_original_scaled = pd.DataFrame(scaler_final.transform(X_original), columns=X_original.columns)

# Aplicar PCA
X_original_pca = pca.transform(X_original_scaled)

# Nuevo train-test split
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X_original_pca, 
    df_original['Survived'], 
    test_size=0.2, 
    random_state=42, 
    stratify=df_original['Survived']
)

# Predecir con el modelo entrenado
y_pred_orig = best_model.predict(X_test_orig)

# Crear DataFrame con valores originales y predicciones
test_indices = y_test_orig.index
test_results = df_original.loc[test_indices, features_original].copy()
test_results['Survived_Original'] = y_test_orig.values
test_results['Survived_Predicted'] = y_pred_orig

# Guardar resultados
test_results.to_csv('test_predictions.csv', index=False)
print("\nResultados de predicción guardados en 'test_predictions.csv'")
print("\nPrimeras filas de los resultados de prueba:")
print(test_results.head())


Resultados de predicción guardados en 'test_predictions.csv'

Primeras filas de los resultados de prueba:
     Pclass                               Name  Sex        Age  SibSp  Parch  \
565       3               Davies, Mr. Alfred J  1.0  24.000000      2      0   
160       3           Cribb, Mr. John Hatfield  1.0  44.000000      0      1   
553       3  Leeni, Mr. Fahim ("Philip Zenni")  1.0  22.000000      0      0   
860       3            Hansen, Mr. Claus Peter  1.0  41.000000      2      0   
241       3     Murphy, Miss. Katherine "Kate"  0.0  16.213431      1      0   

     Ticket     Fare  Title  FamilySize  Survived_Original  Survived_Predicted  
565   519.0  24.1500    2.0           3                  0                   0  
160   470.0  16.1000    2.0           2                  0                   0  
553   171.0   7.2250    2.0           1                  1                   0  
860   399.0  14.1083    2.0           3                  0                   0  
241   4