In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Read the test and training data
df_test = pd.read_csv('test.csv')
df_train = pd.read_csv('train.csv')

## Preview the first rows of each frame
print(df_test.head())
print(df_train.head())

## Check how many rows and columns each frame has
print('Cantidad de datos:')
print(df_test.shape)
print(df_train.shape)

## Check the column data types
print('Tipos de datos:')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" after each report.
df_test.info()
df_train.info()

## Count the missing values per column
print('Datos faltantes:')
print(df_test.isnull().sum())
print(df_train.isnull().sum())

## Summary statistics of the dataset
print('Estadisticas del dataset:')
print(df_test.describe())
print(df_train.describe())

## DATA PROCESSING ##

# Encode the text columns as integer codes. Values that are not keys of the
# mapping (including missing values) come out of .map() as NaN.
sex_codes = {'female': 0, 'male': 1}
embarked_codes = {'Q': 0, 'S': 1, 'C': 2}
for frame in (df_test, df_train):
    frame['Sex'] = frame['Sex'].map(sex_codes)
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

# Replace missing ages with the mean age of the training data; the same value
# is used for both frames. The result is assigned back to the column instead
# of calling fillna(inplace=True) on the column selection: that chained
# in-place form emits a FutureWarning on recent pandas and no longer updates
# the frame once Copy-on-Write is enabled.
age_mean = df_train['Age'].mean()
df_test['Age'] = df_test['Age'].fillna(age_mean)
df_train['Age'] = df_train['Age'].fillna(age_mean)

# Bucket the ages into groups.
# Ranges: 0-8, 9-15, 16-18, 19-25, 26-40, 41-60, 61-100
# labels=False returns the integer position of each bin (0-6); adding 1 gives
# group numbers 1-7 as plain numbers rather than string categories, so the
# feature matrix built from these frames stays numeric. Ages outside
# (0, 100] become NaN and are removed by the dropna further down.
bins = [0, 8, 15, 18, 25, 40, 60, 100]
df_test['Age'] = pd.cut(df_test['Age'], bins=bins, labels=False) + 1
df_train['Age'] = pd.cut(df_train['Age'], bins=bins, labels=False) + 1

# Drop the columns that are not used as features. PassengerId stays in the
# test frame because the prediction step reads it.
df_test.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)
df_train.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

# Fill missing test fares with the training median so those passengers are
# not dropped below and still receive a prediction.
# NOTE(review): only Fare is imputed here; check the missing-value report to
# confirm no other test column needs the same treatment.
df_test['Fare'] = df_test['Fare'].fillna(df_train['Fare'].median())

# Drop any rows that still contain missing values
df_test.dropna(axis=0, how='any', inplace=True)
df_train.dropna(axis=0, how='any', inplace=True)

# Sanity checks on the cleaned data
print(df_test.isnull().sum())
print(df_train.isnull().sum())

print(df_test.shape)
print(df_train.shape)

print(df_test.head())
print(df_train.head())

## APPLYING MACHINE LEARNING ALGORITHMS ##

# Separate the survival label from the feature columns
X = df_train.drop(columns=['Survived']).values
y = df_train['Survived'].values

# Split the "train" data into a fitting part and a 20% hold-out part used to
# evaluate the algorithms. random_state is fixed so the split, and therefore
# the reported scores, are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Each model is fitted on the training part and scored on the hold-out part.
# Scoring on the rows the model was fitted on would overstate how well it
# generalises and would leave the hold-out split unused.

# Logistic regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
accuracy_logreg = logreg.score(X_test, y_test)
print('Precisión Regresión Logística:')
print(accuracy_logreg)

# Support Vector Machines
svc = SVC()
svc.fit(X_train, y_train)
accuracy_svc = svc.score(X_test, y_test)
print('Precisión Soporte de Vectores:')
print(accuracy_svc)

# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
accuracy_knn = knn.score(X_test, y_test)
print('Precisión vecinos más cercanos:')
print(accuracy_knn)


## PREDICTION USING THE MODELS ##

# Set the passenger ids aside and build the feature matrix once; all three
# models predict on the same array.
ids = df_test['PassengerId']
test_features = df_test.drop(columns=['PassengerId']).values

# Logistic regression
prediccion_logreg = logreg.predict(test_features)
out_logreg = pd.DataFrame({'PassengerId': ids, 'Survived': prediccion_logreg})
print('Predicción Regresión Logística:', out_logreg.head(), sep='\n')

# Support Vector Machines
prediccion_svc = svc.predict(test_features)
out_svc = pd.DataFrame({'PassengerId': ids, 'Survived': prediccion_svc})
print('Predicción Soporte de Vectores:', out_svc.head(), sep='\n')

# KNN
prediccion_knn = knn.predict(test_features)
out_knn = pd.DataFrame({'PassengerId': ids, 'Survived': prediccion_knn})
print('Predicción Vecinos Más Cercanos:', out_knn.head(), sep='\n')


   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1 

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
