In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline 
from sklearn.linear_model import LogisticRegression


In [78]:
# Load the training split (expects train.csv next to the notebook).
df_train = pd.read_csv("train.csv")

# Quick sanity check on what was loaded: (rows, columns).
print("Shape of dataframe: ", df_train.shape)

# Bare last expression -> rich preview of the first five rows.
df_train.head(5)

Shape of dataframe:  (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<h1>Breve análisis</h1>
<p>Por lo que se conoce de la historia de esta tragedia, se sabe que las personas de mayor clase social tenían preferencia para abordar los botes de emergencia. Además, se dio preferencia a la evacuación de mujeres y niños, por lo que podemos tomar como principales características a analizar las siguientes:</p>
<ul>
    <li>Pclass</li>
    <li>Sex</li>
    <li>Age</li>
</ul>
<p>A pesar de ser las principales características, existieron personas que no encajaron en este molde; por ejemplo, los hombres a cargo de los botes de emergencia. Por lo tanto, es importante analizar de manera más profunda cada una de las características.</p>

In [10]:
# Count missing values per column (isna is the same check as isnull).
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
# "Cabin" is missing in 687 of the 891 rows (see the null counts), presumably
# why it is discarded rather than imputed.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it once the column is gone is a no-op
# instead of a KeyError.
df_train = df_train.drop(columns=["Cabin"], errors="ignore")

In [17]:
# List the distinct values (in order of first appearance) of the two
# family-size columns.
print("Clases de SibSp: ", pd.unique(df_train["SibSp"]))
print("Clases de Parch: ", pd.unique(df_train["Parch"]))

Clases de SibSp:  [1 0 3 4 2 5 8]
Clases de Parch:  [0 1 2 5 3 4 6]


<h1>Preparación de datos</h1>

In [79]:
# One-hot encode the two categorical predictors. drop_first=True removes the
# first level of each, so the remaining indicator columns are 2, 3 (Pclass)
# and "male" (Sex).
dummies_class = pd.get_dummies(df_train.Pclass, drop_first=True)
dummies_sex = pd.get_dummies(df_train.Sex, drop_first=True)

# Feature matrix: class indicators, sex indicator, and the raw Age column.
# Age still contains missing values here; they are handled by the imputation
# step of the model pipeline.
X = pd.concat(
    [dummies_class, dummies_sex, df_train["Age"]],
    axis="columns",
)

In [80]:
# Preview the first rows of the assembled feature matrix.
X.head()

Unnamed: 0,2,3,male,Age
0,0,1,1,22.0
1,0,0,0,38.0
2,0,1,0,26.0
3,0,0,0,35.0
4,0,1,1,35.0


In [81]:
# Target vector: the "Survived" column of the training frame.
Y  = df_train["Survived"]

In [47]:
# Compare the shape of Age as a 2-D single-column array with its shape as a
# plain 1-D Series.
print(np.reshape(df_train["Age"].values, (-1, 1)).shape)
print((len(df_train["Age"]),))

(891, 1)
(891,)


In [54]:
# X already holds the raw "Age" column from the pd.concat that builds it, so
# the former `X["Age"] = df_train["Age"]` re-assignment was a no-op in a
# top-to-bottom run. It is removed because, if executed after X has been
# rebound to the test-set features, it would silently index-align training
# ages into the test matrix.
# Missing values per feature column (Age is imputed inside the model pipeline).
X.isnull().sum()

2       0
3       0
male    0
Age     0
dtype: int64

<h1>Train model</h1>

In [82]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [83]:
# Mean-impute missing values column-wise. `Imputer` was deprecated in
# scikit-learn 0.20 and removed in 0.22 in favour of
# `sklearn.impute.SimpleImputer`; prefer the new class and fall back to the
# legacy one only on old installations.
# NOTE(review): the notebook's import cell still imports `Imputer` from
# sklearn.preprocessing, which itself fails on scikit-learn >= 0.22 and needs
# the same treatment.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='mean')
except ImportError:  # scikit-learn < 0.20
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Impute -> standardise -> PCA (default: all components kept) -> logistic regression.
pipe = make_pipeline(imputer, StandardScaler(), PCA(), LogisticRegression())
pipe.fit(x_train, y_train)

# Mean accuracy on the held-out split.
pipe.score(x_test, y_test)



0.7985074626865671

<h1>Test de Kaggle</h1>

In [86]:
# Load the unlabeled test split (expects test.csv next to the notebook).
df_test = pd.read_csv('test.csv')

# Bare last expression -> rich preview of the first five rows.
df_test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [87]:
# Build the test-set feature matrix with the same recipe used for training:
# class indicators and sex indicator (first level of each dropped), plus raw Age.
# NOTE(review): this rebinds dummies_class, dummies_sex and X, so after this
# cell X no longer refers to the training features; re-running the
# train/test split afterwards requires rebuilding X first.
dummies_class = pd.get_dummies(df_test.Pclass, drop_first=True)
dummies_sex = pd.get_dummies(df_test.Sex, drop_first=True)

X = pd.concat(
    [dummies_class, dummies_sex, df_test["Age"]],
    axis="columns",
)


In [88]:
# Predict with the fitted pipeline on X, which at this point holds the
# test-set features built from df_test.
yhat = pipe.predict(X)

In [89]:
# Load the sample submission file; it is used as the template
# (PassengerId, Survived) for our own predictions.
df_send = pd.read_csv('gender_submission.csv')

# Bare last expression -> rich preview of the first five rows.
df_send.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [92]:
# yhat follows the row order of df_test, while df_send was read from a
# separate file. The assignment below is positional, so first verify that both
# frames list the same passengers in the same order.
assert df_send["PassengerId"].equals(df_test["PassengerId"]), \
    "submission template and test set are not aligned on PassengerId"
df_send["Survived"] = yhat

In [94]:
# Write the submission file to the working directory.
df_send.to_csv(
    'my_gender_submission.csv',
    index=False,  # keep the pandas row index out of the file
)

<p>Para leer sobre validación cruzada y grid search, ve a la página: <a href="https://stackabuse.com/cross-validation-and-grid-search-for-model-selection-in-python/">Aquí</a></p>