In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import io
import matplotlib.pyplot as plt
import matplotlib as mpl
import xgboost as xgb
# Going to use these 4 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
# `sklearn.cross_validation` was removed in scikit-learn 0.20. Try it first so
# that old environments keep the exact class they had, then fall back to
# `sklearn.model_selection` so this cell still runs on current releases.
# NOTE: the two KFold classes take different constructor arguments
# (old: KFold(n, n_folds=...); new: KFold(n_splits=...)).
try:
    from sklearn.cross_validation import KFold
except ImportError:
    from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import regularizers

from sklearn.preprocessing import MinMaxScaler

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default size for every figure created below: 20 wide by 5 high.
plt.rcParams["figure.figsize"] = [20,5]

In [2]:

def clean_data(file_path):
    """Load a passenger CSV and return a fully numeric, min-max scaled frame.

    The file must contain the columns ``PassengerId``, ``Pclass``, ``Name``,
    ``Sex``, ``Age``, ``SibSp``, ``Parch``, ``Ticket``, ``Fare``, ``Cabin``
    and ``Embarked``. Any other column (e.g. ``Survived``) is carried through
    and scaled like the rest.

    Steps, in order:
      * ``Sex`` is encoded as 0 (female) / 1 (male).
      * ``isAlone`` (1 when both ``SibSp`` and ``Parch`` are 0) and
        ``Familysize`` (``SibSp + Parch + 1``) are derived.
      * Missing ``Age`` / ``Fare`` values are replaced by that column's mean
        in this file.
      * ``Cabin`` becomes a 0/1 flag: 1 when a cabin value is present.
      * ``Embarked`` is encoded S=0, Q=1, C=2, missing=3.
      * ``Name`` and ``Ticket`` are dropped; ``Pclass`` is replaced by
        one-hot ``Pclass_<value>`` columns appended at the end.
      * Every column is scaled with a ``MinMaxScaler`` fitted on this file,
        then the original (unscaled) ``PassengerId`` values are written back.

    NOTE(review): the scaler is fitted independently on every file passed
    in, so two files (e.g. train and test) are scaled with different
    min/max values — confirm this is intended.

    Parameters
    ----------
    file_path : str or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, all columns numeric, ``PassengerId`` unscaled.
    """
    df = pd.read_csv(file_path)
    passenger_id = df['PassengerId']

    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})
    df['isAlone'] = np.where((df['SibSp'] == 0) & (df['Parch'] == 0), 1, 0)
    df['Familysize'] = df['SibSp'] + df['Parch'] + 1

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated and does nothing under
    # pandas Copy-on-Write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Presence flag computed in one step, so no integers are ever written
    # into the original string column.
    df['Cabin'] = df['Cabin'].notna().astype('int')

    # 'N' is a placeholder for a missing port so it gets its own code (3).
    df['Embarked'] = df['Embarked'].fillna('N').map({'S': 0, 'Q': 1, 'C': 2, 'N': 3})

    df = df.drop(['Name', 'Ticket'], axis=1)
    pclass_dummies = pd.get_dummies(df['Pclass'], prefix='Pclass')
    df = pd.concat([df.drop('Pclass', axis=1), pclass_dummies], axis=1)

    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    # PassengerId is an identifier, not a feature: restore the raw values.
    df_scaled['PassengerId'] = passenger_id
    return df_scaled

In [3]:
# Cleaned training frame. PassengerId is only an identifier, so it is removed
# before the frame is used as model input.
train = clean_data('../input/train.csv').drop(columns=['PassengerId'])
train.head(10)

In [4]:
# Axes3D is never referenced by name below; presumably it is imported so that
# projection='3d' is registered (needed on older matplotlib) — confirm.
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Feature matrix (every column except the label) and the label vector.
x = train.drop(columns=['Survived']).values
y = train['Survived'].values

# Standardize the features before running PCA on them.
x = StandardScaler().fit_transform(x)

# Project the standardized features onto three principal components.
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2', 'principal component 3'],
)

# Put the label next to the components so points can be selected per class.
finalDf = pd.concat([principalDf, train['Survived']], axis=1)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.set_title('3 component PCA', fontsize=20)
ax.set_xlabel('PC 1', fontsize=15)
ax.set_ylabel('PC 2', fontsize=15)
ax.set_zlabel('PC 3', fontsize=15)

# One scatter call per class: Survived == 1 in red, Survived == 0 in green.
targets = [1, 0]
colors = ['r', 'g']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['Survived'] == target
    class_points = finalDf.loc[indicesToKeep]
    ax.scatter(
        class_points['principal component 1'],
        class_points['principal component 2'],
        class_points['principal component 3'],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()

In [5]:
# Share of the total variance captured by each of the three fitted components.
pca.explained_variance_ratio_

In [6]:
import matplotlib.mlab as mlab

# Age distribution straight from the CSV (left) versus after clean_data()
# (right); the red vertical line marks the 50% quantile of each.
train_raw = pd.read_csv('../input/train.csv')
mpl.style.use('bmh')

ax_raw = plt.subplot(1, 2, 1)
ax_raw.axvline(train_raw['Age'].quantile(0.5), c='r')
ax_raw.hist(train_raw['Age'].dropna())  # raw ages contain NaN, drop them first
ax_raw.set_xlabel('Edad raw')

ax_parsed = plt.subplot(1, 2, 2)
ax_parsed.axvline(train['Age'].quantile(0.5), c='r')
ax_parsed.hist(train['Age'])
ax_parsed.set_xlabel('Edad parsed');

In [7]:
# Fare distribution straight from the CSV (left) versus after clean_data()
# (right); the red vertical line marks the 50% quantile of each.
train_raw = pd.read_csv('../input/train.csv')

ax_raw = plt.subplot(1, 2, 1)
ax_raw.axvline(train_raw['Fare'].quantile(0.5), c='r')
ax_raw.hist(train_raw['Fare'].dropna())
ax_raw.set_xlabel('Fare raw')

ax_parsed = plt.subplot(1, 2, 2)
ax_parsed.axvline(train['Fare'].quantile(0.5), c='r')
ax_parsed.hist(train['Fare'])
ax_parsed.set_xlabel('Fare parsed')

In [8]:
# Cleaned test frame. PassengerId is split off (it is needed for the
# submission file) so that `test` holds model features only.
# NOTE(review): clean_data() fits a fresh MinMaxScaler on each file it reads,
# so these features are scaled with the test file's own min/max rather than
# the training file's — confirm this is intended.
test = clean_data('../input/test.csv')
test_passenger_id = test.pop('PassengerId')

In [9]:
def create_model(input_dim=12):
    """Build and compile the small feed-forward binary classifier.

    Architecture: Dense(5, relu, L2 weight penalty 1e-4) -> Dense(5, no
    activation argument) -> Dense(1, sigmoid). Compiled with the Adam
    optimizer, binary cross-entropy loss and accuracy as the metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features fed to the first layer. Defaults to 12,
        the value that was previously hard-coded, so existing
        ``create_model()`` calls are unaffected.

    Returns
    -------
    The compiled ``Sequential`` model (untrained).
    """
    model = Sequential()
    model.add(Dense(5, input_dim=input_dim, activation='relu',
                    kernel_regularizer=regularizers.l2(0.0001)))
    # NOTE(review): this hidden layer is given no activation argument —
    # confirm that is intentional.
    model.add(Dense(5))
    # Single sigmoid unit: the output is one value per row.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
# Instantiate the (untrained) network with the default settings.
model = create_model()


In [10]:
# Fit on all training rows (features = every column except Survived) for 7000
# epochs in batches of 100, silently (verbose=0); %time reports the duration.
# NOTE(review): no validation data is passed, so `history` only holds
# training metrics.
%time history = model.fit(train.drop(['Survived'], axis=1), train['Survived'], epochs=7000, batch_size=100, verbose=0)

In [11]:
# Training loss per epoch; the x axis is the 1-based epoch number.
epoch_losses = history.history['loss']
plt.plot(range(1, len(epoch_losses) + 1), epoch_losses, color='r')

In [12]:
# Loss recorded for the last training epoch.
print('final loss {}'.format(history.history['loss'][-1]))

In [13]:
# `Sequential.predict_classes` no longer exists in current Keras/TensorFlow.
# For a model ending in a single sigmoid unit it was equivalent to
# thresholding the predicted probability at 0.5, which is done explicitly here.
survival_proba = model.predict(test)
# Flatten the (n_rows, 1) prediction array into one 0/1 label per passenger.
y_test = (survival_proba > 0.5).astype('int32').ravel()

# Two-column frame in the order PassengerId, Survived, written without the index.
my_submission = pd.DataFrame({'PassengerId': test_passenger_id, 'Survived': y_test})
my_submission[['PassengerId', 'Survived']].to_csv('submission.csv', index=False)
my_submission[['PassengerId', 'Survived']].head(10)