# In [None]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn import svm
from keras.optimizers import SGD, Adam
from keras.optimizers import RMSprop
import pandas as pd
import numpy as np
import re
import sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score

# Notebook display settings: show up to 50 DataFrame columns and print NumPy
# arrays in full instead of in summarised form.
pd.set_option('display.max_columns', 50)
np.set_printoptions(threshold=np.inf)

# Plotly imports; init_notebook_mode switches on plotly's offline mode for
# notebook output.
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold

# Load the train and test datasets from the local copies of the Titanic data.
train = pd.read_csv('C:/Users/tenki/Desktop/kaggleデータセット/titanic/train.csv')
test = pd.read_csv('C:/Users/tenki/Desktop/kaggleデータセット/titanic/test.csv')
# `data` is an alias of (not a copy of) the training frame: in-place edits made
# to `train` are visible through `data` until `train` is rebound.
data = train
# Keep the test-set passenger IDs; they are needed again when the submission
# files are written, after the column has been dropped from `test`.
PassengerId = test['PassengerId']

# print(train.head(3))

# Both frames receive the same feature engineering, so they are handled as a pair.
full_data = [train, test]

# Some features of my own that I have added in
for dataset in full_data:
    # Length of the passenger's name.
    dataset['Name_length'] = dataset['Name'].apply(len)
    # 1 when a cabin value is present, 0 when the entry is a float
    # (missing cabins are expected to arrive as float NaN — TODO confirm).
    dataset['Has_Cabin'] = dataset['Cabin'].apply(
        lambda cabin: 0 if isinstance(cabin, float) else 1)

# Feature engineering steps taken from Sina
# The training-set median fare is used to fill missing fares in BOTH frames.
# It is computed once up front; filling gaps with the median does not move it.
train_fare_median = train['Fare'].median()

for dataset in full_data:
    # FamilySize combines SibSp and Parch, plus one for the passenger.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone is 1 exactly when FamilySize is 1.
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    # Missing embarkation ports default to 'S'.
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

    # Missing fares default to the training-set median.
    dataset['Fare'] = dataset['Fare'].fillna(train_fare_median)

    # Missing ages are replaced by random integers drawn from
    # [mean - std, mean + std) of the frame's own known ages.
    # NOTE(review): no random seed is set in this section, so the imputed
    # ages (and everything downstream) differ from run to run.
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()
    age_null_count = age_missing.sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # Assign through .loc rather than chained indexing, which can end up
    # writing to a temporary copy and leaving the frame unchanged.
    dataset.loc[age_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Exploratory binned views of the training data only: fare quartiles and five
# equal-width age bands.
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
train['CategoricalAge'] = pd.cut(train['Age'], 5)


# Define function to extract titles from passenger names
def get_title(name: str) -> str:
    """Return the honorific found in a passenger name, or "" if none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Mr"`` in
    ``"Braund, Mr. Owen Harris"``.

    Args:
        name: Full passenger name.

    Returns:
        The title without its trailing period, or an empty string when the
        name contains no such pattern.
    """
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # instead of being an invalid string escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""


# Derive a Title feature from each name, fold the titles into a few groups,
# then recode Sex, Title, Embarked, Fare and Age as small integers.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
               'Sir', 'Jonkheer', 'Dona']
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in full_data:
    # Extract the title, group all non-common titles as "Rare", and merge the
    # alternative spellings into their common form.
    titles = dataset['Name'].apply(get_title).replace(rare_titles, 'Rare')
    for alias, canonical in title_synonyms.items():
        titles = titles.replace(alias, canonical)
    # Titles missing from the mapping become 0.
    dataset['Title'] = titles.map(title_mapping).fillna(0)

    # Mapping Sex
    sex_codes = {'female': 0, 'male': 1}
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)

    # Mapping Embarked
    port_codes = {'S': 0, 'C': 1, 'Q': 2}
    dataset['Embarked'] = dataset['Embarked'].map(port_codes).astype(int)

    # Mapping Fare: every bin mask is computed from the raw fares first, then
    # the bin index is written back in place.
    fare = dataset['Fare']
    fare_bins = [
        fare <= 7.91,
        (fare > 7.91) & (fare <= 14.454),
        (fare > 14.454) & (fare <= 31),
        fare > 31,
    ]
    for code, in_bin in enumerate(fare_bins):
        dataset.loc[in_bin, 'Fare'] = code
    dataset['Fare'] = dataset['Fare'].astype(int)

    # Mapping Age: same approach, five bands split at 16, 32, 48 and 64.
    age = dataset['Age']
    age_bins = [
        age <= 16,
        (age > 16) & (age <= 32),
        (age > 32) & (age <= 48),
        (age > 48) & (age <= 64),
        age > 64,
    ]
    for code, in_bin in enumerate(age_bins):
        dataset.loc[in_bin, 'Age'] = code


# Feature selection: remove the identifier/free-text columns from both frames,
# plus the two exploratory binned columns that exist only on `train`.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements + ['CategoricalAge', 'CategoricalFare'], axis=1)
test = test.drop(drop_elements, axis=1)
# print(train.head(3))

# Heatmap of the pairwise Pearson correlations between all remaining columns.
colormap = plt.cm.RdBu
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
correlations = train.astype(float).corr()
sns.heatmap(correlations, linewidths=0.1, vmax=1.0, square=True,
            cmap=colormap, linecolor='white', annot=True)
# plt.show()

# Pair plot of a subset of the features, coloured by survival.
# NOTE(review): newer seaborn releases renamed pairplot's `size` argument to
# `height` — confirm against the installed version.
pairplot_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'Parch', 'Fare',
                    'Embarked', 'FamilySize', 'Title']
g = sns.pairplot(
    train[pairplot_columns],
    hue='Survived',
    palette='seismic',
    size=1.2,
    diag_kind='kde',
    diag_kws={'shade': True},
    plot_kws={'s': 10},
)
g.set(xticklabels=[])
# plt.show()


# Turn the frames into NumPy arrays for the models: the Survived column is the
# target, every other remaining column is a feature.
y_train = train['Survived'].values
train = train.drop(['Survived'], axis=1)
x_train = train.values  # feature matrix of the training data
x_test = test.values  # feature matrix of the test data

# Positional hold-out split of the training rows: rows 0-600 are the fitting
# part, rows 601-890 the evaluation part.
x_train_tr, x_train_te = x_train[:601], x_train[601:891]
y_train_tr, y_train_te = y_train[:601], y_train[601:891]

# Build the neural network: two hidden ReLU layers of 20 units each and a
# single sigmoid output unit, trained with Adam on binary cross-entropy.
model = Sequential()
# The input width is taken from the feature matrix rather than hard-coded.
model.add(Dense(20, input_dim=x_train.shape[1], activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# `epochs` is the current name of the argument formerly spelled `nb_epoch`.
# NOTE(review): the model is fitted on all of x_train, which contains the
# x_train_te rows scored below, so the accuracies printed here are not a clean
# hold-out estimate — confirm whether that is intended.
model.fit(x_train, y_train, epochs=1000, batch_size=100, validation_split=0.2)
pred_tr = model.predict(x_train_tr)
pred_te = model.predict(x_train_te)
pred_test = model.predict(x_test)
# Bind the metric to `accuracy` so that the `accuracy_score` function imported
# from sklearn.metrics is not overwritten by a float.
loss, accuracy = model.evaluate(x_train_te, y_train_te)
print("loss:{0} -- accuracy:{1}".format(loss, accuracy))

# Threshold the predicted probabilities at 0.5; ravel() flattens the network
# output so each element is a scalar.
p = [1 if prob > 0.5 else 0 for prob in pred_te.ravel()]
# Share of matching labels, computed from the actual number of hold-out rows
# instead of a hard-coded 290.
print('Accuracy at NN:n', np.mean(np.asarray(p) == y_train_te))

# SVM stacked on the network: it is fitted on the network's predictions for
# the fitting rows and scored on its predictions for the hold-out rows.
classifier = svm.SVC(C=1.0, gamma=0.01)
classifier.fit(pred_tr, y_train_tr)
predicted = classifier.predict(pred_te)
stacked_accuracy = metrics.accuracy_score(y_train_te, predicted)
print('Accuracy at NN+svm1:n', stacked_accuracy)

# Second SVM: grid search over kernel, C and gamma directly on the features.
c_values = [1, 10, 100, 1000]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': [0.001, 0.0001]},
]

# n_jobs=-1 runs the search in parallel with as many workers as the machine allows.
clf = GridSearchCV(SVC(C=1), param_grid, cv=5, n_jobs=-1)
clf.fit(x_train, y_train)
# print(clf.best_estimator_)  # show the best parameters found

# Predictions of the best grid-search estimator for the test set.
pred = clf.predict(x_test)
# NOTE(review): the diagnostics below are disabled; `pred` holds predictions
# for x_test, not for the hold-out rows, so it cannot be scored against
# y_train_te as written.
# print(classification_report(y_train_te, pred))
# print(confusion_matrix(y_train_te, pred))
# print('Accuracy at NN+svm2:n', metrics.accuracy_score(y_train_te, pred))

# Write one submission file per model, each with the columns
# PassengerId, Survived.

# Threshold the network's predicted probabilities at 0.5 to get 0/1 labels;
# ravel() flattens the output so each element is a scalar.
test_pred = model.predict(x_test)
test_pred = [1 if prob > 0.5 else 0 for prob in test_pred.ravel()]

# Neural-network submission. Plain column selection replaces the `.ix`
# indexer, which current pandas releases no longer provide.
output_df = pd.DataFrame(test_pred, columns=['Survived'])
output_df['PassengerId'] = PassengerId
output_df = output_df[['PassengerId', 'Survived']]
output_df.to_csv('C:/Users/tenki/Desktop/kaggleデータセット/titanic/test_ans_nn.csv', index=False, encoding='utf-8')

# Grid-searched SVM submission, built from `pred`.
output_df = pd.DataFrame(pred, columns=['Survived'])
output_df['PassengerId'] = PassengerId
output_df = output_df[['PassengerId', 'Survived']]
output_df.to_csv('C:/Users/tenki/Desktop/kaggleデータセット/titanic/test_ans_svm.csv', index=False, encoding='utf-8')