#### Data Dictionary

* survival => Survival (0 = No, 1 = Yes)
* pclass =>	Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
* sex => Sex
* Age => Age in years
* sibsp => # of siblings / spouses aboard the Titanic
* parch => # of parents / children aboard the Titanic
* ticket => Ticket number	
* fare => Passenger fare	
* cabin => Cabin number
* embarked => Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)



#### Variable Notes
#### pclass: A proxy for socio-economic status (SES)
* 1st = Upper
* 2nd = Middle
* 3rd = Lower



#### age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5



#### sibsp: The dataset defines family relations in this way...
* Sibling = brother, sister, stepbrother, stepsister
* Spouse = husband, wife (mistresses and fiancés were ignored)



#### parch: The dataset defines family relations in this way...
* Parent = mother, father
* Child = daughter, son, stepdaughter, stepson
* Some children travelled only with a nanny, therefore parch=0 for them.




In [None]:
# import libraries and silence warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# -- Feature Engineering --

In [None]:
# Load the labelled training split and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../input/train.csv')

df.head()

In [None]:
# Load the unlabelled test split and preview its first rows.
test_df = pd.read_csv(filepath_or_buffer='../input/test.csv')
test_df.head()

In [None]:
# Set the target labels aside. The column is selected by name rather than by
# position so a change in the CSV's column order cannot silently pick up a
# different column.
surv_col = df['Survived']
surv_col.head()

In [None]:
# Keep every column from the third one onward as the raw training features.
feature_columns = df.columns[2:]
train_df = df.loc[:, feature_columns]
train_df.head()

In [None]:
# Keep the test passenger ids for the submission files. Selected by name (the
# same 'PassengerId' column that is dropped from test_df below) instead of by
# position, so column order does not matter.
pessengerId = test_df['PassengerId']
pessengerId.head()

In [None]:
# Remove the id column from the test frame in place; it is not a feature.
test_df.drop(columns=['PassengerId'], inplace=True)

test_df.head()

In [None]:
train_df.head()

In [None]:
# Stack the train and test rows so feature engineering is applied to both at
# once. ignore_index=True gives the combined frame a fresh 0..n-1 index;
# without it the test rows would reuse the training rows' index labels, which
# makes any label-based selection or alignment ambiguous.
concated_df = pd.concat([train_df, test_df], ignore_index=True)

concated_df.head()

In [None]:
concated_df.info()

In [None]:
from sklearn import preprocessing as prep

# Label Encoding for Sex Column

In [None]:
# Encode the Sex strings as integer codes with a LabelEncoder.
le = prep.LabelEncoder()
sex_codes = le.fit_transform(concated_df['Sex'])
concated_df['Sex'] = sex_codes

# `df` is the raw training frame, so this previews the original, unencoded labels.
df['Sex'][0:10]

In [None]:
concated_df.head()

In [None]:
concated_df.info()

### Some columns have missing values. First, I need to fill those columns.

In [None]:
# Replace missing embarkation ports with the placeholder string '0' so the
# column can be label-encoded afterwards.
embarked = concated_df.Embarked.fillna(value='0')

embarked.unique()

# Label Encoding for Embarked Column

In [None]:
# Refit the shared encoder on the filled port values and store the integer codes.
concated_df['Embarked'] = le.fit_transform(embarked)

concated_df['Embarked'].unique()

In [None]:
concated_df.head()

In [None]:
concated_df.tail()

In [None]:
concated_df.dtypes

In [None]:
# Show the distinct values of each categorical / count column.
for column in ('Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'):
    print(column + ':', concated_df[column].unique())

### I will remove the Cabin column

In [None]:
# Drop the Cabin column from the combined frame in place.
concated_df.drop(columns=['Cabin'], inplace=True)

concated_df.head()

In [None]:
# Break every name at commas and periods; the pieces are used below for the
# surname (first piece) and the title (second piece).
NameSplit = concated_df['Name'].str.split('[,.]')

NameSplit.head()

In [None]:
# The second piece of each split name is the title; trim surrounding spaces.
titles = [pieces[1].strip() for pieces in NameSplit]
titles[:10]

In [None]:
# New feature: store the title extracted from each passenger name in a
# 'Title' column, then preview the frame.

concated_df['Title'] = titles

concated_df.head()

In [None]:
concated_df.Title.unique()

In [None]:
# Combine Madame ('Mme') and Mademoiselle ('Mlle') into a single category.
# The abbreviation for Mademoiselle is 'Mlle'; matching on the misspelling
# 'Mmle' would leave those rows unmerged.
# The mask is passed as a plain boolean array so the assignment is positional,
# and .loc writes into the frame itself rather than into a temporary array.

is_french_title = concated_df['Title'].isin(['Mme', 'Mlle']).values
concated_df.loc[is_french_title, 'Title'] = 'Mlle'

In [None]:
# Keep reducing: fold rare titles into 'Sir' and 'Lady'.
# Assignments go through .loc with a boolean array mask so they modify the
# frame directly; writing through `.values` only works when pandas happens to
# hand back a writable view of the column.
# NOTE(review): 'Jonkheer' is grouped under 'Lady' as in the original; confirm
# that grouping is intended.

sir_titles = ['Capt', 'Don', 'Major', 'Sir']
lady_titles = ['Dona', 'Lady', 'the Countess', 'Jonkheer']

concated_df.loc[concated_df['Title'].isin(sir_titles).values, 'Title'] = 'Sir'
concated_df.loc[concated_df['Title'].isin(lady_titles).values, 'Title'] = 'Lady'

In [None]:
concated_df.Title.unique()

In [None]:
# Turn the reduced titles into integer codes with the shared encoder.

title_codes = le.fit_transform(concated_df['Title'])
concated_df['Title'] = title_codes
concated_df.head()

In [None]:
# New feature: family size.
# Siblings/spouses (SibSp) plus parents/children (Parch) plus the passenger.

family_size = (concated_df['SibSp'] + concated_df['Parch'] + 1).values
concated_df['FamilySize'] = family_size

In [None]:
concated_df.head()

In [None]:
# The first piece of each split name is the surname; trim surrounding spaces.
surnames = [pieces[0].strip() for pieces in NameSplit]
surnames[:10]

In [None]:
# Build a family identifier by appending the family size to the surname.
concated_df['Surname'] = surnames
size_as_text = concated_df['FamilySize'].astype(str)
concated_df['FamilyID'] = concated_df['Surname'] + size_as_text
concated_df.head()

In [None]:
# Label every family of size 2 or less with the shared id 'Small'.
# The write goes through .loc so it modifies the frame directly; assigning
# into `.values` only works when pandas returns a writable view of the column.

is_small_family = concated_df['FamilySize'].values <= 2
concated_df.loc[is_small_family, 'FamilyID'] = 'Small'

concated_df.head()

In [None]:
# check up the frequency of family ids
concated_df.FamilyID.value_counts()

## There are too many family IDs with only a few members; perhaps some families had different last names. I'll clean this up.

In [None]:
# Pair every family id with the number of rows that carry it.
family_counts = concated_df.FamilyID.value_counts()
freq = list(zip(family_counts.index.tolist(), family_counts.values))

type(freq)

In [None]:
# Convert the (family id, count) pairs to a 2-D array; the counts are cast
# back to int wherever they are compared further down.
freq = np.asarray(freq)

freq[:10]

In [None]:
freq.shape

In [None]:
# How many family ids occur 2 times or fewer?
occurrence_counts = freq[:, 1].astype(int)
freq[occurrence_counts <= 2].shape

In [None]:
# Keep only the (family id, count) rows whose count is 2 or less.
rare_rows = freq[:, 1].astype(int) <= 2
freq = freq[rare_rows]

In [None]:
# Relabel the rarely occurring family ids collected in `freq` as 'Small'.
# The write goes through .loc so it modifies the frame directly; assigning
# into `.values` only works when pandas returns a writable view of the column.
is_rare_family = concated_df['FamilyID'].isin(freq[:, 0]).values
concated_df.loc[is_rare_family, 'FamilyID'] = 'Small'
concated_df.FamilyID.value_counts()

In [None]:
# Turn the family ids into integer codes with the shared encoder.

family_codes = le.fit_transform(concated_df['FamilyID'])
concated_df['FamilyID'] = family_codes
concated_df['FamilyID'].unique()

In [None]:
# Select the columns that will be fed to the models.
selected_features = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Title', 'Embarked', 'FamilySize',
    'FamilyID',
]
concated_reduce = concated_df[selected_features]

concated_reduce.head()

In [None]:
concated_reduce.Age.unique()

In [None]:
concated_reduce.info()

## There are missing values in the Age and Fare columns, so I will fill them with the median

In [None]:
# Fill the missing Age and Fare values with each column's median.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column operates on an intermediate object and is not guaranteed to
# update concated_reduce itself.
age_median = concated_reduce['Age'].median()
fare_median = concated_reduce['Fare'].median()
concated_reduce = concated_reduce.fillna({'Age': age_median, 'Fare': fare_median})

In [None]:
concated_reduce.info()

# So, the dataset (concated_reduce) is ready to be split into train and test sets.

In [None]:
# Split the combined frame back into its train and test parts. The training
# rows were stacked first, so the boundary is the number of training labels
# (derived from surv_col rather than hard-coded).
n_train = len(surv_col)
train_final = concated_reduce.iloc[:n_train].copy()
test_final = concated_reduce.iloc[n_train:].copy()

In [None]:
train_final.head()

In [None]:
test_final.head()

## At the outset, I split some columns off the dataset. Now I will use those columns to create the training dataset

In [None]:
# Feature matrix for training: the engineered training rows as a NumPy array.
X = train_final.values

X

In [None]:
# Target vector: the label column set aside from the raw training frame,
# as a NumPy array.
y = surv_col.values

y

In [None]:
X.shape

In [None]:
y.shape

In [None]:
# Feature matrix for prediction: the engineered test rows as a NumPy array.
test_data = test_final.values

test_data

# Creating Neural Network with Keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

In [None]:
# Fully connected binary classifier: 32 -> 64 -> 64 -> 12 -> 1 units, with
# dropout after each 64-unit layer and a sigmoid output.
# `kernel_initializer` is the Keras 2+ name of the weight-initialiser argument
# (the Keras 1 spelling was `init`), matching the `epochs=` keyword used when
# fitting. The input width is taken from X instead of being hard-coded.
n_features = X.shape[1]

model = Sequential()

model.add(Dense(32, kernel_initializer='random_uniform', activation='relu', input_dim=n_features))
model.add(Dense(64, kernel_initializer='random_uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, kernel_initializer='random_uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(12, kernel_initializer='random_uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='random_uniform', activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the network on the full training set.
model.fit(x=X, y=y, batch_size=64, epochs=500, verbose=1)

In [None]:
# Predicted survival probabilities for the test rows.
pred = model.predict(x=test_data)

In [None]:
pred

In [None]:
# Convert the predicted probabilities to 0/1 labels: values <= 0.5 become 0,
# everything else becomes 1. Done as one vectorized threshold instead of
# growing an array with np.append inside a Python loop, which copies the whole
# array on every iteration.
outputBin = np.where(np.ravel(pred) <= .5, 0.0, 1.0)
output = outputBin.astype(int)

In [None]:
output

In [None]:
# Columns of the neural-network submission: passenger id and predicted label.
d = dict(PassengerId=pessengerId, Survived=output)

In [None]:
# Assemble the neural-network submission table.
final_df = pd.DataFrame(d)

In [None]:
final_df.head()

In [None]:
# Write the neural-network submission to disk. to_csv returns None when it is
# given a path, so `final` is None and the last line displays nothing.
final = final_df.to_csv(path_or_buf='new_result.csv', index=False) #convert to csv file

final

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training set and report its accuracy on that
# same training data.
rf = RandomForestClassifier(n_estimators=350, max_depth=15, random_state=42)
rf.fit(X, y)
train_accuracy = rf.score(X, y)

print("train accuracy: {} ".format(train_accuracy))


In [None]:
# Predicted labels for the test rows from the random forest.
rf_pred = rf.predict(X=test_data)

In [None]:
rf_pred

In [None]:
# Columns of the random-forest submission: passenger id and predicted label.
r = dict(PassengerId=pessengerId, Survived=rf_pred)

In [None]:
# Assemble the random-forest submission table.
final_rf = pd.DataFrame(r)

In [None]:
final_rf.head(13)

In [None]:
# Write the random-forest submission to disk. The frame written is final_rf
# (the random-forest predictions), not final_df, which holds the
# neural-network predictions. The return value of to_csv (None when given a
# path) is not assigned, so final_rf stays available as a DataFrame.
final_rf.to_csv('random_forest_result.csv', index=False) #convert to csv file

final_rf