In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing the libraries**

In [None]:
# Linear Algebra
import numpy as np

# Data processing
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style

# Machine Learning Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

# **Loading the Data**

In [None]:
# Read the Titanic train/test CSV files into DataFrames.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

In [None]:
# A notebook cell only renders its final expression, so the test preview must
# be passed to display() explicitly; otherwise its output is silently dropped.
display(test.head())
train.head()

# **Data Exploration/Analysis**

In [None]:
# Print column names, dtypes and non-null counts of the training frame.
train.info()

The training set has 891 entries and 11 features + target variable.

**Features with short description:**

- 0 `PassengerId`: Unique Id of the passenger
- 1 `Survived`: Survival
- 2 `Pclass`: Passenger class (1st, 2nd, 3rd)
- 3 `Name`: Name of the passenger
- 4 `Sex`: Gender
- 5 `Age`: Age in years
- 6 `SibSp`: Number of siblings / spouses aboard the Titanic
- 7 `Parch`: Number of parents / children aboard the Titanic
- 8 `Ticket`: Ticket number
- 9 `Fare`: Passenger fare
- 10 `Cabin`: Cabin number
- 11 `Embarked`: Port of embarkation


In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the training frame's numeric columns.
train.describe()

Above we can see that **about 38.4% of the passengers in the training set survived the Titanic** (the mean of `Survived`).

In [None]:
# Show the first 8 rows of the training frame.
train.head(8)

# **Missing Data**

In [None]:
# Per-column missing-value summary: absolute count and share of rows in percent.
null_mask = train.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

The **‘Cabin’** feature needs further investigation, but it looks like we might want to drop it from the dataset, since **77% of its values are missing.**

In [None]:
# Column labels of the training frame as a NumPy array.
train.columns.values

Above you can see the **11 features + the target variable (survived)**.

#  Age and Sex:

In [None]:
survived = 'survived'
not_survived = 'not_survived'
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
women = train[train['Sex'] == 'female']
men = train[train['Sex'] == 'male']

# Left panel: age histograms of female passengers, split by outcome.
ax = axes[0]
women_lived_ages = women.loc[women['Survived'] == 1, 'Age'].dropna()
women_died_ages = women.loc[women['Survived'] == 0, 'Age'].dropna()
sns.distplot(women_lived_ages, bins=18, label=survived, ax=ax, kde=False)
sns.distplot(women_died_ages, bins=40, label=not_survived, ax=ax, kde=False)
ax.legend()
ax.set_title('Female')

# Right panel: the same two histograms for male passengers.
ax = axes[1]
men_lived_ages = men.loc[men['Survived'] == 1, 'Age'].dropna()
men_died_ages = men.loc[men['Survived'] == 0, 'Age'].dropna()
sns.distplot(men_lived_ages, bins=18, label=survived, ax=ax, kde=False)
sns.distplot(men_died_ages, bins=18, label=not_survived, ax=ax, kde=False)
ax.legend()
ax.set_title('Male')

**Men** have a **high probability of survival** when they are between **18 and 30 years old**. For **women**, the survival chances are higher between **14 and 40**.

**Note:** Infants also have a little bit higher probability of survival.

#  Embarked, Pclass and Sex:

In [None]:
# One row of point plots per port of embarkation: survival rate by class, split by sex.
# `height` is the current name of the keyword seaborn used to call `size`
# (renamed in seaborn 0.9; newer releases reject `size`).
# The result is not stored under the name `FacetGrid`, to avoid shadowing the class.
embarked_grid = sns.FacetGrid(train, row='Embarked', height=4.5, aspect=1.6)
# Explicit order / hue_order: without them every facet infers the category
# order from its own subset of rows, so the facets may not line up.
embarked_grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex',
                  order=[1, 2, 3], hue_order=['male', 'female'])
embarked_grid.add_legend()

Embarked seems to be correlated with survival, depending on the gender.

#  Pclass

In [None]:
# Bar chart of the mean of Survived for each passenger class.
sns.barplot(data=train, x='Pclass', y='Survived')

Pclass contributes to a person's chance of survival, especially if this person is in class 1.

In [None]:
# Grid of age histograms: one row per passenger class, one column per outcome.
# `height` is the current name of the keyword seaborn used to call `size`
# (renamed in seaborn 0.9; newer releases reject `size`).
grid = sns.FacetGrid(train, col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=0.5, bins=20)
grid.add_legend()

From the plot above, there is a high probability that a person in Pclass 3 will not survive.

# SibSp and Parch:

In [None]:
data = [train, test]
for dataset in data:
    # Number of family members aboard: siblings/spouses plus parents/children.
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    # 1 = travelling with at least one relative, 0 = travelling alone.
    # (The flag used to be inverted relative to its name: it was 1 for
    # passengers with no relatives aboard.)
    dataset['not_alone'] = (dataset['relatives'] > 0).astype(int)

train['not_alone'].value_counts()

In [None]:
# `factorplot` was renamed to `catplot` in seaborn 0.9 and later removed;
# kind='point' selects the point plot that factorplot drew by default.
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
axes = sns.catplot(x='relatives', y='Survived', data=train, kind='point', aspect=2.5)

There is a **high probability** of survival with **1 to 3** relatives.

## Data Preprocessing

Drop **‘PassengerId’** from the train set, because it does not contribute to a person's survival probability.

In [None]:
# Remove the identifier column from the training frame (the test frame keeps it).
train = train.drop(columns=['PassengerId'])

**Missing Data:**

In [None]:
import re

# Deck letter -> numeric code; "U" is the placeholder letter given to missing cabins below.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F":6, "G":7, "U": 8}
deck_letters = re.compile("([a-zA-Z]+)")
data = [train, test]

for dataset in data:
    cabins = dataset['Cabin'].fillna("U0")
    dataset['Cabin'] = cabins
    # First run of letters in the cabin string.
    letters = cabins.map(lambda cabin: deck_letters.search(cabin).group(0))
    # Letters that are not in the lookup table end up as 0.
    dataset['Deck'] = letters.map(deck).fillna(0).astype(int)

# Cabin is dropped once Deck has been derived from it.
train = train.drop(columns=['Cabin'])
test = test.drop(columns=['Cabin'])
    

**Age:**

In [None]:
data = [train, test]

# Imputation statistics are taken from the training set only, and computed
# before any frame is modified, so both frames are filled from the same range.
mean = train["Age"].mean()
std = train["Age"].std()

# Fixed seed so the random imputation is reproducible across runs.
rng = np.random.default_rng(42)

for dataset in data:
    missing = dataset["Age"].isnull()

    # One random whole-number age in [mean - std, mean + std) per missing entry.
    rand_age = rng.integers(int(mean - std), int(mean + std), size=int(missing.sum()))

    # Fill NaN values in the Age column with the generated values.
    age_slice = dataset["Age"].copy()
    age_slice[missing] = rand_age
    # Cast this frame's own ages. Assigning train["Age"] here would overwrite
    # the test set's ages with index-aligned values from the training set.
    dataset["Age"] = age_slice.astype(int)

train["Age"].isnull().sum()

**Embarked:**

In [None]:
# Summary of the Embarked column (for a text column: count, unique, top, freq).
train['Embarked'].describe()

In [None]:
# Passengers with no recorded port of embarkation are assigned 'S'.
common_value = 'S'
data = [train, test]

for frame in data:
    port_missing = frame['Embarked'].isnull()
    frame.loc[port_missing, 'Embarked'] = common_value

**Converting features:**

In [None]:
# Re-check dtypes and non-null counts of the training frame before converting features.
train.info()

'Fare' is float64, convert it to int64.

**Fare:**

Converting “Fare” from float to int64, using the “astype()” function

In [None]:
data = [train, test]

for frame in data:
    # Missing fares become 0, then every fare is truncated to a whole number.
    frame['Fare'] = frame['Fare'].fillna(0).astype(int)

**Name:**
We use the Name feature to extract each passenger's title.

In [None]:
data = [train, test]
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Uncommon titles that are collapsed into the single 'Rare' bucket.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Spelling variants folded into their common equivalent.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in data:
    # Extract the title: the first run of letters that follows a space and is
    # terminated by a period. A raw string is used because "\." in a plain
    # string literal is an invalid escape sequence.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Replace titles with a more common title or with 'Rare'.
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(title_aliases)
    # Convert titles into numbers; anything unmapped becomes 0. The cast keeps
    # the column integer-typed even when some titles were unmapped.
    dataset['Title'] = dataset['Title'].map(titles).fillna(0).astype(int)

train = train.drop(['Name'], axis=1)
test = test.drop(['Name'], axis=1)

**Sex:**

Convert ‘Sex’ feature into numeric.

In [None]:
# Numeric encoding for the Sex column: male -> 0, female -> 1.
genders = {"male": 0, "female": 1}
data = [train, test]

for frame in data:
    sex_codes = frame['Sex'].map(genders)
    frame['Sex'] = sex_codes

**Ticket:**

In [None]:
# Summary of the Ticket column (for a text column: count, unique, top, freq).
train['Ticket'].describe()

In [None]:
# Remove the Ticket column from both frames.
train = train.drop(columns=['Ticket'])
test = test.drop(columns=['Ticket'])

**Embarked:**

Convert ‘Embarked’ feature into numeric.

In [None]:
# Numeric encoding for the port of embarkation: S -> 0, C -> 1, Q -> 2.
ports = {"S": 0, "C":1, "Q":2}
data = [train, test]

for frame in data:
    port_codes = frame['Embarked'].map(ports)
    frame['Embarked'] = port_codes

# **Creating Categories:**

**Age:**

In [None]:
data = [train, test]

# (lower bound, exclusive; upper bound, inclusive; category code)
age_bands = [
    (11, 18, 1),
    (18, 22, 2),
    (22, 27, 3),
    (27, 33, 4),
    (33, 40, 5),
    (40, 66, 6),
]

for dataset in data:
    dataset['Age'] = dataset['Age'].astype(int)
    # Masks are built from a snapshot of the ages in years, so codes that have
    # already been written into the column are never re-classified.
    years = dataset['Age'].copy()
    dataset.loc[years <= 11, 'Age'] = 0
    for lower, upper, code in age_bands:
        dataset.loc[(years > lower) & (years <= upper), 'Age'] = code
    # Ages above 66 share category 6 with the 40-66 band.
    dataset.loc[years > 66, 'Age'] = 6

# let's see how it's distributed
train['Age'].value_counts()

**Fare:**

In [None]:
# Show the first 10 rows of the training frame.
train.head(10)

In [None]:
data = [train, test]

# (lower bound, exclusive; upper bound, inclusive; category code)
fare_bands = [
    (7.91, 14.454, 1),
    (14.454, 31, 2),
    (31, 99, 3),
    (99, 250, 4),
]

for dataset in data:
    # Masks are built from a snapshot of the fares, so codes that have already
    # been written into the column are never re-classified.
    paid = dataset['Fare'].copy()
    dataset.loc[paid <= 7.91, 'Fare'] = 0
    for lower, upper, code in fare_bands:
        dataset.loc[(paid > lower) & (paid <= upper), 'Fare'] = code
    dataset.loc[paid > 250, 'Fare'] = 5
    dataset['Fare'] = dataset['Fare'].astype(int)

# **Creating new Features**

**1. Age times Class**

In [None]:
data = [train, test]
for frame in data:
    # Interaction feature: product of the Age and Pclass columns.
    frame['Age_Class'] = frame['Age'].mul(frame['Pclass'])

**2. Fare per Person**

In [None]:
# Rebuild the frame list here, as every other cell does, instead of relying on
# `data` left over from the previous cell.
data = [train, test]

for dataset in data:
    # Fare divided by the passenger's party size (relatives plus themselves),
    # truncated to a whole number.
    # NOTE(review): if the fare-binning cell has already run, 'Fare' holds a
    # category code here rather than the ticket price; confirm this is intended.
    dataset['Fare_Per_Person'] = dataset['Fare']/(dataset['relatives']+1)
    dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)
    

In [None]:
# Take a last look at the first 10 rows of the training set before training the models.
train.head(10)

# **Building Machine Learning Models**

In [None]:
# Target and feature matrix for training; the test features are every test
# column except the passenger id.
Y_train = train["Survived"]
X_train = train.drop(columns="Survived")
X_test = test.drop(columns="PassengerId").copy()

**Stochastic Gradient Descent (SGD)**

In [None]:
# Fixed random_state: SGD shuffles the training data between epochs, so an
# unseeded run reports a different score every time the notebook is executed.
sgd = linear_model.SGDClassifier(max_iter=5, tol=None, random_state=42)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

# Accuracy in percent, measured on the training data itself (not a held-out estimate).
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)

**Random Forest**

In [None]:
# Fixed random_state: bootstrap sampling and feature selection are random, so
# an unseeded forest gives different results on every run.
random_forest = RandomForestClassifier(n_estimators=150, random_state=42)
random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

# Accuracy in percent, measured on the training data itself (not a held-out estimate).
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

**Logistic Regression**

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

# Predict on the test features, like every other model cell; predicting on
# X_train here left Y_pred holding training-set predictions.
Y_pred = logreg.predict(X_test)

# Accuracy in percent, measured on the training data itself (not a held-out estimate).
acc_logreg = round(logreg.score(X_train, Y_train) * 100, 2)

**K Nearest Neighbor**

In [None]:
# 3-nearest-neighbours classifier; fit() returns the fitted estimator itself.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

# Accuracy in percent on the training data.
knn_train_accuracy = knn.score(X_train, Y_train)
acc_knn = round(knn_train_accuracy * 100, 2)

**Gaussian Naive Bayes**

In [None]:
# Gaussian naive Bayes classifier; fit() returns the fitted estimator itself.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)

# Accuracy in percent on the training data.
gaussian_train_accuracy = gaussian.score(X_train, Y_train)
acc_gaussian = round(gaussian_train_accuracy * 100, 2)

**Perceptron**

In [None]:
# Fixed random_state: the perceptron shuffles the training data between
# epochs, so an unseeded run is not reproducible.
perceptron = Perceptron(max_iter=5, random_state=42)
perceptron.fit(X_train, Y_train)

Y_pred = perceptron.predict(X_test)

# Accuracy in percent, measured on the training data itself (not a held-out estimate).
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)

**Linear Support Vector Machine**

In [None]:
# Fixed random_state so the solver's internal randomness does not change the
# fitted model between runs.
linear_svc = LinearSVC(random_state=42)
linear_svc.fit(X_train, Y_train)

Y_pred = linear_svc.predict(X_test)

# Accuracy in percent, measured on the training data itself (not a held-out estimate).
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)

**Decision Tree**

In [None]:
# Fixed random_state: features are randomly permuted at each split, so ties
# between equally good splits can be broken differently from run to run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)

Y_pred = decision_tree.predict(X_test)

# Accuracy in percent, measured on the training data itself (not a held-out estimate).
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

#  **Best Model**

In [None]:
# (model label, training accuracy in %) for every model fitted above.
model_scores = [
    ('Support Vector Machines', acc_linear_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_logreg),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    # Label fixed: it was misspelled 'Stochastic Gradient Decent' in the table.
    ('Stochastic Gradient Descent', acc_sgd),
    ('Decision Tree', acc_decision_tree),
]
results = pd.DataFrame(model_scores, columns=['Model', 'Score'])

# Best score first, with the score as the row index.
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

# **K-Fold Cross Validation**

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a random forest on the training data.
# Fixed random_state so the per-fold scores are reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(rf, X_train, Y_train, cv=10, scoring="accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

# **Summary**


We started with data exploration, where we got a feel for the dataset, looked for missing data, and learned which features were important, using Seaborn and Matplotlib for visualization. During preprocessing, we filled in the missing values, converted the features into numbers, grouped values into categories, and created some new features. We then trained eight different machine learning models, selected one of them (random forest), and applied cross-validation to it.