**Part 1: Read in and explore the data**

In [None]:
import numpy as np 
import pandas as pd 
# Read the Titanic training CSV from the relative ../input directory into a DataFrame
data = pd.read_csv("../input/Titanictrain.csv")
# Preview the first rows as the cell output
data.head()

In [None]:
# (rows, columns) of the loaded frame
data.shape

In [None]:
# Summary statistics for the numeric columns
data.describe()

In [None]:
# Column data types as inferred by read_csv
data.dtypes

In [None]:
# Non-null count per column; columns with a lower count contain missing values
data.count()

In [None]:
# Count survivors by sex and overall. "Name" is only used as a column to count rows on.
survived = data['Survived'] == 1
print("Number of Males who survived:", data["Name"][(data['Sex']=="male") & survived].count())
print("Number of Females who survived:", data["Name"][(data['Sex']=="female") & survived].count())
# This line counts every survivor regardless of sex, so it must not be labelled "Females"
print("Total number of passengers who survived:", data["Name"][survived].count())

In [None]:
# Passenger count for every (class, sex) combination
by_class_and_sex = data.groupby(by=["Pclass", "Sex"])
dfclasssex = by_class_and_sex["Sex"].count()
dfclasssex

In [None]:
# Passenger count for every (class, sex, survival outcome) combination
by_class_sex_outcome = data.groupby(by=["Pclass", "Sex", "Survived"])
dfclss = by_class_sex_outcome["Sex"].count()
dfclss

**Part 2 Data Visualization**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Survival rate of female vs. male passengers
sns.barplot(data=data, x="Sex", y="Survived")
female_outcomes = data.loc[data["Sex"] == 'female', "Survived"].value_counts(normalize=True)
male_outcomes = data.loc[data["Sex"] == 'male', "Survived"].value_counts(normalize=True)
# Entry 1 of the normalized counts is the share of survivors; scale it to a percentage
print("Percentage of females who survived:", female_outcomes[1]*100)
print("Percentage of males who survived:", male_outcomes[1]*100)

In [None]:
 #According to the visualization, females have a much higher chance of survival than males. 

In [None]:
# Survival rate per passenger class
sns.barplot(data=data, x="Pclass", y="Survived")
# Normalized outcome frequencies for each class, keyed by class number
class_outcomes = {
    pclass: data.loc[data["Pclass"] == pclass, "Survived"].value_counts(normalize=True)
    for pclass in (1, 2, 3)
}
print("Percentage of Pclass = 1 who survived:", class_outcomes[1][1]*100)
print("Percentage of Pclass = 2 who survived:", class_outcomes[2][1]*100)
print("Percentage of Pclass = 3 who survived:", class_outcomes[3][1]*100)

In [None]:
# According to the graph, people in higher class have a higher chance of survival. 

In [None]:
# Survival rate by number of siblings/spouses aboard
sns.barplot(data=data, x="SibSp", y="Survived")
# Normalized outcome frequencies for the three SibSp values that are reported below
sibsp_outcomes = {
    n_sibsp: data.loc[data["SibSp"] == n_sibsp, "Survived"].value_counts(normalize=True)
    for n_sibsp in (0, 1, 2)
}
print("Percentage of SibSp = 0 who survived:", sibsp_outcomes[0][1]*100)
print("Percentage of SibSp = 1 who survived:", sibsp_outcomes[1][1]*100)
print("Percentage of SibSp = 2 who survived:", sibsp_outcomes[2][1]*100)

In [None]:
# According to the graph, people with more siblings or spouses aboard were generally less likely to survive.
# The exception is people with no siblings or spouses, who were less likely to survive than people with one or two.

In [None]:
# Survival rate by number of parents/children aboard (Parch)
sns.barplot(x="Parch", y="Survived",data=data)
plt.show()

In [None]:
#According to the graph, people with fewer than four parents or children aboard are more likely to survive than people with four or more.
#People traveling alone are less likely to survive than those with 1-3 parents or children.

In [None]:
# Survival rate by age group.
# Missing ages get a -0.5 sentinel so that they fall into the (-1, 0] "Unknown" bin.
# The sentinel is applied to a temporary series only: overwriting data["Age"] would
# erase the NaNs that the later "complete" flag and mean imputation rely on, and the
# models would then be trained on an age of -0.5 for every passenger of unknown age.
age_for_binning = data["Age"].fillna(-0.5)
bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]
labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'New Adult', 'Young Adult', 'Adult', 'Senior']
data['AgeGroup'] = pd.cut(age_for_binning, bins, labels = labels)
sns.barplot(x="AgeGroup", y="Survived", data=data)
plt.show()

In [None]:
#According to the graph, babies are more likely to survive than any other age group. 

**Part 3 Predictive Modeling**

In [None]:
# Recode Sex as an integer indicator (1 = female, 0 = anything else) rather than the
# strings "1"/"0", so the numeric estimators downstream do not depend on implicit
# string-to-float coercion.
data['Sex'] = (data['Sex'] == "female").astype(int)
# Keep the original positional index as an explicit row id before re-indexing
data["rowid"] = data.index
# Use the passenger name as the index; set_index also removes the "Name" column
data = data.set_index("Name")
data.head()

In [None]:
# Discard the columns that are not used for modeling, in a single call
unused_columns = ["Fare", "Embarked", "AgeGroup"]
data = data.drop(unused_columns, axis=1)
data.head()

In [None]:
# Keep only the modeling columns, in a fixed order.
# .copy() makes the result an independent frame, so the column assignments that
# follow do not write onto a slice of the previous frame (SettingWithCopyWarning).
data = data[["rowid","Survived","Sex","Age","Pclass","SibSp","Parch"]].copy()
data.head()

In [None]:
# Flag rows by whether Age is present: 1 when a value exists, 0 when it is NaN
age_is_missing = data["Age"].isna()
data["complete"] = np.where(age_is_missing, 0, 1)
# Show a few of the rows flagged as incomplete
incomplete_rows = data.loc[data["complete"]==0]
print(incomplete_rows.head())
data.head()

In [None]:
# Random Forest

In [None]:
# Predictor columns shared by every model below
features = ["Pclass","Sex","Age"]

In [None]:
# Preview the predictor columns before imputation
print(data[features].head())

In [None]:
# Mean-impute missing values column by column.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer is its
# replacement and always works per column, which matches the old axis=0 setting.
# The fallback keeps the cell running on installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp = imp.fit(data[features])

In [None]:
# Overwrite the predictor columns with the imputer's output (missing entries replaced by the column mean)
data[features] = imp.transform(data[features])

In [None]:
# Inspect the rows that were flagged as incomplete, now that imputation has run
print(data[(data['complete']==0)].head())

In [None]:
from sklearn.ensemble import RandomForestClassifier
print("Train a Random Forest model without NA and factorized data")
# 500 entropy-criterion trees, trained on all cores, with a fixed seed
rf = RandomForestClassifier(
    criterion="entropy",
    n_estimators=500,
    n_jobs=-1,
    random_state=1,
)
# fit() hands back the fitted estimator, which is kept under the name clf
X_train = data[features]
y_train = data['Survived']
clf = rf.fit(X_train, y_train)
print(clf)

In [None]:
# Importance of each predictor in the fitted forest, in the same order as `features`
importances = clf.feature_importances_
print(importances)

In [None]:
# age plays the most important role among passengerclass, sex and age.

In [None]:
import matplotlib.pyplot as plt
# Horizontal bar chart of the forest's feature importances, smallest at the bottom.
# The importances are read from the fitted model directly, so this cell does not
# depend on a variable created in another cell.
variables = clf.feature_importances_
# argsort yields the feature order from least to most important. It is used to
# reorder both the bar widths and the tick labels, while the bars themselves sit at
# consecutive positions 0..n-1. Using the argsort values as positions would leave
# the bars unsorted.
indices = np.argsort(variables)
positions = np.arange(len(indices))
plt.barh(positions, variables[indices], color='r')
plt.yticks(positions, [features[i] for i in indices])
plt.xlabel("Feature importance")
plt.show()

In [None]:
#Logistic Regression

In [None]:
import sklearn.linear_model as lm
import sklearn.metrics as mm
print("Train a Logistic Regression model")
glm = lm.LogisticRegression()
clf_glm = glm.fit(data[features], data["Survived"])

# Predicted probability of class 1 (second predict_proba column) for every training row
survival_proba_lr = clf_glm.predict_proba(data[features])[:, 1]
prd_lr = pd.DataFrame(survival_proba_lr)
# Assign the raw array so values are matched by position, not by index label
data['prd.lr'] = survival_proba_lr

frames = [data]
result = pd.concat(frames)

# Both metrics compare the 0/1 label with the predicted probability on the training rows
mse_lr = mm.mean_squared_error(result['Survived'], result['prd.lr'])
r2_lr = mm.r2_score(result['Survived'], result['prd.lr'])
print('Coefficients: \n', clf_glm.coef_)
print("Mean squared error: %.2f" % mse_lr)
print('Variance score: %.2f' % r2_lr)

In [None]:
#Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
print("Train a Naive Bayse model")
clfnb = GaussianNB()
clfnb.fit(data[features], data["Survived"])

# Predicted probability of class 1 (second predict_proba column) for every training row
survival_proba_nb = clfnb.predict_proba(data[features])[:, 1]
prd_nb = pd.DataFrame(survival_proba_nb)
# Assign the raw array so values are matched by position, not by index label
data['prd.nb'] = survival_proba_nb

# Only one frame is concatenated here, so `result` is a copy of `data`
frames = [data]
result = pd.concat(frames)
result.head()


In [None]:
#AdaBoosted decision Tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
print("Train a Adaboost model")
# 200 boosted decision stumps (depth-1 trees) using the discrete SAMME algorithm.
# random_state is fixed, as for the random forest, so re-running the notebook
# reproduces the same model and the same predicted probabilities.
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200,
                         random_state=1)

bdt.fit(data[features], data["Survived"])

# Predicted probability of class 1 for every training row; .values drops the frame's
# positional index so the assignment is by position, not by label
prd_ada = pd.DataFrame(bdt.predict_proba(data[features])[:,1])
data['prd.ada'] = prd_ada.values

# Only one frame is concatenated here, so `result` is a copy of `data`
frames = [data]
result = pd.concat(frames)
result.head()

In [None]:
#Support Vector Machine
from sklearn.svm import SVC
# probability=True enables predict_proba; its probability estimates come from an
# internal cross-validation that shuffles the data, so random_state is fixed to make
# the predicted probabilities reproducible across runs.
clfsvm = SVC(probability=True, random_state=1)
clfsvm.fit(data[features], data["Survived"])

# Predicted probability of class 1 for every training row; .values drops the frame's
# positional index so the assignment is by position, not by label
prd_svm = pd.DataFrame(clfsvm.predict_proba(data[features])[:,1])
data['prd.svm'] = prd_svm.values


# Only one frame is concatenated here, so `result` is a copy of `data`
frames = [data]
result = pd.concat(frames)
result.head()