## Titanic Original DataSet 

Dataset downloaded from 
biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw passenger table, then report its row count and column names.
titanic = pd.read_csv('./Data/OriginalTitanicDataSet.csv')
print(titanic.shape[0])
print(titanic.columns)

1309
Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')


In [None]:
# Display summary statistics for the raw table (bare expression so the
# notebook renders the result).
titanic.describe()

In [None]:
# Number of passengers with a missing age. Use the vectorized Series.sum()
# rather than the Python builtin, which iterates the Series element by element.
titanic['age'].isnull().sum()

In [None]:
# Keep only rows that have both an age and a fare.
t2 = titanic[~titanic['age'].isna()]
# Take an explicit copy: later cells add columns to t3, and assigning into a
# slice of a slice triggers pandas' SettingWithCopyWarning.
t3 = t2[~t2['fare'].isna()].copy()

In [None]:
# Rows remaining after dropping missing age/fare.
print(t3.shape[0])

In [None]:
# Strict variant: drop every row that has a missing value in any column.
t4 = titanic.dropna(axis='index', how='any')

In [None]:
# Rows that are complete in every column.
print(t4.shape[0])

In [None]:
# Group the filtered rows by survival outcome.
# NOTE(review): `a` is not referenced by any later visible cell.
a = t3.groupby(['survived'])

In [None]:
# Quick visual checks of age and class against survival. Each plot gets its
# own plt.show(): without it, successive seaborn calls in one cell draw onto
# the same axes and overlay each other.
sns.boxplot(x='survived', y='age', data=titanic)
plt.show()

sns.boxplot(x='survived', y='pclass', data=titanic)
plt.show()

sns.boxplot(x='pclass', y='survived', data=titanic)
plt.show()

# With seaborn's default estimator the bar height is the mean of 'survived'
# per class.
sns.barplot(x='pclass', y='survived', data=titanic)
plt.show()

## Exploratory analysis

### Check the number of passengers & demography

In [None]:
# Number of passengers in the filtered table.
print(t3.shape[0])

In [None]:
# Passenger counts per sex, printed and shown as a pie chart.
sex_counts = t3.groupby(['sex'])['name'].count()
print(sex_counts)
print(len(t3))
sex_counts.plot(kind='pie', autopct='%1.0f%%')
plt.ylabel('gender')
plt.axis('equal')
plt.show()

In [None]:
# Passenger counts per survival outcome, printed and shown as a pie chart.
survived_counts = t3.groupby(['survived'])['name'].count()
print(survived_counts)
survived_counts.plot(kind='pie', autopct='%1.0f%%')
plt.ylabel('survived')
plt.axis('equal')
plt.show()

In [None]:
# Passenger counts per ticket class, printed and shown as a pie chart.
pclass_counts = t3.groupby(['pclass'])['name'].count()
print(pclass_counts)
pclass_counts.plot(kind='pie', autopct='%1.0f%%')
plt.ylabel('pclass')
plt.axis('equal')
plt.show()

In [None]:
# Five 20-year age bands, (0, 20] through (80, 100], stored as a new column.
abins = pd.IntervalIndex.from_tuples(
    [(lower, lower + 20) for lower in range(0, 100, 20)]
)
t3['agebins'] = pd.cut(t3['age'], abins)

In [None]:
# Passenger counts per age band, printed and shown as a pie chart.
age_group_counts = t3.groupby(['agebins'])['name'].count()
print(age_group_counts)
age_group_counts.plot(kind='pie', autopct='%1.0f%%')
plt.ylabel('age groups')
plt.axis('equal')
plt.show()

In [None]:
# Passenger counts per 'sibsp' value, printed and shown as a bar chart.
sibsp_counts = t3.groupby(['sibsp'])['name'].count()
print(sibsp_counts)
sibsp_counts.plot(kind='bar')
plt.show()

In [None]:
# Passenger counts per 'parch' value, printed and shown as a bar chart.
parch_counts = t3.groupby(['parch'])['name'].count()
print(parch_counts)
parch_counts.plot(kind='bar')
plt.show()

## Multivariate analysis

In [None]:
# Summary statistics of the fare column (displayed as the cell result).
t3['fare'].describe()

In [None]:
# 1 when the passenger has a positive 'parch' or 'sibsp' count, else 0.
has_relatives = (t3['parch'] > 0) | (t3['sibsp'] > 0)
t3['relations'] = has_relatives.astype('int64')

In [None]:
# Number of passengers flagged as travelling with relatives.
t3['relations'].sum()

In [None]:
# Passenger counts for each (relations, survived) combination.
print (t3.groupby(['relations', 'survived'])['name'].count())

In [None]:
# Stacked bar chart of survival counts, split by the 'relations' flag.
# t31 holds the object returned by .plot().
t31 = t3.groupby(['relations', 'survived'])['name'].count().unstack().plot(kind='bar', stacked=True)
plt.xlabel('relations')
# Render explicitly, as the other plotting cells do, so the cell's output is
# the figure rather than the repr of the xlabel call.
plt.show()

In [None]:
# Stacked bar chart of survival counts per ticket class.
class_by_outcome = t3.groupby(['pclass', 'survived'])['name'].count().unstack()
class_by_outcome.plot(kind='bar', stacked=True)
plt.show()

In [None]:
# Stacked bar chart of survival counts per age band.
age_by_outcome = t3.groupby(['agebins', 'survived'])['name'].count().unstack()
age_by_outcome.plot(kind='bar', stacked=True)
plt.show()

In [None]:
# Fare bands. The intervals are left-closed ([0, 10), [10, 30), ...) so that a
# fare of exactly 0 lands in the first band; with the default right-closed
# intervals, (0, 10] excludes 0 and pd.cut silently assigns NaN to those rows.
# NOTE(review): fares exactly on an edge (10, 30, 100) now fall into the upper
# band, and 800 itself is excluded -- confirm this suits the data.
farebins = pd.IntervalIndex.from_tuples(
    [(0, 10), (10, 30), (30, 100), (100, 800)], closed='left'
)
t3['farebins'] = pd.cut(t3['fare'], farebins)

In [None]:
# Stacked bar chart of survival counts per fare band.
fare_by_outcome = t3.groupby(['farebins', 'survived'])['name'].count().unstack()
fare_by_outcome.plot(kind='bar', stacked=True)
plt.show()

## Building a simple model

In [None]:
# Save the prepared table, including the derived columns, to the working
# directory. to_csv also writes the row index by default.
t3.to_csv('ModelingDataTitanic.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [None]:
# Candidate predictor columns for the models below; 'survived' is left out of
# this frame.
t3t = t3[[
    'pclass', 'sibsp', 'fare', 'relations',
    'sex', 'parch', 'agebins', 'farebins',
]]

In [None]:
# Preview the first rows of the predictor frame.
t3t.head()

In [None]:
def ttsplit(features, test_size=0.33, random_state=42):
    """Build a one-hot encoded train/test split for the chosen predictors.

    Reads the notebook-level frames ``t3t`` (predictors) and ``t3`` (source of
    the ``'survived'`` target).

    Args:
        features: Column name(s) of ``t3t`` to use as predictors.
        test_size: Passed to ``train_test_split``; defaults to the original
            hard-coded 0.33.
        random_state: Seed passed to ``train_test_split``; defaults to the
            original hard-coded 42.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    feats = t3t[features]
    # get_dummies expands the non-numeric columns into indicator columns.
    idvs = pd.get_dummies(feats)
    X_train, X_test, y_train, y_test = train_test_split(
        idvs, t3['survived'], test_size=test_size, random_state=random_state
    )
    return (X_train, X_test, y_train, y_test)
    

def svc_model(features):
    """Fit an SVC on the selected features and return its test-set accuracy.

    Args:
        features: Column names of ``t3t`` to use as predictors; forwarded to
            ``ttsplit``.

    Returns:
        Accuracy of the fitted classifier on the test split.
    """
    # No visible cell imports accuracy_score, so the scoring line would raise
    # NameError; import it locally.
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = ttsplit(features)
    clf = SVC(gamma='auto')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred)



In [None]:
# SVC accuracy with ticket class as the only predictor.
f = ['pclass']
print (svc_model(f))

In [None]:
# SVC accuracy after adding 'sex'.
f = ['pclass', 'sex']
print (svc_model(f))

In [None]:
# SVC accuracy after adding 'sibsp'.
f = ['pclass', 'sex', 'sibsp']
print (svc_model(f))

In [None]:
# SVC accuracy after adding 'parch'.
f = ['pclass', 'sex', 'sibsp', 'parch']
print (svc_model(f))

In [None]:
# SVC accuracy after adding the derived 'relations' flag.
f = ['pclass', 'sex', 'sibsp', 'parch', 'relations']
print (svc_model(f))

In [None]:
# SVC accuracy after adding the age bands.
f = ['pclass', 'sex', 'sibsp', 'parch', 'relations', 'agebins']
print (svc_model(f))

In [None]:
# SVC accuracy with the full feature set, including the fare bands.
f = ['pclass', 'sex', 'sibsp', 'parch', 'relations', 'agebins', 'farebins']
print (svc_model(f))

## Training another model - Nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier
def knnmodel(features):
    """Fit a 15-nearest-neighbours classifier and return its test-set accuracy.

    Args:
        features: Column names of ``t3t`` to use as predictors; forwarded to
            ``ttsplit``.

    Returns:
        Accuracy of the fitted classifier on the test split.
    """
    # No visible cell imports accuracy_score, so the scoring line would raise
    # NameError; import it locally.
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = ttsplit(features)
    neigh = KNeighborsClassifier(n_neighbors=15)
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    return accuracy_score(y_test, y_pred)


In [None]:
# Nearest-neighbours accuracy on the full feature set. `f` stays bound at
# notebook level and is reused by the later treemodel(f) cell.
f = ['pclass', 'sex', 'sibsp', 'parch', 'relations', 'agebins', 'farebins']
print (knnmodel(f))

## Training another model - Decision tree

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

def treemodel(features):
    """Cross-validate and fit a decision tree; return its test-set accuracy.

    Prints the 10-fold cross-validation scores computed on the training split,
    then fits a single ``DecisionTreeClassifier`` on the whole training split.

    Args:
        features: Column names of ``t3t`` to use as predictors; forwarded to
            ``ttsplit``.

    Returns:
        Accuracy of the fitted tree on the test split.
    """
    # No visible cell imports accuracy_score, so the scoring line would raise
    # NameError; import it locally.
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = ttsplit(features)
    clf = DecisionTreeClassifier(random_state=0)
    print (cross_val_score(clf, X_train, y_train, cv=10))
    # Fit explicitly before predicting on the held-out split.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred)
    

In [None]:
# Evaluate the tree on the feature list currently bound to `f`; the returned
# accuracy is displayed as the cell result.
treemodel(f)