In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from matplotlib import pyplot as plt # plotting
import seaborn as sns # statistical plots (used for the correlation heatmaps)

from sklearn.preprocessing import LabelEncoder # integer-encodes categorical columns

SEED = 1 # value handed to np.random.seed before the label-encoding step

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train = pd.read_csv('../input/train.csv')
train.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test = pd.read_csv('../input/test.csv')
test.head()

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train.info()

In [None]:
# Both frames live in one list so each feature-engineering step below can be
# applied to train and test in the same loop. The list holds references, so
# in-place changes made through `datasets` are visible on `train` and `test`.
datasets = [train,test]

In [None]:
# Extract the honorific from each passenger name: the run of ASCII letters
# immediately followed by a period (illustratively, "Mr" in "Smith, Mr. John").
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape (which emits a
# SyntaxWarning on recent Python versions).
for df in datasets:
    df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Frequency of every extracted title in the training set; NaN (no match) is counted too.
train.Title.value_counts(dropna=False)

In [None]:
# Exact-match lookup from title to category code. 'Don' belongs to group 4:
# the original if/elif chain also listed it under group 7, but that branch
# could never be reached.
_TITLE_CODES = {
    'Mr': 1,
    'Miss': 2, 'Mrs': 2,
    'Master': 3, 'Sir': 3,
    'Rev': 4, 'Jonkheer': 4, 'Don': 4, 'Countess': 4, 'Col': 4,
    'Dr': 5,
    'Major': 6, 'Capt': 6,
    'Lady': 7, 'Mlle': 7,
    'Mme': 8, 'Ms': 8,
}

# Code returned for any title that is not in the table (including missing values).
_OTHER_TITLE_CODE = 9


def encTitle(title):
    """Map a passenger title to an integer category code in 1..9.

    Parameters
    ----------
    title : object
        The title to encode. It is converted with ``str()`` first, so a
        missing value (NaN) becomes the string ``'nan'`` and falls through
        to the catch-all code.

    Returns
    -------
    int
        The group code for a known title, or 9 for anything else.

    Notes
    -----
    Matching is on the whole title. The previous implementation tested
    ``title in ('Mr')`` and ``title in ('Dr')``; those parentheses do not make
    tuples, so the test was a substring check and inputs such as ``''``,
    ``'M'`` or ``'r'`` were encoded as 1 (and ``'D'`` as 5).
    """
    return _TITLE_CODES.get(str(title), _OTHER_TITLE_CODE)

In [None]:
# Add the numeric title code next to the raw title in both frames.
for frame in datasets:
    frame['TitleEnc'] = frame['Title'].map(encTitle)

# How many training passengers fall under each title code.
train['TitleEnc'].value_counts(dropna=False)

In [None]:
# Columns that are no longer needed once their information has been extracted.
raw_text_columns = ['Title','Name','Ticket','Cabin']

for frame in datasets:
    # 1 when a cabin is recorded, 0 when the Cabin field is missing.
    frame['hasCabin'] = np.where(frame['Cabin'].notna(), 1, 0)
    # Missing ports become the literal category 'None' so they can be label-encoded.
    frame['Embarked'] = frame['Embarked'].fillna('None')
    # Dropped in place so `train` / `test` themselves are updated.
    frame.drop(columns=raw_text_columns, inplace=True)

train.head()

In [None]:
np.random.seed(SEED)

# One encoder per categorical column, keyed by column name and fitted on the
# training data only; the same fitted encoder is then applied to both frames.
le = {column: LabelEncoder().fit(train[column]) for column in ('Sex', 'Embarked')}

for frame in datasets:
    for column, encoder in le.items():
        frame[column] = encoder.transform(frame[column])

train.head()

In [None]:
# Family code, first matching rule wins:
#   SibSp > 1 -> 0
#   Parch > 1 -> 2
#   otherwise -> 1
for frame in datasets:
    rules = [(frame['SibSp'] > 1).to_numpy(), (frame['Parch'] > 1).to_numpy()]
    frame['Family'] = np.select(rules, [0, 2], default=1)

survival_by_family = train.groupby(['Family'])['Survived'].mean()
print('Sobreviventes:\n', survival_by_family)

In [None]:
# Sorted distinct codes present in the training set. The imputation below no
# longer iterates over them; they are still defined in case another cell
# reads these names.
titles = np.sort(train.TitleEnc.unique())
family = np.sort(train.Family.unique())

# Last-resort value, taken from the training ages BEFORE any imputation, for
# rows whose (TitleEnc, Family) group has no observed age at all.
fallback_age = train['Age'].mean()

for df in datasets:
    # Mean observed age of each row's (TitleEnc, Family) group within the same
    # frame, aligned to the frame's index. Unlike looping over the training
    # codes, this also covers combinations that only occur in the test frame.
    group_mean_age = df.groupby(['TitleEnc', 'Family'])['Age'].transform('mean')

    # Fill order: own value -> group mean -> global training mean. The final
    # fallback guarantees no NaN is left, which previously happened whenever
    # every age in a group was missing.
    df['Age'] = np.round(df['Age'].fillna(group_mean_age).fillna(fallback_age))

train.info()

In [None]:
# Replace missing fares with the mean fare of the same frame.
for frame in datasets:
    mean_fare = frame['Fare'].mean()
    frame['Fare'] = frame['Fare'].fillna(mean_fare)

In [None]:
# Annotated correlation matrix of the training columns at this stage.
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(train.corr(), annot=True, ax=ax)
plt.show()

In [None]:
# Mean of Survived (i.e. survival rate) for every title code in the training set.
survival_by_title = train.groupby(['TitleEnc'])['Survived'].mean()
print('Sobreviventes:\n', survival_by_title)

In [None]:
def newTitleEnc(n):
    """Collapse a title code into one of three coarser groups.

    Codes 1, 4 and 6 map to 1; codes 2, 3 and 5 map to 2; codes 7 and 8 map
    to 3. Any other value (for example the catch-all code 9) maps to 1.
    """
    regrouping = (
        ((1, 4, 6), 1),
        ((2, 3, 5), 2),
        ((7, 8), 3),
    )
    for old_codes, new_code in regrouping:
        if n in old_codes:
            return new_code
    # Codes outside every listed group fall back to group 1.
    return 1

In [None]:
# Annotated correlation matrix of the training columns.
# NOTE(review): `train` has not changed since the previous heatmap cell, so
# this renders the same figure again; confirm whether the cell is still needed.
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(train.corr(), annot=True, ax=ax)
plt.show()

In [None]:
# Overwrite TitleEnc with its coarser regrouping in both frames.
# Caution: this replaces the column in place, so running the cell a second
# time would regroup the already-regrouped codes.
for frame in datasets:
    frame['TitleEnc'] = frame['TitleEnc'].map(newTitleEnc)

survival_by_title = train.groupby(['TitleEnc'])['Survived'].mean()
print('Sobreviventes:\n', survival_by_title)

In [None]:
def ageGroup(age):
    """Bucket an age into 1 (under 18), 2 (18 up to but excluding 60) or 3.

    Bucket 3 is the fall-through: it is returned for ages of 60 and above and
    for any value for which both comparisons are false, such as NaN.
    """
    return 1 if age < 18 else (2 if age < 60 else 3)

In [None]:
# Derive the age bucket column from Age in both frames.
for frame in datasets:
    frame['AgeGroup'] = frame['Age'].map(ageGroup)

survival_by_age_group = train.groupby(['AgeGroup'])['Survived'].mean()
print('Sobreviventes:\n', survival_by_age_group)

In [None]:
# Build the composite 'Preference' feature step by step:
#   start value * Sex, then floor-divided by Pclass, then by AgeGroup.
for df in datasets:
    # NOTE(review): the original first line was the truncated statement
    # `np.where(df['TitleEnc']` (unclosed parenthesis), which made the whole
    # cell a SyntaxError. The intended condition and values cannot be
    # recovered from the notebook, so the only visible operand, TitleEnc,
    # is used unchanged as the starting value. Confirm the intended rule.
    df['Preference'] = df['TitleEnc']
    df['Preference'] = df['Preference'] * df['Sex']
    df['Preference'] = df['Preference'] // df['Pclass']
    df['Preference'] = df['Preference'] // df['AgeGroup']

print('Sobreviventes:\n',train.groupby(['Preference'])['Survived'].mean())

In [None]:
# Age, SibSp and Parch have been turned into AgeGroup and Family above, so
# the source columns are removed from both frames (in place).
source_columns = ['Age','SibSp','Parch']
for frame in datasets:
    frame.drop(columns=source_columns, inplace=True)

train.head()

In [None]:
# Annotated correlation matrix of the final training columns.
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(train.corr(), annot=True, ax=ax)
plt.show()

In [None]:
# Summary statistics of the engineered training columns.
train.describe()

In [None]:
def show_results(results):
  """Print the mean test score and a mean +/- 2 standard deviations band.

  `results` must provide a 'test_score' entry exposing `.mean()` and `.std()`
  (the notebook passes the output of `cross_validate`). Both printed lines
  are expressed in percent with two decimals. Nothing is returned.
  """
  scores = results['test_score']
  mean_score = scores.mean()
  score_std = scores.std()
  lower_bound = (mean_score - 2 * score_std)*100
  upper_bound = (mean_score + 2 * score_std) * 100
  print("mean Accuracy: %.2f" % (mean_score * 100))
  print("Accuracy: [%.2f, %.2f]" % (lower_bound, upper_bound))

In [None]:
# Target vector, and the feature matrix without the identifier and the target.
y_train = train['Survived']
x_train = train.drop(columns=['PassengerId','Survived'])

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.preprocessing import StandardScaler

# Depth-3 decision tree with a fixed random_state.
tree = DecisionTreeClassifier(random_state=0, max_depth=3)

# Two-fold cross-validation in which the rows are grouped by their TitleEnc
# value; the per-fold test scores are summarised by show_results.
cv = GroupKFold(n_splits=2)
title_groups = train.TitleEnc.values
results = cross_validate(
    tree,
    x_train,
    y_train,
    groups=title_groups,
    cv=cv,
    return_train_score=False,
)
show_results(results)

In [None]:
# Fit on the full training set, then report the score on that same data
# (a training score, not a held-out estimate).
train_score = tree.fit(x_train, y_train).score(x_train, y_train)

print(train_score)

In [None]:
import graphviz
from sklearn.tree import export_graphviz

features = x_train.columns

# Render the fitted tree as a Graphviz graph. out_file=None makes
# export_graphviz return the DOT source as a string instead of writing a file.
# feature_names is given as a plain list of column names: some scikit-learn
# releases validate this argument strictly and reject a pandas Index.
dot_data = export_graphviz(
    tree,
    out_file=None,
    filled=True,
    rounded=True,
    feature_names=list(features),
    class_names=["Morre", "Sobrevive"],
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Column dtypes and non-null counts of the engineered test frame, as a check before predicting.
test.info()

In [None]:
# Score the test passengers with the fitted tree; the identifier column is
# not a feature, so it is left out of the matrix handed to predict().
x_test = test.drop(columns=['PassengerId'])
y_pred = tree.predict(x_test)

# Two-column submission frame: passenger id and predicted label.
my_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred,
})

my_submission.head()

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
my_submission.to_csv('submission.csv', index=False)