## Resolução do problema da plataforma Kaggle:
    https://www.kaggle.com/c/titanic
### Resolução do mesmo problema utilizando outras técnicas no GEAM:
    https://github.com/ciencia-de-dados-pratica/GEAM/blob/master/011/titanic.ipynb

## Importação de bibliotecas

In [35]:
import pandas as pd
from sklearn import tree
from sklearn import model_selection
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model
from sklearn import svm
from sklearn import neighbors

## Importação do arquivo de treino e visualização dos dados

In [36]:
# Load the Kaggle Titanic training set; train.csv is expected in the working directory.
data = pd.read_csv("train.csv")

In [37]:
# Preview the first rows to check which columns were parsed.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
# Pairwise correlation of the numeric columns only. The frame still holds text
# columns (Name, Sex, Ticket, Cabin, Embarked); recent pandas versions raise on
# them instead of silently dropping them, so they are excluded explicitly.
data.select_dtypes(include="number").corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [39]:
# (rows, columns) of the raw training frame.
data.shape

(891, 12)

## Pré-processamento

In [40]:
# Columns kept for modelling: four input features plus the target 'Survived'.
columns = [
    'Pclass',
    'Sex',
    'Age',
    'Embarked',
    'Survived',
]

In [41]:
# Count the missing values in each column.
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [42]:
# Restrict the frame to the selected feature and target columns.
data = data.loc[:, columns]

In [43]:
# Impute missing values before encoding.
# - Numeric columns (Age) get the column mean. numeric_only=True is needed
#   because the frame still contains text columns (Sex, Embarked), which recent
#   pandas versions refuse to average.
# - Embarked is categorical, so it is filled with its most frequent value.
#   Left missing, get_dummies would encode those rows as all zeros, which is
#   indistinguishable from the dropped baseline category.
data = data.fillna(data.mean(numeric_only=True))
data = data.fillna({'Embarked': data['Embarked'].mode().iloc[0]})

In [44]:
# One-hot encode the two text columns; the first level of each is dropped so
# the remaining indicator columns are not redundant.
data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)

In [45]:
# Preview the frame after imputation and one-hot encoding.
data.head()

Unnamed: 0,Pclass,Age,Survived,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,0,1,0,1
1,1,38.0,1,0,0,0
2,3,26.0,1,0,0,1
3,1,35.0,1,0,0,1
4,3,35.0,0,1,0,1


## Criação dos dados de treino e teste para avaliação dos modelos

In [46]:
# Separate the target vector from the feature matrix.
y = data['Survived']
X = data.drop(columns=['Survived'])

In [47]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

## Treino e teste de diferentes modelos

## DecisionTreeClassifier

In [48]:
# Decision tree baseline evaluated on the held-out split.
# random_state is pinned (same seed as the train/test split) because the tree
# breaks ties between equally good splits at random, which made the reported
# scores vary from run to run.
clf = tree.DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)
resposta_teste = clf.predict(X_test)
print(metrics.classification_report(y_test, resposta_teste))
accuracy = metrics.accuracy_score(y_test, resposta_teste)
print(accuracy)

             precision    recall  f1-score   support

          0       0.75      0.84      0.80       153
          1       0.75      0.63      0.69       115

avg / total       0.75      0.75      0.75       268

0.753731343283582


## GradientBoostingClassifier

In [49]:
# Gradient boosting evaluated on the held-out split.
# random_state is pinned (same seed as the train/test split) so the fitted
# ensemble, and therefore the reported scores, are reproducible.
modeloGBC = GradientBoostingClassifier(random_state=1)
modeloGBC.fit(X_train, y_train)
resposta_gbc = modeloGBC.predict(X_test)
print(metrics.classification_report(y_test, resposta_gbc))
accuracy_gbc = metrics.accuracy_score(y_test, resposta_gbc)
print('accuracy: ')
print(accuracy_gbc)

             precision    recall  f1-score   support

          0       0.74      0.89      0.81       153
          1       0.80      0.59      0.68       115

avg / total       0.77      0.76      0.75       268

accuracy: 
0.7611940298507462


## LogisticRegression

In [50]:
# Logistic regression with default settings, scored on the held-out split.
lr = linear_model.LogisticRegression().fit(X_train, y_train)
resposta_lr = lr.predict(X_test)
accuracy_lr = metrics.accuracy_score(y_test, resposta_lr)
print(metrics.classification_report(y_test, resposta_lr))
# Label and value go on separate lines, as two consecutive prints would.
print('accuracy: ', accuracy_lr, sep='\n')

             precision    recall  f1-score   support

          0       0.76      0.86      0.81       153
          1       0.77      0.64      0.70       115

avg / total       0.77      0.76      0.76       268

accuracy: 
0.7649253731343284


## KNN

In [51]:
# k-nearest neighbours with default settings, scored on the held-out split.
knn = neighbors.KNeighborsClassifier()
resposta_knn = knn.fit(X_train, y_train).predict(X_test)
accuracy_knn = metrics.accuracy_score(y_test, resposta_knn)
print(metrics.classification_report(y_test, resposta_knn))
# Label and value go on separate lines, as two consecutive prints would.
print('accuracy: ', accuracy_knn, sep='\n')

             precision    recall  f1-score   support

          0       0.76      0.87      0.81       153
          1       0.78      0.63      0.70       115

avg / total       0.77      0.76      0.76       268

accuracy: 
0.7649253731343284


## Criação do modelo definitivo

In [52]:
# Load the Kaggle test set (no 'Survived' column); test.csv is expected in the working directory.
test = pd.read_csv('test.csv')

In [53]:
# Preview the first two rows of the test set.
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [54]:
# Keep the passenger ids aside: they are needed for the submission file but are
# not a model feature.
passengerId = test.loc[:, 'PassengerId']

In [55]:
# Same input features as in training; the test file has no target column.
columns = [
    'Pclass',
    'Sex',
    'Age',
    'Embarked',
]

In [56]:
# Keep only the model's input columns. The explicit copy makes the result an
# independent frame, so later modifications of `test` never act on a slice of
# the originally loaded frame (which triggers SettingWithCopyWarning).
test = test[columns].copy()

In [57]:
# Fill missing numeric values (Age) with the column mean.
# numeric_only=True skips the text columns (Sex, Embarked), which recent pandas
# versions refuse to average; rebinding the result avoids mutating a column
# subset in place.
test = test.fillna(test.mean(numeric_only=True))

In [58]:
# One-hot encode like the training data, then align to the training feature
# columns (X): same names, same order, a zero-filled column for any indicator
# absent from the test file, and no columns the model was not trained on.
test = pd.get_dummies(test, drop_first=True).reindex(columns=X.columns, fill_value=0)

In [59]:
# Preview the preprocessed test features.
test.head()

Unnamed: 0,Pclass,Age,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,1,1,0
1,3,47.0,0,0,1
2,2,62.0,1,1,0
3,3,27.0,1,0,1
4,3,22.0,0,0,1


In [60]:
# Final model: refit gradient boosting on every labelled row (X, y) and predict
# the Kaggle test set with that refitted model.
# The prediction must come from modelo_definitivo, not from modeloGBC, which was
# fitted on the training split only.
# random_state is pinned so the submission is reproducible.
modelo_definitivo = GradientBoostingClassifier(random_state=1)
modelo_definitivo.fit(X, y)
# Pass the DataFrame rather than .values so the feature names seen at fit time
# are available at predict time as well.
resposta_definitiva = modelo_definitivo.predict(test)

In [61]:
# Assemble the submission table: one row per test passenger with its prediction.
answer = pd.DataFrame({
    'PassengerId': passengerId,
    'Survived': resposta_definitiva,
})

In [62]:
# Write the submission file to the working directory, without the index column.
answer.to_csv("answer.csv", index=False)