In [21]:
import pandas as pd
from pandas import DataFrame
from sklearn import linear_model, preprocessing, tree, model_selection

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train = train.drop(columns = ['Name','Ticket','Cabin'])

In [4]:
train.head(6)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S
5,6,0,3,male,,0,0,8.4583,Q


### Limpando os dados

In [5]:
train.isnull().sum() # precisamos preencher as colunas que possuem valores faltantes

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64

In [6]:
train['Fare'] = train.Fare.fillna(train.Fare.dropna().median())
train ['Age'] = train.Age.fillna(train.Age.dropna().median())

train.loc[train['Sex'] == 'male', 'Sex'] = 0
train.loc[train['Sex'] == 'female', 'Sex'] = 1

train['Embarked'] = train.Embarked.fillna('S')

train.loc[train['Embarked'] == 'S', 'Embarked'] = 0
train.loc[train['Embarked'] == 'C', 'Embarked'] = 1
train.loc[train['Embarked'] == 'Q', 'Embarked'] = 2

In [7]:
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

### Regressão Linear

In [8]:
target = train['Survived'].values

In [9]:
features = train[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']].values

In [10]:
classifier = linear_model.LogisticRegression()
classifier_ = classifier.fit(features, target)



In [11]:
print(classifier_.score(features, target))

0.7991021324354658


### Regressão polinomial

In [12]:
poly = preprocessing.PolynomialFeatures(degree = 2)
poly_features = poly.fit_transform(features)
classifier_ = classifier.fit(poly_features, target)
classifier_.score(poly_features,target)



0.8361391694725028

### Decision Tree

In [16]:
target = train.Survived.values
feature_names = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
features = train[feature_names].values

In [20]:
decision_tree = tree.DecisionTreeClassifier(random_state = 1)
decision_tree_ = decision_tree.fit(features, target)
decision_tree_.score(features,target)

0.9797979797979798

#### Usando o model selection

In [22]:
scores = model_selection.cross_val_score(decision_tree, features, target, scoring = 'accuracy', cv = 50)
scores

array([0.66666667, 0.61111111, 0.66666667, 0.88888889, 0.94444444,
       0.94444444, 0.72222222, 0.77777778, 0.72222222, 0.77777778,
       0.72222222, 0.61111111, 0.72222222, 0.77777778, 0.55555556,
       0.83333333, 1.        , 0.66666667, 0.77777778, 0.77777778,
       0.88888889, 0.77777778, 0.88888889, 0.72222222, 0.55555556,
       0.83333333, 0.94444444, 0.88888889, 0.66666667, 0.83333333,
       0.72222222, 0.66666667, 0.88888889, 0.94444444, 0.88888889,
       0.77777778, 0.72222222, 0.72222222, 0.72222222, 0.77777778,
       0.88888889, 0.83333333, 0.76470588, 0.88235294, 0.70588235,
       0.76470588, 0.82352941, 0.82352941, 0.88235294, 0.875     ])

In [23]:
scores.mean()

0.7848856209150326

### Tentando consertar o decision tree

In [24]:
generalized_tree = tree.DecisionTreeClassifier(random_state = 1, 
                                            max_depth = 7,
                                            min_samples_split = 2)
generalized_tree_ = generalized_tree.fit(features, target)
generalized_tree_.score(features,target)

0.8787878787878788

In [25]:
scores = model_selection.cross_val_score(decision_tree, features, target, scoring = 'accuracy', cv = 50)
scores

array([0.66666667, 0.61111111, 0.66666667, 0.88888889, 0.94444444,
       0.94444444, 0.72222222, 0.77777778, 0.72222222, 0.77777778,
       0.72222222, 0.61111111, 0.72222222, 0.77777778, 0.55555556,
       0.83333333, 1.        , 0.66666667, 0.77777778, 0.77777778,
       0.88888889, 0.77777778, 0.88888889, 0.72222222, 0.55555556,
       0.83333333, 0.94444444, 0.88888889, 0.66666667, 0.83333333,
       0.72222222, 0.66666667, 0.88888889, 0.94444444, 0.88888889,
       0.77777778, 0.72222222, 0.72222222, 0.72222222, 0.77777778,
       0.88888889, 0.83333333, 0.76470588, 0.88235294, 0.70588235,
       0.76470588, 0.82352941, 0.82352941, 0.88235294, 0.875     ])

In [26]:
scores.mean()

0.7848856209150326

In [28]:
tree.export_graphviz(generalized_tree_, feature_names = feature_names, out_file = 'tree.dot')