In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

In [3]:
df_train.shape

(891, 12)

In [4]:
df_test.shape

(418, 11)

In [5]:
#You may want to check if there are features you can supply a dummy value instead of dropping
df_train = df_train.dropna()
#seperate the X (features) and Y (prediction)
x_feats = list(df_train.columns)
#Dropping all categorical variables
#this is just a dummy method, you don't have to drop so many/any columns. 
#Infact you may want to use some of these columns to engineer other features
for f in ['Survived', 'Name', 'Ticket', 'Sex', 'Cabin', 'Embarked']:
  x_feats.remove(f)

x_train = df_train[x_feats]
y_train = df_train['Survived']

In [6]:
x_train.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
1,2,1,38.0,1,0,71.2833
3,4,1,35.0,1,0,53.1
6,7,1,54.0,0,0,51.8625
10,11,3,4.0,1,1,16.7
11,12,1,58.0,0,0,26.55


In [7]:
y_train

1      1
3      1
6      0
10     1
11     1
      ..
871    1
872    0
879    1
887    1
889    1
Name: Survived, Length: 183, dtype: int64

In [8]:
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
accuracy_score(clf.predict(x_train), y_train) #overfitting

1.0

In [9]:
#use validation
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(x_train,y_train)

In [10]:
clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)
accuracy_score(clf.predict(X_train), Y_train) #overfitting

1.0

In [11]:
accuracy_score(clf.predict(X_val), Y_val) #this is more representative of your kaggle score

0.6304347826086957

In [12]:
#IMP: YOU DONT HAVE TO DO THIS. You can try other more more useful replacement for na. But don't drop na
df_test = df_test.fillna(0)
df_test = df_test[x_feats]
clf.predict(df_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [13]:
df_test.shape

(418, 6)

In [14]:
submission = pd.DataFrame(index=df_test.PassengerId)
submission['Survived'] = clf.predict(df_test)

In [15]:
submission.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


In [17]:
submission.reset_index().to_csv('submission.csv', index=False)