In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [4]:
df = pd.read_csv("../data/titanic.csv")
predictors = ['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
label = 'Survived'
cabin_fillna = 'NA'
df.Cabin = df.Cabin.fillna(cabin_fillna)

df_train, df_test, y_train, y_test = train_test_split(df[predictors], df[label], test_size=0.20, random_state=42)

In [6]:
age_fillna = df_train.Age.mean()
embarked_fillna = df_train.Embarked.value_counts().index[0]


df_train.Age = df_train.Age.fillna(df.Age.mean())
df_train.Embarked = df_train.Embarked.fillna(embarked_fillna)

df_test.Age = df_test.Age.fillna(df.Age.mean())
df_test.Embarked = df_test.Embarked.fillna(embarked_fillna)

In [7]:
le = dict()
for column in df_train.columns:
    if df_train[column].dtype == np.object:
        le[column] = LabelEncoder()
        df_train[column] = le[column].fit_transform(df_train[column])
        
for column in df_test.columns:
    if df_test[column].dtype == np.object:
        df_test[column] = le[column].transform(df_test[column])

In [10]:
for col in df_train.columns:
    print(col, df_train[col].unique())

Pclass [1 2 3]
Sex [1 0]
Age [ 45.5         23.          32.          26.           6.          24.          45.
  29.          29.69911765  42.          36.          33.          17.          50.
  35.          38.          34.          11.          61.          30.           7.
  63.          20.          27.           2.          25.          51.          18.
  22.           0.92        21.          62.          57.          19.
  34.5         28.          52.          36.5         40.          65.           4.
   1.          43.          31.          39.          49.           3.
  40.5         54.          16.          47.          60.          44.           8.
  15.          41.           9.          37.          46.          56.          59.
  58.          28.5          0.75        55.          14.          12.
  70.5         70.          48.          80.          55.5         14.5
  10.          53.          32.5         74.          64.           5.
  24.5          0.42       

In [5]:
model = RandomForestClassifier(n_estimators=25, random_state=42)
model.fit(X=df_train, y=y_train)
# y_pred = model.predict(X=df_test)
# print(confusion_matrix(y_test, y_pred))
# print(f1_score(y_test, y_pred))
# from sklearn.externals import joblib
# joblib.dump(model, './model/model.pkl') 

['./model/model.pkl']

In [9]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
709,3,1,29.699118,1,1,15.2458,0
439,2,1,31.0,0,0,10.5,2
840,3,1,20.0,0,0,7.925,2
720,2,0,6.0,0,1,33.0,2
39,3,0,14.0,1,0,11.2417,0


In [29]:
predict_request = [3, 0, 6.0, 0, 1, 33.0, 2]
predict_request = np.array(predict_request).reshape(1, -1)
model.predict_proba(predict_request)[0][1]

0.47999999999999998

In [10]:
y_test.head()

709    1
439    0
840    0
720    1
39     1
Name: Survived, dtype: int64

In [19]:
y_pred_prob = model.predict_proba(df_test)
y_pred_prob

array([[ 0.56      ,  0.44      ],
       [ 0.96      ,  0.04      ],
       [ 0.91333333,  0.08666667],
       [ 0.        ,  1.        ],
       [ 0.64      ,  0.36      ],
       [ 0.08      ,  0.92      ],
       [ 0.17735498,  0.82264502],
       [ 0.93333333,  0.06666667],
       [ 0.24      ,  0.76      ],
       [ 0.04      ,  0.96      ],
       [ 0.72      ,  0.28      ],
       [ 0.92      ,  0.08      ],
       [ 0.96      ,  0.04      ],
       [ 1.        ,  0.        ],
       [ 0.82866667,  0.17133333],
       [ 0.        ,  1.        ],
       [ 0.88      ,  0.12      ],
       [ 0.05142857,  0.94857143],
       [ 0.92      ,  0.08      ],
       [ 0.88      ,  0.12      ],
       [ 1.        ,  0.        ],
       [ 0.57333333,  0.42666667],
       [ 0.76      ,  0.24      ],
       [ 1.        ,  0.        ],
       [ 1.        ,  0.        ],
       [ 0.96      ,  0.04      ],
       [ 0.76      ,  0.24      ],
       [ 0.952     ,  0.048     ],
       [ 0.88      ,

In [15]:
y_pred = model.predict(df_test)
y_pred[:10]

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1], dtype=int64)