### Loading data

In [2]:
import pandas as pd

# Load the two CSV splits; the paths are relative to the notebook's working directory.
train_data = pd.read_csv("titanic/train.csv")
test_data = pd.read_csv("titanic/test.csv")

In [3]:
train_data.describe()
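# The mean of Survived is ~0.38, i.e. about 38% of passengers survived. The classes are
# not heavily imbalanced, so accuracy is a reasonable evaluation metric.
# The mean age is ~30 (Age has only 714 non-null values out of 891, so some ages are missing).
# The mean row is an important summary to note for each numeric column.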

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
train_data["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [5]:
train_data["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [6]:
train_data["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [7]:
train_data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
### modifying data

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the requested columns of a DataFrame.

    Used as the first step of a pipeline so that the numerical and the
    categorical columns can each be routed to their own preprocessing steps.

    Parameters
    ----------
    attribute_names : list of str
        Column labels to keep; they are used directly to index the input.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned here; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column labels."""
        wanted = self.attribute_names
        return X[wanted]

In [11]:
# Pipeline for the numerical attributes:
# missing (null) values are replaced by the column median.

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# Step 1 picks the four numeric columns; step 2 imputes with the "median" strategy.
num_pipeline = Pipeline(steps=[
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", SimpleImputer(strategy="median")),
])

In [12]:
num_pipeline.fit_transform(train_data)

array([[22.    ,  1.    ,  0.    ,  7.25  ],
       [38.    ,  1.    ,  0.    , 71.2833],
       [26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [28.    ,  1.    ,  2.    , 23.45  ],
       [26.    ,  0.    ,  0.    , 30.    ],
       [32.    ,  0.    ,  0.    ,  7.75  ]])

In [13]:
# for string attributes

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing entries of each column with that column's most common value."""

    def fit(self, X, y=None):
        """Record, per column of ``X``, the first entry of its ``value_counts()``.

        The result is stored in ``most_frequent_`` as a Series indexed by the
        column labels of ``X``. Returns ``self``.
        """
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values filled from ``most_frequent_``."""
        filled = X.fillna(self.most_frequent_)
        return filled

In [14]:
# Instead of LabelBinarizer, we use one-hot encoding (OneHotEncoder).


from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
    

In [15]:
# Categorical branch: select the columns, fill gaps with each column's most
# frequent value, then one-hot encode with sparse output disabled.
# NOTE(review): `sparse` was renamed `sparse_output` in newer scikit-learn
# releases — confirm against the installed version before upgrading.
cat_pipeline = Pipeline(steps=[
    ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False)),
])

In [17]:
cat_pipeline.fit_transform(train_data)

array([[0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [18]:
# joining numerical and text pipeline

from sklearn.pipeline import FeatureUnion
# Feed the same input to both branches and combine their outputs, with the
# numeric branch listed first and the categorical branch second.
preprocess_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [19]:
X_train = preprocess_pipeline.fit_transform(train_data)

In [20]:
y_train = train_data["Survived"]

In [21]:
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

### SVC prediction

In [22]:
from sklearn.svm import SVC

# Baseline support-vector classifier fitted on the preprocessed training matrix.
# NOTE(review): the numeric columns (Age, SibSp, Parch, Fare) are only imputed, not
# scaled, before reaching the SVC — consider adding a scaler to num_pipeline and
# verify whether that improves the cross-validation score.
svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [23]:
# Apply the already-fitted preprocessing to the test set (transform only, no refit),
# then predict with the fitted SVC.
# NOTE(review): y_pred is not used by any later cell visible here; the submission is
# built from the random forest predictions instead.
X_test = preprocess_pipeline.transform(test_data)
y_pred = svm_clf.predict(X_test)

In [24]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the SVC on the training set; the cell displays the
# mean of the ten fold scores.
# NOTE(review): X_train was imputed/encoded on the full training set before the
# folds are split, so each fold's preprocessing has seen its validation rows.
# Wrap preprocessing and model in one Pipeline if a leak-free estimate is needed.
svm_scores = cross_val_score(svm_clf, X_train,y_train, cv=10)
svm_scores.mean()

0.7365250822835092

### Random Forest Classifier

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 estimators and a fixed random_state.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)
# Predictions on the preprocessed test set; these are what the submission cell writes out.
y_pred_rfc = forest_clf.predict(X_test)

In [27]:
# 10-fold cross-validation of the random forest, for comparison with the SVC's
# mean score; the cell displays the mean of the ten fold scores.
forest_scores = cross_val_score(forest_clf,X_train, y_train,cv=10)
forest_scores.mean()

0.8149526160481217

In [32]:
# Exporting to Kaggle.
# Two-column frame: the PassengerId column of the test set next to the random
# forest predictions, written to CSV without the DataFrame index.

submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": y_pred_rfc
})

submission.to_csv('rfc.csv', index=False)

# Couldn't get more than 80 on Kaggle (presumably 80% accuracy — TODO confirm).

In [None]:
# Compare the per-fold cross-validation accuracies of the two classifiers.
# matplotlib is imported here because no earlier cell visible in this notebook
# imports it; without this the cell fails with NameError on a fresh kernel.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
# Overlay the individual fold scores as dots at x=1 and x=2, which are the
# default positions of the two boxes drawn below. The number of dots is taken
# from the score arrays so the plot keeps working if `cv` is changed.
plt.plot([1] * len(svm_scores), svm_scores, ".")
plt.plot([2] * len(forest_scores), forest_scores, ".")
# NOTE(review): newer matplotlib releases rename `labels` to `tick_labels` —
# confirm against the installed version before upgrading.
plt.boxplot([svm_scores, forest_scores], labels=("SVM", "Random Forest"))
plt.ylabel("Accuracy", fontsize=14)
plt.show()

In [None]:
# Other things to try:
# - compare more models and hyperparameters using grid search
# - more feature engineering:
#   - replace SibSp and Parch with their sum
#     (train_data["sum"] = train_data["SibSp"] + train_data["Parch"])
#   - some names correlate with survival, e.g. "Countess"
#   - convert numerical attributes to categorical ones, e.g. age -> age bucket

# Age bucket: floor each age to the start of its 15-year band (0, 15, 30, ...),
# then look at the mean of Survived within each band.
train_data["AgeBucket"] = train_data["Age"] // 15 * 15
train_data[["AgeBucket","Survived"]].groupby(["AgeBucket"]).mean()

# Relatives on board: the sum of SibSp and Parch, again compared against the
# mean of Survived. The same column name must be used when creating, selecting
# and grouping; the earlier misspellings raised KeyError.
train_data["RelativesOnBoard"] = train_data["SibSp"] + train_data["Parch"]
train_data[["RelativesOnBoard", "Survived"]].groupby(["RelativesOnBoard"]).mean()

# Comparing the mean survival per group is one way to look at how a feature
# correlates with the target.