# Titanic: Machine Learning from Disaster.
## The Importance of the Family

### Import Libraries

In [None]:
import pandas as pd
import numpy as np

### 1. Data Collection(data gathered).

In [None]:
# Load the Kaggle train/test splits. The test split gets a NaN 'Survived'
# placeholder column so both frames share the same columns.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv').assign(Survived=np.nan)

In [None]:
# Stack train and test into one frame so feature engineering is applied to
# both at once. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported equivalent (the index is renumbered 0..n-1 as before).
data = pd.concat([train, test], ignore_index=True)

### 2. Data analysis 

#### 2.1. Data info.

In [None]:
# List the column names of the combined train+test frame.
print(data.columns.values)

In [None]:
# Per-column dtype and non-null count summary.
data.info()

#### 2.2. Missing Data


In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
data.isnull().mean()

#### 2.3. Features analysis

2.1 PassengerId. Not used.

2.2 Survived. Target Feature.

2.3 Fare. New Fare_bin feature created after Fare-Survived relationship analysis.

In [None]:
# Fill the missing fare (PassengerId 1044, per the original note) with 8.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and does not
# update `data` when pandas copy-on-write is enabled.
data['Fare'] = data['Fare'].fillna(8)
# Quartile-based fare buckets.
data['Fare_bin'] = pd.qcut(
    data['Fare'], 4,
    labels=('Fare_bin1', 'Fare_bin2', 'Fare_bin3', 'Fare_bin4'),
)

2.4 Name. New Title feature created after Name-Survived relationship analysis.

In [None]:
# The title is the text between ", " and the first "." in Name.
data['Title'] = data['Name'].str.split(', ').str[1].str.split('.').str[0]

# Collapse the titles into four bins. Titles not listed below keep their
# original value.
title_groups = {
    'Title_bin1': ['Capt', 'Don', 'Jonkheer', 'Rev', 'Mr'],
    'Title_bin2': ['Dr', 'Col', 'Major', 'Master'],
    'Title_bin3': ['Miss'],
    'Title_bin4': ['Mrs', 'Mme', 'Sir', 'Ms', 'Lady', 'Mlle', 'the Countess', 'Dona'],
}
title_to_bin = {
    title: bin_name
    for bin_name, titles in title_groups.items()
    for title in titles
}
data['Title_bin'] = data['Title'].replace(title_to_bin)

2.5 Sex. Feature ok. 

2.6 Age. Set missing values and define New Age bins feature.

In [None]:
def Cal_age(cols):
    """Return the age, substituting a class-based default when it is missing.

    cols: a row-like sequence (list, tuple or pandas Series) whose first two
        values are, in order, Age and Pclass.

    Returns the original Age when it is not null; otherwise 37 for Pclass 1,
    29 for Pclass 2 and 24 for anything else.
    """
    # Unpack by position. Integer-key lookup (cols[0]) on a label-indexed
    # Series relies on a deprecated positional fallback in pandas.
    Age, Pclass, *_ = cols

    if not pd.isnull(Age):
        return Age

    if Pclass == 1:
        return 37
    if Pclass == 2:
        return 29
    return 24

In [None]:
# Impute missing ages from the passenger class. Cal_age expects (Age, Pclass);
# passing 'Title' as the second column made every comparison with 1/2 fail,
# so every missing age fell through to the default of 24.
data['Age'] = data[['Age', 'Pclass']].apply(Cal_age, axis=1)
# Five equal-width buckets over the integer-truncated ages.
data['Age_bin'] = pd.cut(
    data['Age'].astype(int), 5,
    labels=('Age_bin1', 'Age_bin2', 'Age_bin3', 'Age_bin4', 'Age_bin5'),
)

2.7 SibSp and Parch. To be dropped. Used to create the new Family_type feature.

In [None]:
def Cal_Family_bin(cols):
    """Classify a passenger by the number of relatives aboard.

    cols: a row-like sequence (list, tuple or pandas Series) whose first two
        values are, in order, SibSp and Parch.

    Returns 'Alone' when the sum is 0, 'Family' for 1 to 3 and 'Big_family'
    for 4 or more. Any other sum (negative, NaN, fractional below 1) yields
    None, as before.
    """
    # Unpack by position. Integer-key lookup (cols[0]) on a label-indexed
    # Series relies on a deprecated positional fallback in pandas.
    sibsp, parch, *_ = cols
    family_size = sibsp + parch

    if family_size == 0:
        return 'Alone'
    if 1 <= family_size <= 3:
        return 'Family'
    if family_size >= 4:
        return 'Big_family'
    # Explicit rather than falling off the end of the function.
    return None

In [None]:
# Row-wise family-size category derived from SibSp + Parch.
data['Family_type'] = data[['SibSp','Parch']].apply(Cal_Family_bin,axis=1)

2.8 Embarked. Set missing values.

In [None]:
# Missing embarkation ports are set to 'S'.
data['Embarked'] = data['Embarked'].fillna('S')

2.9 Ticket and Cabin. To be dropped. Used to create new Passenger relationship Feature.



Three new features created.

Family_wit_FC_dead = Family member with a dead female/child in the family.

Family_wit_M_alive = Passenger shares a ticket or cabin with an adult male (Age > 14) who survived.

Family_witout_Women = No adult woman (female, Age > 16) travels on the passenger's ticket: a male or child alone, or a family without women.

In [None]:
# Family relationships are inferred from the Ticket and Cabin features.
# (Original note: family name plus Fare gives the same results as Ticket.)
# NOTE(review): the rows selected here include the passenger's own row, so a
# labelled passenger can match on their own ticket - confirm this is intended.
# NOTE(review): a male aged exactly 14 satisfies neither `Age < 14` nor
# `Age > 14` - confirm the boundary.

# Females or children (Age < 14) who did not survive.
is_female_or_child = (data['Sex'] == 'female') | (data['Age'] < 14)
fc_dead_rows = data[is_female_or_child & (data['Survived'] == 0)]
list1 = fc_dead_rows['Ticket'].tolist()
list2 = fc_dead_rows['Cabin'].tolist()

# Males older than 14 who survived.
is_adult_male = (data['Sex'] == 'male') & (data['Age'] > 14)
male_alive_rows = data[is_adult_male & (data['Survived'] == 1)]
list3 = male_alive_rows['Ticket'].tolist()
list4 = male_alive_rows['Cabin'].tolist()

In [None]:
def FC_dead(row):
    """Return 1 if the row's Ticket is in list1 or its non-missing Cabin is in list2, else 0."""
    if row['Ticket'] in list1:
        return 1
    cabin = row['Cabin']
    # A missing cabin (rendered as 'nan') never counts as a match.
    if cabin in list2 and str(cabin) != 'nan':
        return 1
    return 0

In [None]:
def M_Alive(row):
    """Return 1 if the row's Ticket is in list3 or its non-missing Cabin is in list4, else 0."""
    matched = row['Ticket'] in list3 or (
        # A missing cabin (rendered as 'nan') never counts as a match.
        str(row['Cabin']) != 'nan' and row['Cabin'] in list4
    )
    return 1 if matched else 0

In [None]:
def F_No_Woman(row):
    """Return 1 if no female older than 16 in `data` shares the row's Ticket, else 0.

    Filters the whole `data` frame on every call.
    """
    same_ticket = data['Ticket'] == row['Ticket']
    adult_woman = (data['Sex'] == 'female') & (data['Age'] > 16)
    women_on_ticket = data.loc[same_ticket & adult_woman, 'Sex'].count()
    return 1 if women_on_ticket == 0 else 0

In [None]:
# Build the three family-relationship flags, one row-wise pass per flag.
for column_name, row_func in (
    ('Family_wit_FC_dead', FC_dead),
    ('Family_wit_M_alive', M_Alive),
    ('Family_witout_Women', F_No_Woman),
):
    data[column_name] = data.apply(row_func, axis=1)

### 3. Converting Categorical Features

In [None]:
# One-hot encode the categorical features. Binned features already carry
# distinctive labels; Pclass, Sex and Embarked get a column-name prefix.
Fare_bin = pd.get_dummies(data['Fare_bin'])
Pclass_bin = pd.get_dummies(data['Pclass'],prefix ='Class')
Title_bin = pd.get_dummies(data['Title_bin'])
# drop_first leaves a single indicator column for the two-valued Sex feature.
Sex_bin = pd.get_dummies(data['Sex'],drop_first=True,prefix ='Sex')
Age_bin = pd.get_dummies(data['Age_bin'])
Family_type = pd.get_dummies(data['Family_type'])
Embarked_bin = pd.get_dummies(data['Embarked'],prefix ='Embarked')
# The 0/1 relationship flags only need a compact integer dtype.
Family_wit_FC_dead=data['Family_wit_FC_dead'].astype(np.uint8)
Family_wit_M_alive=data['Family_wit_M_alive'].astype(np.uint8)
Family_witout_Women=data['Family_witout_Women'].astype(np.uint8)

In [None]:
# Target column first, followed by every encoded feature block, joined
# column-wise on the shared row index.
feature_frames = [
    data['Survived'],
    Fare_bin,
    Pclass_bin,
    Title_bin,
    Sex_bin,
    Age_bin,
    Family_type,
    Embarked_bin,
    Family_wit_FC_dead,
    Family_wit_M_alive,
    Family_witout_Women,
]
data_cleaned = pd.concat(feature_frames, axis=1)

In [None]:
# Rows with a known 'Survived' value are the training rows; rows where it is
# NaN are the test rows.
has_label = data['Survived'].notnull()
train_cleaned = data_cleaned[has_label]
test_cleaned = data_cleaned[~has_label]

In [None]:
# Remove the all-NaN target column from the test features. Rebinding the name
# avoids an in-place drop on a frame obtained by boolean indexing, which
# triggers pandas' SettingWithCopyWarning.
test_cleaned = test_cleaned.drop('Survived', axis=1)
# Passenger ids are kept for the submission file.
PassId = test['PassengerId']

### 4. Building the model

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

In [None]:
# Hold out 30% of the labelled rows for validation; the seed is fixed so the
# split is repeatable.
features = train_cleaned.drop('Survived', axis=1)
target = train_cleaned['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=64
)

## Logistic Regression

In [None]:
# Logistic regression with the lbfgs solver, fitted on the training split.
logmodel = LogisticRegression(solver='lbfgs')
logmodel.fit(X_train,y_train)

In [None]:
# Predictions on the held-out validation split.
log_predictions = logmodel.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_test,log_predictions))

In [None]:
# Validation accuracy as a percentage with two decimals. The arguments are
# passed as (predictions, truth); plain accuracy is symmetric in the two, so
# the value is unaffected.
acc_logreg = round(accuracy_score(log_predictions, y_test) * 100, 2)
print(acc_logreg)

In [None]:
##Log_predictions_test = logmodel.predict(test_cleaned).astype(np.uint8) ##-->0.81339

In [None]:
##output_log = pd.DataFrame({ 'PassengerId' : PassId, 'Survived': Log_predictions_test })
##output_log.to_csv('submission-log.csv', index=False)

## Decision Trees

In [None]:
# Decision tree with default hyper-parameters, fitted on the training split.
# NOTE(review): no random_state is passed, so results may differ between runs.
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

In [None]:
# Predictions on the held-out validation split.
dtree_predictions = dtree.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the decision tree.
print(classification_report(y_test,dtree_predictions))

In [None]:
# Confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_test,dtree_predictions))

In [None]:
# Validation accuracy of the decision tree, as a percentage. It is stored
# under its own name so it no longer overwrites the logistic-regression score
# held in `acc_logreg`.
acc_dtree = round(accuracy_score(y_test, dtree_predictions) * 100, 2)
print(acc_dtree)

In [None]:
##dtree_predictions_test = dtree.predict(test_cleaned).astype(np.uint8) -->0.80861

In [None]:
##output_dtree = pd.DataFrame({ 'PassengerId' : PassId, 'Survived': dtree_predictions_test })
##output_dtree.to_csv('submission-dtree.csv', index=False)

## Random Forests

In [None]:
# Random forest of 100 trees, fitted on the training split.
# NOTE(review): no random_state is passed, so results may differ between runs.
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

In [None]:
# Predictions on the held-out validation split.
rfc_predictions = rfc.predict(X_test)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_test,rfc_predictions))

In [None]:
# Per-class precision / recall / F1 for the random forest.
print(classification_report(y_test,rfc_predictions))

In [None]:
# Validation accuracy of the random forest, as a percentage. It is stored
# under its own name so it no longer overwrites the logistic-regression score
# held in `acc_logreg`.
acc_rfc = round(accuracy_score(y_test, rfc_predictions) * 100, 2)
print(acc_rfc)

In [None]:
##rfc_predictions_test = rfc.predict(test_cleaned).astype(np.uint8) -->0.80382

In [None]:
##output_rfc = pd.DataFrame({ 'PassengerId' : PassId, 'Survived': rfc_predictions_test })
##output_rfc.to_csv('submission-rfc.csv', index=False)

## Train the Support Vector Classifier

In [None]:
# Baseline support vector classifier with default hyper-parameters.
model = SVC()
model.fit(X_train,y_train)

In [None]:
# Baseline SVC predictions on the held-out validation split.
SVC_predictions = model.predict(X_test)

In [None]:
# Confusion matrix and per-class metrics for the baseline SVC.
print(confusion_matrix(y_test,SVC_predictions))
print(classification_report(y_test,SVC_predictions))

### Gridsearch

In [None]:
# Search 5 values of C x 5 values of gamma with the RBF kernel.
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 
# refit=True retrains the best parameter combination after the search, so
# `grid` can be used directly for prediction; verbose=3 prints progress.
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [None]:
# May take a while: one SVC fit per parameter combination and CV fold.
grid.fit(X_train,y_train)

In [None]:
# Parameter combination with the best cross-validated score.
grid.best_params_

In [None]:
# The SVC refitted with the best parameters.
grid.best_estimator_

In [None]:
# Predictions of the tuned SVC on the held-out validation split.
grid_predictions = grid.predict(X_test)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class) for the tuned SVC.
print(confusion_matrix(y_test,grid_predictions))

In [None]:
# Per-class precision / recall / F1 for the tuned SVC.
print(classification_report(y_test,grid_predictions))

In [None]:
# Validation accuracy of the grid-searched SVC, as a percentage. This is the
# model used for the submission below. The original scored the untuned
# `SVC_predictions` here and stored the result in `acc_logreg`.
acc_svc = round(accuracy_score(y_test, grid_predictions) * 100, 2)
print(acc_svc)

In [None]:
# Predict the unlabelled test rows with the tuned SVC and cast the labels to
# unsigned 8-bit integers for the submission file.
SVC_predictions_test = grid.predict(test_cleaned).astype(np.uint8)

In [None]:
# Two-column submission frame (PassengerId, Survived), written to CSV without
# the index column.
output_SVC = pd.DataFrame({ 'PassengerId' : PassId, 'Survived': SVC_predictions_test })
output_SVC.to_csv('submission-SVC2.csv', index=False)