In [115]:
import os
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

base_path = './'
train_filepath = base_path + 'train.csv'
test_filepath = base_path + 'test.csv'

if os.path.exists(train_filepath):
    print('Loading data....')
    train_data = pd.read_csv(train_filepath)
    test_data = pd.read_csv(test_filepath)
    print("\n\nShow head data")
    print(train_data.head())
    print("\n\nDescribe data")
    print(train_data.describe())
    
    print("Nulls in training set")
    cols_with_missing = ['{} {} nullos de {}'.format(col,train_data[col].isnull().sum(),train_data[col].isnull().count())
                         for col in train_data.columns
                         if train_data[col].isnull().any()]
    print("\n\n")
    print(cols_with_missing)
    
    print("Nulls in submission set")
    cols_with_missing = ['{} {} nullos de {}'.format(col,test_data[col].isnull().sum(),test_data[col].isnull().count())
                         for col in test_data.columns
                         if test_data[col].isnull().any()]
    print("\n\n")
    print(cols_with_missing)
    
    
    print('\n\nSetup and Data Loaded Complete')
else:
    print('Files not exists')

Loading data....


Show head data
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            3

## Column Selection
I delete cabin column because has many null values

In [117]:
X_data = train_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
y_data = train_data['Survived']
X_send_data = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
X_passenger_id = test_data['PassengerId'].copy()

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_data,y_data,test_size=1/5,random_state=0)

print('\nX data')
print(X_data[:10])


print('\n\nY data')
print(y_data[:10])

print('\nFinished loaded...')


X data
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0       3    male  22.0      1      0   7.2500        S
1       1  female  38.0      1      0  71.2833        C
2       3  female  26.0      0      0   7.9250        S
3       1  female  35.0      1      0  53.1000        S
4       3    male  35.0      0      0   8.0500        S
5       3    male   NaN      0      0   8.4583        Q
6       1    male  54.0      0      0  51.8625        S
7       3    male   2.0      3      1  21.0750        S
8       3  female  27.0      0      2  11.1333        S
9       2  female  14.0      1      0  30.0708        C


Y data
0    0
1    1
2    1
3    1
4    0
5    0
6    0
7    0
8    1
9    1
Name: Survived, dtype: int64

Finished loaded...


## Column Nomalization
 I fill age with average

In [118]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
imputer = imputer.fit(X_train['Age'].values.reshape(-1, 1))

X_train_norm = X_train.copy()
X_test_norm = X_test.copy()
X_send = X_send_data.copy()

X_train_norm.loc[:,'Age'] = imputer.transform(X_train['Age'].values.reshape(-1, 1))
X_test_norm.loc[:,'Age'] = imputer.transform(X_test['Age'].values.reshape(-1, 1))
X_send.loc[:,'Age'] = imputer.transform(X_send['Age'].values.reshape(-1, 1))

I delete 2 rows of embarked with null values


In [119]:
mask = ~pd.isnull(X_train).any(axis=1)
X_train_norm = X_train[mask]
y_train = y_train[mask]

mask = ~pd.isnull(X_test).any(axis=1)
X_test_norm = X_test[mask]
y_test = y_test[mask] 

I fill with the mean the Fare row in submission data

In [121]:
from sklearn.impute import SimpleImputer

imputerFare = SimpleImputer(missing_values=np.nan,strategy='mean')
imputerFare = imputerFare.fit(X_train['Fare'].values.reshape(-1, 1))

X_send.loc[:,'Fare'] = imputerFare.transform(X_send['Fare'].values.reshape(-1, 1))

I applied one hot encoder to sex and embarked

In [122]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
object_cols = ['Sex','Embarked']
columns = ['']
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train_norm[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test_norm[object_cols]))
OH_cols_send = pd.DataFrame(OH_encoder.transform(X_send[object_cols]))


# One-hot encoding removed index; put it back
OH_cols_train.index = X_train_norm.index
OH_cols_test.index = X_test_norm.index
OH_cols_send.index = X_send.index


columns = []
column = ''
columns_to_drop = []
for encoder in OH_encoder.categories_:
    for column in encoder:
        columns.append('is_{}'.format(column))
    columns_to_drop.append('is_{}'.format(column))
    
OH_cols_train.columns = columns
OH_cols_test.columns = columns
OH_cols_send.columns = columns

OH_cols_train = OH_cols_train.drop(columns_to_drop,axis=1)
OH_cols_test =OH_cols_test.drop(columns_to_drop,axis=1)
OH_cols_send =OH_cols_send.drop(columns_to_drop,axis=1)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train_norm = X_train_norm.drop(object_cols, axis=1)
num_X_test_norm = X_test_norm.drop(object_cols, axis=1)
num_X_send = X_send.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train_norm, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test_norm, OH_cols_test], axis=1)
OH_X_send = pd.concat([num_X_send, OH_cols_send], axis=1)

print(OH_X_train)


     Pclass   Age  SibSp  Parch      Fare  is_female  is_C  is_Q
439       2  31.0      0      0   10.5000        0.0   0.0   0.0
817       2  31.0      1      1   37.0042        0.0   1.0   0.0
378       3  20.0      0      0    4.0125        0.0   1.0   0.0
491       3  21.0      0      0    7.2500        0.0   0.0   0.0
331       1  45.5      0      0   28.5000        0.0   0.0   0.0
..      ...   ...    ...    ...       ...        ...   ...   ...
763       1  36.0      1      2  120.0000        1.0   0.0   0.0
835       1  39.0      1      1   83.1583        1.0   1.0   0.0
192       3  19.0      1      0    7.8542        1.0   0.0   0.0
559       3  36.0      1      0   17.4000        1.0   0.0   0.0
684       2  60.0      1      1   39.0000        0.0   0.0   0.0

[569 rows x 8 columns]


In [123]:
#Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

X_trainf = sc_X.fit_transform(OH_X_train)
X_testf = sc_X.transform(OH_X_test)
X_sendf = sc_X.transform(OH_X_send)


print(X_trainf[:10])

print('Finished preprocessing')

[[-0.29244176  0.09104097 -0.54710127 -0.50182111 -0.43987676 -0.73054276
  -0.45895348 -0.20524264]
 [-0.29244176  0.09104097  0.55290093  0.65419594  0.05799956 -0.73054276
   2.17887006 -0.20524264]
 [ 0.90467594 -0.66419484 -0.54710127 -0.50182111 -0.56174321 -0.73054276
   2.17887006 -0.20524264]
 [ 0.90467594 -0.59553704 -0.54710127 -0.50182111 -0.50092739 -0.73054276
  -0.45895348 -0.20524264]
 [-1.48955945  1.08657907 -0.54710127 -0.50182111 -0.10175021 -0.73054276
  -0.45895348 -0.20524264]
 [ 0.90467594 -0.52687924 -0.54710127 -0.50182111 -0.48589954 -0.73054276
  -0.45895348 -0.20524264]
 [ 0.90467594 -0.25224803 -0.54710127 -0.50182111 -0.49106536 -0.73054276
  -0.45895348 -0.20524264]
 [-0.29244176 -0.32090584  0.55290093  0.65419594 -0.073573    1.36884527
  -0.45895348 -0.20524264]
 [ 0.90467594 -0.59553704  0.55290093 -0.50182111 -0.45255651  1.36884527
  -0.45895348 -0.20524264]
 [-1.48955945  0.09104097  0.55290093 -0.50182111  0.33969279 -0.73054276
  -0.45895348 -0.

## Model

### Linear SVC
Selecting best model Linear SVC

In [146]:
#Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [
    {'penalty':['l1'],'dual':[False],'C':[5.5],'max_iter':[213],'tol':np.arange(0.0001, 0.001, 0.0001).tolist()}
]
grid_search = GridSearchCV(estimator=LinearSVC(),
                         param_grid=parameters,
                         scoring='accuracy',
                         cv=100,
                         n_jobs=-1)
grid_search = grid_search.fit(X_trainf,y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print("Best accuracy {}%".format(best_accuracy*100))
print("Best pameters \n {}".format(best_parameters))

Best accuracy 78.96666666666667%
Best pameters 
 {'C': 5.5, 'dual': False, 'max_iter': 213, 'penalty': 'l1', 'tol': 0.0007000000000000001}


#### Best model Linear SVC

Position 9846

Score 0.77511

Precision 78.96%

Variance 15.81%

TRAIN

True positives 292 -
False positives 48 -
False negative 69 -
True negative 160 -


TEST

True positives 69 -
False positives 15 -
False negative 13 -
True negative 46 -

In [150]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

best_model = LinearSVC(max_iter=213,dual=False, tol=0.0007, C=5.5,penalty='l1')
best_model.fit(X_trainf, y_train)

LinearSVC(C=5.5, dual=False, max_iter=213, penalty='l1', tol=0.0007)

## Test

In [148]:
from sklearn.metrics import confusion_matrix

y_predict = clf.predict(OH_X_train)
cm = confusion_matrix(y_train,y_predict)
print("TRAIN\nTrue positives {} \nFalse positives {} \nFalse negative {} \nTrue negative {}\n\n".format(cm[0][0],cm[0][1],cm[1][0],cm[1][1]))

y_predict_test = clf.predict(OH_X_test)
cm = confusion_matrix(y_test,y_predict_test)
print("TEST\nTrue positives {} \nFalse positives {} \nFalse negative {} \nTrue negative {}\n\n".format(cm[0][0],cm[0][1],cm[1][0],cm[1][1]))


#Applying k-Flod Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=clf, X=OH_X_train, y=y_train, cv=100)


print("Precision {}%".format(accuracies.mean()*100))
#Indica la varianza
print("Variance {}%".format(accuracies.std()*100))


TRAIN
True positives 292 
False positives 48 
False negative 69 
True negative 160


TEST
True positives 69 
False positives 15 
False negative 13 
True negative 46


Precision 78.96666666666667%
Variance 15.816271508938017%


## Submission

In [149]:
best_model.fit(X_testf,y_test)
y_send = best_model.predict(X_sendf)

data = {'PassengerId': X_passenger_id,
        'Survived': y_send}

df = pd.DataFrame (data)
df.to_csv('submission.csv',index=False)
