In [1]:
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV

In [2]:
# RESULTS
def resultados(labelsTeste, predicoes):
    """Score predictions against reference labels.

    Args:
        labelsTeste: Reference labels.
        predicoes: Predicted labels, scored against ``labelsTeste``.

    Returns:
        list: ``[accuracy, f1, precision, recall]``; the last three are
        computed with ``average='macro'``.
    """
    macro_scorers = (metrics.f1_score, metrics.precision_score, metrics.recall_score)
    scores = [metrics.accuracy_score(labelsTeste, predicoes)]
    scores.extend(scorer(labelsTeste, predicoes, average='macro') for scorer in macro_scorers)
    return scores

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

In [4]:
from keras.utils import to_categorical
# Load the Titanic CSVs: the training set, the test set, and the file whose second
# column is later used as the test labels when scoring the classifier.
original_train = pd.read_csv('/kaggle/input/titanic/train.csv')
original_test = pd.read_csv('/kaggle/input/titanic/test.csv')
# NOTE(review): gender_submission.csv is presumably Kaggle's sample submission rather than
# ground-truth labels -- confirm before trusting any metric computed against it.
label_test = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

In [5]:
# Split the training frame into labels (PassengerId + Survived) and features.
label_train = original_train[['PassengerId', 'Survived']]
data_train = original_train.drop(columns=['Survived'])

# Work on a copy: a plain alias would let any in-place column assignment on data_test
# silently modify original_test, which is read again when the submission is built.
data_test = original_test.copy()

data_train

In [6]:
# Count the missing values in each column of the training features
data_train.isnull().sum()

In [7]:
# Count the missing values in each column of the test features
data_test.isnull().sum()

In [8]:
# Mining PCLASS: one-hot encode the passenger class, dropping the first level as baseline.
class_train = pd.get_dummies(data_train['Pclass'], prefix='class', drop_first=True)
data_train = pd.concat([data_train, class_train], axis=1).drop(['Pclass'], axis=1)

# Encode the test set against the columns produced for the training set, so both frames
# end up with the same dummy columns even when a level is missing from (or only present
# in) the test data. Levels absent from test become all-zero columns.
class_test = (
    pd.get_dummies(data_test['Pclass'], prefix='class')
    .reindex(columns=class_train.columns, fill_value=0)
)
data_test = pd.concat([data_test, class_test], axis=1).drop(['Pclass'], axis=1)

data_train

In [9]:
# Mining SEX: one-hot encode the passenger sex, dropping the first level as baseline.
sex_train = pd.get_dummies(data_train['Sex'], prefix='sex', drop_first=True)
data_train = pd.concat([data_train, sex_train], axis=1).drop(['Sex'], axis=1)

# Encode the test set against the training columns so the two frames cannot drift apart
# (independent drop_first calls may drop different levels when the data differ).
sex_test = (
    pd.get_dummies(data_test['Sex'], prefix='sex')
    .reindex(columns=sex_train.columns, fill_value=0)
)
data_test = pd.concat([data_test, sex_test], axis=1).drop(['Sex'], axis=1)

data_train

In [10]:
# Mining SIBSP and PARCH: collapse both into a categorical family-size feature.
def _family_size_label(frame):
    """Label each row's family as 'Big' (> 4 members), 'Small' (2-4) or 'Single' (1).

    The member count is SibSp + Parch + 1; the result is a Series named 'FamilySize'
    that shares the index of ``frame``.
    """
    members = frame['SibSp'] + frame['Parch'] + 1
    labels = np.where(members > 4, 'Big', np.where(members > 1, 'Small', 'Single'))
    return pd.Series(labels, index=frame.index, name='FamilySize')


family_train = pd.get_dummies(_family_size_label(data_train), prefix='FamilySize', drop_first=True)
data_train = pd.concat([data_train, family_train], axis=1).drop(['SibSp', 'Parch'], axis=1)

# The test dummies are aligned to the training columns so both frames always expose the
# same FamilySize_* features, whatever family sizes happen to occur in the test data.
family_test = (
    pd.get_dummies(_family_size_label(data_test), prefix='FamilySize')
    .reindex(columns=family_train.columns, fill_value=0)
)
data_test = pd.concat([data_test, family_test], axis=1).drop(['SibSp', 'Parch'], axis=1)

data_train

In [11]:
# Replace missing Embarked values in the training set with 'S' (stated to be the mode)
data_train = data_train.assign(Embarked=data_train['Embarked'].fillna('S'))

In [12]:
# Mining EMBARKED: one-hot encode the embarkation port, dropping the first level as baseline.
embark_train = pd.get_dummies(data_train['Embarked'], prefix='embark', drop_first=True)
data_train = pd.concat([data_train, embark_train], axis=1).drop(['Embarked'], axis=1)

# Encode the test set against the training columns so both frames carry identical
# embark_* features even if the ports present in the two sets differ.
embark_test = (
    pd.get_dummies(data_test['Embarked'], prefix='embark')
    .reindex(columns=embark_train.columns, fill_value=0)
)
data_test = pd.concat([data_test, embark_test], axis=1).drop(['Embarked'], axis=1)

data_train

In [13]:
# Re-check the per-column missing-value counts of the training features
data_train.isnull().sum()

In [14]:
# Mining FARE (training set): bucket the fare into bins of width 10 via floor(Fare / 10).
# Note that, despite its name, 'FareMean' holds this bin index, not a mean.
data_train['FareMean'] = np.floor(data_train['Fare'] / 10.)
data_train = data_train.drop(['Fare'], axis=1)

# Mining FARE (test set): same binning. Missing values are imputed with the median bin of
# the TRAINING set, so that no statistic computed on the test data enters preprocessing.
data_test['FareMean'] = np.floor(data_test['Fare'] / 10.)
data_test['FareMean'] = data_test['FareMean'].fillna(data_train['FareMean'].median())
data_test = data_test.drop(['Fare'], axis=1)

data_train

In [15]:
# Mining NAME: reduce each name to its title and group the rare titles together.

# Titles folded into 'Others'. 'an' is what the extraction below produces when there is
# no text after a comma (e.g. a missing Name: NaN -> 'nan' -> first character stripped),
# so it is handled for both the training and the test set.
_RARE_TITLES = ['an', 'Don', 'Dona', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Lady', 'Sir',
                'Mlle', 'Col', 'Capt', 'the Countess', 'Jonkheer']


def _extract_title(names):
    """Return the title found in each name, with rare titles replaced by 'Others'.

    Assumes names look like 'Surname, Title. Given names' -- the title is the text
    between the first comma and the following dot, minus its leading space.
    """
    title = names.astype(str).str.split(',').str[1]  # title with leading space and trailing dot
    title = title.astype(str).str.split('.').str[0]  # title with leading space
    title = title.astype(str).str[1:]                # title only
    return title.replace(_RARE_TITLES, 'Others')


# Training set
data_train['NameTitle'] = _extract_title(data_train['Name'])
data_train = data_train.drop(['Name'], axis=1)

# Test set (same extraction and the same rare-title list as the training set)
data_test['NameTitle'] = _extract_title(data_test['Name'])
data_test = data_test.drop(['Name'], axis=1)

data_train

In [16]:
# Distinct titles left in the test set after grouping
data_test['NameTitle'].unique()

In [17]:
# Mining NAMETITLE: one-hot encode the title, dropping the first level as baseline.
title_train = pd.get_dummies(data_train['NameTitle'], prefix='title', drop_first=True)
data_train = pd.concat([data_train, title_train], axis=1).drop(['NameTitle'], axis=1)

# Encode the test set against the training columns: a title that only occurs in one of the
# two sets would otherwise give the frames different feature columns and break prediction.
title_test = (
    pd.get_dummies(data_test['NameTitle'], prefix='title')
    .reindex(columns=title_train.columns, fill_value=0)
)
data_test = pd.concat([data_test, title_test], axis=1).drop(['NameTitle'], axis=1)

data_train

In [18]:
# Mining AGE: impute missing ages with the median age of the TRAINING set. The same value
# is applied to the test set so that both sets share one imputation value and no statistic
# computed on the test data enters preprocessing.
age_median = data_train['Age'].median()
data_train['Age'] = data_train['Age'].fillna(age_median)

data_test['Age'] = data_test['Age'].fillna(age_median)

data_train

In [19]:
# Remove PassengerId, Ticket and Cabin from both sets (Cabin: too many missing values)
unused_columns = ['PassengerId', 'Ticket', 'Cabin']

data_train = data_train.drop(columns=unused_columns)
data_test = data_test.drop(columns=unused_columns)

data_train

In [20]:
# Final per-column missing-value counts for the training features
data_train.isnull().sum()

In [21]:
# Final per-column missing-value counts for the test features
data_test.isnull().sum()

In [22]:
# Show the dtype of every training column (sampled without replacement, so in random order)
data_train.dtypes.sample(n=data_train.shape[1])

In [23]:
# Keep the names of the training features for later use
feature_list = data_train.columns.tolist()
feature_list

In [24]:
#Separar os dados dos labels

# Convert the feature frames to float matrices. The cast is explicit because the frames mix
# float columns with dummy columns: when the dummies are boolean (the pandas >= 2.0 default)
# a plain np.array(frame) yields an object-dtype array, which breaks np.isnan checks.
train_X = data_train.to_numpy(dtype=float)  # train data
train_Y = np.array(label_train.iloc[:, 1])  # train labels (second column: Survived)

test_X = data_test.to_numpy(dtype=float)  # test data
# NOTE(review): assumes the rows of label_test are in the same order as the test set -- confirm.
test_Y = np.array(label_test.iloc[:, 1])  # test labels (second column of label_test)

In [25]:
# Report the shapes of the feature matrices and of their label vectors
for caption, features, targets in (
    ('Training data shape : ', train_X, train_Y),
    ('Testing data shape : ', test_X, test_Y),
):
    print(caption, features.shape, targets.shape)

In [26]:
# Flag, for each feature column, whether the training matrix contains any NaN.
# isnan is applied element-wise (after a float cast, so an object-dtype matrix also works)
# rather than to the column sums, which would also turn into NaN for inf + (-inf).
np.isnan(train_X.astype(float)).any(axis=0)

In [27]:
# Flag, for each feature column, whether the test matrix contains any NaN.
# isnan is applied element-wise (after a float cast, so an object-dtype matrix also works)
# rather than to the column sums, which would also turn into NaN for inf + (-inf).
np.isnan(test_X.astype(float)).any(axis=0)

In [28]:
# Determine which classes occur in the training labels, and how many there are
classes = np.unique(train_Y)
nClasses = classes.size
print('Total number of outputs : ', nClasses)
print('Output classes : ', classes)

In [29]:
# Hold out 10% of the training data as a validation split (seeded, so the split is repeatable)
dataNoLabels, labels = train_X, train_Y

train_data, valid_data, train_label, valid_label = train_test_split(
    dataNoLabels,
    labels,
    test_size=0.1,
    random_state=13,
)

labels.size

In [30]:
# Hyper-parameter grid for the SVC search, plus an unfitted estimator to search over
param_grid = dict(
    kernel=['rbf', 'linear'],
    C=[1, 10, 100, 1000],
    gamma=[1, 1e-1, 1e-3, 1e-4],
)
clf = SVC()

In [31]:
#CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 3) # search with 3-fold CV
#CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, refit = True, verbose=2) # search with the default CV (cv is not set, so GridSearchCV still cross-validates)
#CV_clf.fit(train_data, np.ravel(train_label,order='C'))
#CV_clf.best_params_

In [32]:
# Train the model on the training split and validate it on the held-out split.
# NOTE(review): C=10000 lies outside the values listed in param_grid -- confirm it is intended.
clf = SVC(C=10000, gamma=0.0001, kernel='rbf').fit(train_data, np.ravel(train_label, order='C'))  # train SVC

# Score the fitted model itself. cross_val_predict would clone clf and refit the clones on
# folds of the validation data, so its predictions would not come from the model trained
# above and the reported metrics would not describe it.
predicoes = clf.predict(valid_data)

acc, f1s, precisao, recall = resultados(valid_label, predicoes)  # show results
print("\nEVALUATION METRICS\n"
      "acc: %0.4f - fscore: %0.4f - prec: %0.4f - recall: %0.4f\n" % (acc, f1s, precisao, recall))

In [33]:
# --------- TESTE --------- #

In [34]:
# Predict on the engineered test features with the fitted classifier
predicoes_teste = clf.predict(test_X)

# Score those predictions against test_Y.
# NOTE(review): test_Y comes from gender_submission.csv, presumably a sample submission
# rather than ground truth -- confirm before interpreting these numbers.
test_scores = resultados(test_Y, predicoes_teste)
acc, f1s, precisao, recall = test_scores
report = "\nEVALUATION METRICS\nacc: %0.4f - fscore: %0.4f - prec: %0.4f - recall: %0.4f\n"
print(report % (acc, f1s, precisao, recall))

In [35]:
# Assemble the submission table: each PassengerId next to its predicted Survived value
df_pass = original_test[['PassengerId']]
df_pred = pd.DataFrame({'Survived': predicoes_teste})
df_result = pd.concat([df_pass, df_pred], axis=1)
df_result

In [36]:
# Write the submission table to titanic.csv, omitting the DataFrame index
df_result.to_csv('titanic.csv',index=False)