In [1]:
#classifiers
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
#preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

#evaluation
from sklearn.model_selection import cross_val_score
from scipy.stats.stats import pearsonr

In [2]:
#load data
train = pd.read_csv("../input/train.csv", dtype={"Age": np.float64}, )
test = pd.read_csv("../input/test.csv", dtype={"Age": np.float64}, )

In [3]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [5]:
#dealing with missing values in 'Age'
train.Age.fillna(train.Age.mean(), inplace=True)

#dealing with missing values in 'embarked'
#print(len(X.Embarked))
train.Embarked.fillna('S', inplace=True)

In [6]:
#label encoding for gender
le = preprocessing.LabelEncoder()
le.fit(pd.unique(train.Sex))
#le.classes_ for printing distinct classes
sex_t = le.transform(train.Sex)
test_sex_t = le.transform(test.Sex)

#label encoding for embarked
le = preprocessing.LabelEncoder()
le.fit(pd.unique(train.Embarked))
embarked_t = le.transform(train.Embarked)
test_embarked_t = le.transform(test.Embarked)

In [7]:
Y = train.iloc[:,1]
train = train.iloc[:,[2,4,5,6,7,9,11]]
test = test.iloc[:,[1,3,4,5,6,8,10]]


train.iloc[:,1] = sex_t
test.iloc[:,1] = test_sex_t

train.iloc[:,6] = embarked_t
test.iloc[:,6] = test_embarked_t


print(train.head(3))
print(test.head(3))

   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0       3    1  22.0      1      0   7.2500         2
1       1    0  38.0      1      0  71.2833         0
2       3    0  26.0      0      0   7.9250         2
   Pclass  Sex   Age  SibSp  Parch    Fare  Embarked
0       3    1  34.5      0      0  7.8292         1
1       3    0  47.0      1      0  7.0000         2
2       2    1  62.0      0      0  9.6875         1


In [8]:
#normalize/scale data
train = train.fillna(method='ffill')
test = test.fillna(method='ffill')
scaler = StandardScaler()
scaler.fit(train.values)
X_train = scaler.transform(train)
X_test = scaler.transform(test)

In [9]:
# pearson's coefficient
pclassco = pearsonr(Y, X_train[:,0])
sexco = pearsonr(Y, X_train[:,1])
ageco = pearsonr(Y, X_train[:,2])
sibspco = pearsonr(Y, X_train[:,3])
parchco = pearsonr(Y, X_train[:,4])
embarkedco = pearsonr(Y, X_train[:,5])
family = pearsonr(Y,X_train[:,3] + X_train[:,4])
print(pclassco)
print(sexco)
print(ageco)
print(sibspco)
print(parchco)
print(embarkedco)
print(family)

(-0.33848103596101531, 2.5370473879804202e-25)
(-0.54335138065775512, 1.4060661308795969e-69)
(-0.069808515287143144, 0.03721708372683364)
(-0.035322498885735562, 0.29224392869817906)
(0.081629407083483485, 0.0147992453747224)
(0.25730652238496243, 6.1201893419218733e-15)
(0.027528178502442997, 0.41181275255354965)


In [10]:
#svm with grid search
svm = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':(1,0.25,0.5,0.75),'gamma': (1,2,3,'auto'),'decision_function_shape':('ovo','ovr'),'shrinking':(True,False)}
clf = GridSearchCV(svm, parameters)
clf.fit(X_train,Y)
print("accuracy:"+str(np.average(cross_val_score(clf, X_train, Y, scoring='accuracy'))))
print("f1:"+str(np.average(cross_val_score(clf, X_train, Y, scoring='f1'))))


accuracy:0.820426487093
f1:0.747193530852
