In [1]:
# Loading data

import numpy as np
import pandas as pd

# Read the training and test tables from the local input/ directory.
train, test = pd.read_csv('input/train.csv'), pd.read_csv('input/test.csv')

In [2]:
# Exploring data

# Target vector: the Survived column of the training table.
y_train = train.Survived

# Feature tables without the target, the passenger id and the ticket string.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally.
X_train = train.drop(['Survived', 'PassengerId', 'Ticket'], axis=1)
X_test = test.drop(['PassengerId', 'Ticket'], axis=1)

# Types of data
print(X_train.dtypes, '\n')

# Fraction of survived people in train dataset (mean of the Survived column)
print('Survived', y_train.mean(), '\n')

print('X_train.shape = ', X_train.shape)

Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked     object
dtype: object 

Survived 0.3838383838383838 

X_train.shape =  (891, 9)


In [3]:
# Decoding names

def decodeTitle(name, sex):
    """Return the passenger's title: 'Mr.', 'Mrs.' or 'Miss.'.

    ``name`` is searched for the three titles in that order and the first one
    found is returned. When none of them occurs, the title is derived from
    ``sex``: 'male' gives 'Mr.', 'female' gives 'Miss.', anything else gives
    None.
    """
    for title in ('Mr.', 'Mrs.', 'Miss.'):
        if name.find(title) >= 0:
            return title
    # Author's note on resulting counts for the test data (not verified here):
    # 125 Mrs., 182 Miss, 517 Mr.
    if sex == 'male':
        return 'Mr.'
    return 'Miss.' if sex == 'female' else None

# One title per passenger, in row order. Iterating over the column values
# (instead of looking rows up by the labels 0..n-1) does not rely on the
# frames having a default integer index.
title_train = [decodeTitle(name, sex)
               for name, sex in zip(X_train.Name, X_train.Sex)]
title_test = [decodeTitle(name, sex)
              for name, sex in zip(X_test.Name, X_test.Sex)]

In [4]:
# Decoding cabins

def decodeCabin(name):
    """Split a cabin string into a deck symbol and an average cabin number.

    Parameters
    ----------
    name : str or missing value
        Whitespace-separated cabin codes such as 'C85' or 'B57 B59'.

    Returns
    -------
    (symbol, number) : tuple
        ``symbol`` is the first character of the first cabin code, or 'X' when
        the value is missing or contains no code at all.
        ``number`` is the mean of the numeric parts of all codes, or NaN when
        the value is missing or any code has no numeric part (e.g. 'F G73').
    """
    if pd.isnull(name):
        return 'X', np.nan

    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so no empty tokens are produced (split(' ') would yield '' entries).
    cabins = name.split()
    if not cabins:
        # Empty or whitespace-only string: treat like a missing cabin.
        return 'X', np.nan

    averageCabin = 0.0
    for cabin in cabins:
        digits = cabin[1:]
        if digits == '':
            # A code without a number makes the average undefined.
            averageCabin = np.nan
            break
        averageCabin = averageCabin + float(digits) / float(len(cabins))
    return cabins[0][0], averageCabin

# Decode every cabin string once, then split the (symbol, number) pairs into
# the two parallel lists. Iterating over the column values keeps row order
# without relying on a default integer index.
decoded_train = [decodeCabin(cabin) for cabin in X_train.Cabin]
cabinSym_train = [symbol for symbol, _ in decoded_train]
cabinNum_train = [number for _, number in decoded_train]

decoded_test = [decodeCabin(cabin) for cabin in X_test.Cabin]
cabinSym_test = [symbol for symbol, _ in decoded_test]
cabinNum_test = [number for _, number in decoded_test]

In [5]:
# Putting the decoded columns into the tables

# Add the three derived columns, then drop the raw text columns they replace.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally.
X_train = X_train.assign(CabinSymbol=cabinSym_train,
                         CabinNumber=cabinNum_train,
                         Title=title_train)
X_train = X_train.drop(['Name', 'Cabin'], axis=1)

X_test = X_test.assign(CabinSymbol=cabinSym_test,
                       CabinNumber=cabinNum_test,
                       Title=title_test)
X_test = X_test.drop(['Name', 'Cabin'], axis=1)

In [6]:
# Transforming data

numeric_cols = ['Age', 'Fare', 'CabinNumber']
# Every remaining column is treated as categorical. The comprehension keeps
# the columns in table order (a set difference has no defined order).
categorial_cols = [col for col in X_train.columns if col not in numeric_cols]

# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally.
X_num = X_train.drop(categorial_cols, axis=1)
X_cat = X_train.drop(numeric_cols, axis=1)
X_test_num = X_test.drop(categorial_cols, axis=1)
X_test_cat = X_test.drop(numeric_cols, axis=1)

print('X_cat.shape', X_cat.shape)
print('X_num.shape', X_num.shape)

# Report which of the four tables still contain missing values.
print('X_cat.isnull =', X_cat.isnull().values.any())
print('X_num.isnull =', X_num.isnull().values.any())
print('X_test_cat.isnull =', X_test_cat.isnull().values.any())
print('X_test_num.isnull =', X_test_num.isnull().values.any())

X_cat.shape (891, 7)
X_num.shape (891, 3)
X_cat.isnull = True
X_num.isnull = True
X_test_cat.isnull = False
X_test_num.isnull = True


In [7]:
# Getting rid of missing values in numeric columns

# Missing numeric values are replaced by 0. A possible alternative is the
# training-column mean, e.g. np.mean(X_num['Age']).
missingAge = 0
missingFare = 0
missingCabinNumber = 0

# The same per-column fill values are applied to the train and test tables.
numeric_fill = {
    'Age': missingAge,
    'Fare': missingFare,
    'CabinNumber': missingCabinNumber,
}
X_num = X_num.fillna(numeric_fill)
X_test_num = X_test_num.fillna(numeric_fill)

print('Missing Age Value =', missingAge)
print('Missing Fare Value =', missingFare)
print('Missing Cabin Number Value =', missingCabinNumber)

# Getting rid of missing values in categorical data.
# The placeholder is applied to both tables so train and test are always
# encoded the same way, even if the test table contains missing categories.
X_cat = X_cat.fillna('NA')
X_test_cat = X_test_cat.fillna('NA')

print('X_num.isnull =', X_num.isnull().values.any())
print('X_cat.isnull =', X_cat.isnull().values.any())
print('X_test_cat.isnull =', X_test_cat.isnull().values.any())
print('X_test_num.isnull =', X_test_num.isnull().values.any())

Missing Age Value = 0
Missing Fare Value = 0
Missing Cabin Number Value = 0
X_num.isnull = False
X_cat.isnull = False
X_test_cat.isnull = False
X_test_num.isnull = False


In [8]:
# Scaling numerical data

from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training columns only and then applied to both
# the training and the test columns.
scaler = StandardScaler().fit(X_num)
X_num_sc, X_test_num_sc = scaler.transform(X_num), scaler.transform(X_test_num)
print(X_num_sc.shape, '\n')
print(X_num_sc, '\n')

(891, 3) 

[[-0.10231279 -0.50244517 -0.41461294]
 [ 0.80749164  0.78684529  2.77149748]
 [ 0.12513832 -0.48885426 -0.41461294]
 ..., 
 [-1.35329389 -0.17626324 -0.41461294]
 [ 0.12513832 -0.04438104  5.13296756]
 [ 0.46631498 -0.49237783 -0.41461294]] 



In [9]:
# Count matching rows with a vectorized sum over the boolean mask.
print('Number of Mrs.', (X_train.Title == 'Mrs.').sum())
print('Number of Miss.', (X_train.Title == 'Miss.').sum())
print('Number of Mr.', (X_train.Title == 'Mr.').sum())
print('Number of unknown cabin', (X_train.CabinSymbol == 'X').sum())
print('Passangers in train dataset is', X_train.shape[0])

Number of Mrs. 125
Number of Miss. 189
Number of Mr. 577
Number of unknown cabin 687
Passangers in train dataset is 891


In [10]:
# Encoding categorical data

from sklearn.feature_extraction import DictVectorizer

# Each row is passed to the vectorizer as one {column: value} dict.
# orient='records' builds that list directly, without transposing the
# mixed-dtype table and without keying rows by index label (which would
# silently merge rows that share a label).
encoder = DictVectorizer(sparse=False)
X_cat_oh = encoder.fit_transform(X_cat.to_dict(orient='records'))
X_test_cat_oh = encoder.transform(X_test_cat.to_dict(orient='records'))

print(X_cat_oh.shape, '\n')
print(X_cat_oh, '\n')

(891, 21) 

[[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  1. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  1.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  1. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]] 



In [17]:
# Feature selection through a logistic classifier with l1 penalty

try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # scikit-learn < 0.18 only provides the old module; it was removed in 0.20.
    from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

# Numeric (scaled) columns come first, one-hot columns second; rows are
# shuffled with a fixed seed before cross-validation.
X_shuffled, y_shuffled = shuffle(np.hstack([X_num_sc, X_cat_oh]), y_train, random_state=118)
print('X_shuffled.shape =', X_shuffled.shape)

# The solver is named explicitly: newer scikit-learn defaults to 'lbfgs',
# which rejects an l1 penalty, while 'liblinear' supports it.
optimizer = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    {'C': [0.0001, 0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0]},
    cv=30,
)
optimizer.fit(X_shuffled, y_shuffled)
print('Logistic Classifier Best Result', optimizer.best_score_)
print('Best Params', optimizer.best_params_)
print('Best Weights', optimizer.best_estimator_.coef_)
print(numeric_cols, encoder.feature_names_)

X_shuffled.shape = (891, 24)
Logistic Classifier Best Result 0.79012345679
Best Params {'C': 0.5}
Best Weights [[-0.30075128  0.13655121  0.02764731  0.          0.         -0.21001124
   0.20889299  0.62232771  0.10281364  0.          0.         -0.4445567
   0.3336803   0.          0.          0.         -0.06721765 -0.67143394
   2.50103768  0.         -0.25058911  0.          0.          0.40299366]]
['Age', 'Fare', 'CabinNumber'] ['CabinSymbol=A', 'CabinSymbol=B', 'CabinSymbol=C', 'CabinSymbol=D', 'CabinSymbol=E', 'CabinSymbol=F', 'CabinSymbol=G', 'CabinSymbol=T', 'CabinSymbol=X', 'Embarked=C', 'Embarked=NA', 'Embarked=Q', 'Embarked=S', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp', 'Title=Miss.', 'Title=Mr.', 'Title=Mrs.']


In [19]:
# The first X_num_sc.shape[1] coefficients belong to the numeric columns;
# the remaining ones line up with the encoder's feature names.
categorical_weights = optimizer.best_estimator_.coef_[0][X_num_sc.shape[1]:]
for feature_name, weight in zip(encoder.feature_names_, categorical_weights):
    print(feature_name, weight)

CabinSymbol=A 0.0
CabinSymbol=B 0.0
CabinSymbol=C -0.210011235546
CabinSymbol=D 0.208892985769
CabinSymbol=E 0.622327710605
CabinSymbol=F 0.102813642342
CabinSymbol=G 0.0
CabinSymbol=T 0.0
CabinSymbol=X -0.444556695321
Embarked=C 0.333680300048
Embarked=NA 0.0
Embarked=Q 0.0
Embarked=S 0.0
Parch -0.0672176538028
Pclass -0.671433937093
Sex=female 2.50103767978
Sex=male 0.0
SibSp -0.250589112457
Title=Miss. 0.0
Title=Mr. 0.0
Title=Mrs. 0.402993661805


In [20]:
def writeAnswer(filename, optimizer):
    """Predict survival for the test passengers and write a submission CSV.

    Parameters
    ----------
    filename : str
        Path of the CSV file to create.
    optimizer : object
        Fitted estimator (or search object) exposing ``predict``.

    Reads the notebook-level ``X_test_num_sc``, ``X_test_cat_oh`` and ``test``.
    The file has the columns ``PassengerId`` and ``Survived`` and no index.
    """
    # The model was fitted on the *scaled* numeric columns (X_num_sc) followed
    # by the one-hot columns, so the test matrix must be built the same way.
    # Feeding the unscaled X_test_num would put Age/Fare/CabinNumber on a
    # different scale than the one the weights were learned on.
    test_features = np.hstack([X_test_num_sc, X_test_cat_oh])
    prediction = optimizer.predict(test_features)
    submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": prediction,
    })
    submission.to_csv(filename, index=False)


writeAnswer('LogisticRegressionClassifier.csv', optimizer)