In [236]:
import numpy as np
import pandas as pd
import sklearn.linear_model as sk_lin
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

%matplotlib notebook

#### Take a look at the data

In [237]:
df = pd.read_csv("train.csv")


###### Are there any outliers in the scaler fields that we have?
The Age field look fine, but the Fare field evidences a large number of "outliers" with three in particular being twice the size of their next largest neighbour. I investigated these data points in order to understand if they are a problem in the data, or whether they are real and researching the names of the passengers, revealed that those particular passengers had indeed been staying in the most luxurious suite on the ship, hence the expensive fare.

#### Drop fields we don't need.
Dropped passenger Ids and names as those are irrelevant for the purpose of Logistic Regression. Also dropped cabin. Extracting the deck on which the passenger was housed might be relevant, however the data in this column is quite sparse and inconsistent, so it is not clear how to impute the missing values.

In [238]:
df = df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
df.head()



Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


#### Some data cleaning
The Sex field is mapped to 0 and 1 while Age, SibSp, Parch and Fare fields are normalized ((x-min(x))/span(x)). PClass and embarked, being multi-class fields are split into |classes|-1 binary fields (one-hot encoding). Also any NaN fields are imputed to 0.

In [245]:
# Convert Sex field to 1/0
df = df.fillna(0)

sexmap = {"male":1, "female":0}
df["Sex"] = df["Sex"].map(sexmap)

# Normalize Age
#span = np.max(df["Age"]) - np.min(df["Age"])

#df["Age"] = (df["Age"]-np.min(df["Age"]))/span

# Tried joining SibSp and Parch into a single "Family" field, but results in lower accuracy
#df["Family"] = df["SibSp"]+df["Parch"]
#span = np.max(df["Family"]) - np.min(df["Family"])
#df["Family"] = (df["Family"]-np.min(df["Family"]))/span

scaler = preprocessing.StandardScaler().fit(df[["SibSp", "Parch", "Fare", "Age"]])
df.loc[:,["SibSp", "Parch", "Fare", "Age"]] = scaler.transform(df[["SibSp", "Parch", "Fare", "Age"]])
# Normalize SibSp
#span = np.max(df["SibSp"]) - np.min(df["SibSp"])
#
#df["SibSp"] = (df["SibSp"]-np.min(df["SibSp"]))/span
#
## Normalize Parch
#span = np.max(df["Parch"]) - np.min(df["Parch"])
#
#df["Parch"] = (df["Parch"]-np.min(df["Parch"]))/span
#
## Normalize Fare
#span = np.max(df["Fare"]) - np.min(df["Fare"])
#
#df["Fare"] = (df["Fare"]-np.min(df["Fare"]))/span



# Encode categorical values in 1-hot encoding
#pclass_1hot = pd.get_dummies(df["Pclass"])
#pclass_1hot.columns = ["pClass_"+str(pclass_1hot.columns[i]) for i in range(0,len(pclass_1hot.columns))]
#df=df.join(pclass_1hot.iloc[:,0:-1])

enc = preprocessing.OneHotEncoder()
enc.fit(df[["Pclass"]]);

df.describe()

# Commented out as the "embarcation port" does not seem to make any difference to the accuracy
#embarked_1hot = pd.get_dummies(df["Embarked"])
#embarked_1hot.columns = ["Embarked_"+str(embarked_1hot.columns[i]) for i in range(0, len(embarked_1hot.columns))]
#df=df.join(embarked_1hot.iloc[:,0:-1])

df = df.drop(["Pclass", "Embarked"], axis=1)



# Drop outliers in Fare -> Commented out as results in worse accuracy
#q75, q25 = np.percentile(df["Fare"], [75 ,25])
#iqr = q75 - q25
#df = df[df["Fare"] <= iqr*3]



TypeError: fit() takes at least 2 arguments (1 given)

#### The new shape of the data set...

In [240]:
df.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,pClass_1,pClass_2
0,0,1,-0.102313,0.432793,-0.473674,-0.502445,0,0
1,1,0,0.807492,0.432793,-0.473674,0.786845,1,0
2,1,0,0.125138,-0.474545,-0.473674,-0.488854,0,0
3,1,0,0.636903,0.432793,-0.473674,0.42073,1,0
4,0,1,0.636903,-0.474545,-0.473674,-0.486337,0,0


## Set up Logistic Regression with skLearn
Set up training and testing sets

In [241]:
Y = df["Survived"].values
X=df.iloc[:,1:]
## Split df into test and train datasets (70%/30%)
#
#splitNdx = int(df.shape[0]*0.7)
#X_train = np.matrix(df.iloc[0:splitNdx,1:].values)
#Y_train = df.iloc[0:splitNdx, 0].values
#X_test = np.matrix(df.iloc[splitNdx:, 1:].values)
#Y_test = df.iloc[splitNdx:, 0].values

#print X_train.shape
#print X_test.shape


In [242]:
logReg = sk_lin.LogisticRegression(penalty="l1", C=0.1)

#logReg.fit(X_train, Y_train)

In [243]:
scores = cross_val_score(logReg, X, Y, cv=10)
scores
#Y_hat = logReg.predict(X_test)

array([ 0.8       ,  0.77777778,  0.7752809 ,  0.84269663,  0.7752809 ,
        0.76404494,  0.7752809 ,  0.76404494,  0.83146067,  0.78409091])

In [244]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#comp = Y_hat == Y_test

#print "Accracy=", (float(np.sum(comp.astype(int)))/len(comp))*100, "%"

Accuracy: 0.79 (+/- 0.05)
