In [2]:
import pandas as pd
pd.options.display.max_columns = 100
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline
from scipy.stats import chi2_contingency
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
def autolabel(arrayA, threshold=None):
    '''Write every cell's value, formatted to two decimals, onto the current plot.

    Meant to be called right after the matrix was drawn (e.g. with
    ``plt.imshow``): the label for ``arrayA[i, j]`` is placed at x=j, y=i.

    Parameters
    ----------
    arrayA : 2-D array-like of numbers
        The matrix whose values are written out.
    threshold : float, optional
        Cells whose value is strictly greater than ``threshold`` are labelled
        in black, all other cells in white. With the default ``None`` every
        label is white.
    '''
    values = np.array(arrayA)
    for i in range(values.shape[0]):
        for j in range(values.shape[1]):
            value = values[i, j]
            # Black text is only used when a threshold was explicitly given.
            use_black = threshold is not None and value > threshold
            plt.text(j, i, "%.2f" % value, ha='center', va='bottom',
                     color='k' if use_black else 'w')

def hist_it(feat, target=None):
    '''Overlay the normalised histograms of ``feat`` for target == 0 and target == 1.

    Parameters
    ----------
    feat : pandas.Series
        Numeric feature to plot; bins are unit-wide from ``int(min)`` to
        ``int(max) + 1``.
    target : pandas.Series, optional
        Binary target aligned with ``feat``. Defaults to the notebook-level
        ``y`` (the previous code referenced an undefined ``Y``).
    '''
    if target is None:
        target = y
    bins = range(int(feat.min()), int(feat.max() + 2))
    plt.figure(figsize=(16, 4))
    # `normed` was removed from matplotlib's hist; `density` is its replacement.
    feat[target == 0].hist(bins=bins, density=True, alpha=0.8)
    feat[target == 1].hist(bins=bins, density=True, alpha=0.5)
    plt.ylim((0, 1))
    
def gt_matrix(feats,sz=16):
    '''Plot, for every pair of features, how often the row feature exceeds the column feature.

    Cell (i, j) holds the share of rows of the notebook-level ``train`` frame
    (restricted to rows where both features are non-null) in which
    ``feats[i] >= feats[j]`` (for i >= j) or ``feats[i] > feats[j]`` (for i < j).
    The matrix is drawn on a new ``sz`` x ``sz`` figure and annotated with
    ``autolabel``.
    '''
    def share_greater(row_pos, row_feat, col_pos, col_feat):
        both_present = train[row_feat].notnull() & train[col_feat].notnull()
        left = train.loc[both_present, row_feat].values
        right = train.loc[both_present, col_feat].values
        if row_pos >= col_pos:
            return (left >= right).mean()
        return (left > right).mean()

    matrix = [
        [share_greater(i, c1, j, c2) for j, c2 in enumerate(feats)]
        for i, c1 in enumerate(feats)
    ]

    tick_positions = range(len(feats))
    plt.figure(figsize=(sz, sz))
    plt.imshow(matrix, interpolation='None')
    plt.xticks(tick_positions, feats, rotation=90)
    plt.yticks(tick_positions, feats, rotation=0)
    autolabel(matrix)

### Read the Data

In [2]:
# Both files are indexed by their first column (PassengerId).
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)
# Keep a handle on the target before the columns are renamed.
y = train['Survived']

In [3]:
# Work with lower-case column names in both frames.
for frame in (train, test):
    frame.columns = frame.columns.str.lower()

### Data Overview

In [4]:
train.head(10)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
print(train.shape)
print(test.shape)

(891, 11)
(418, 10)


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
survived    891 non-null int64
pclass      891 non-null int64
name        891 non-null object
sex         891 non-null object
age         714 non-null float64
sibsp       891 non-null int64
parch       891 non-null int64
ticket      891 non-null object
fare        891 non-null float64
cabin       204 non-null object
embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [7]:
#Check for NaN values first by row then by column
train.isnull().sum(axis=1).head(15)

PassengerId
1     1
2     0
3     1
4     0
5     1
6     2
7     0
8     1
9     1
10    1
11    0
12    0
13    1
14    1
15    1
dtype: int64

In [8]:
# Number of NaNs for each column
train.isnull().sum(axis=0)

survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
ticket        0
fare          0
cabin       687
embarked      2
dtype: int64

### Dataset cleaning and other things to check

**Remove constant features**
<br>We will now try to clean the dataset. It is usually convenient to concatenate train and test into one dataframe and do all feature engineering on the combined frame.
<br> However, remember that we usually know nothing about the test set, so we should not absorb any information from it.

In [None]:
# Drop features that take a single value in the training set: they carry no
# signal. The decision is made on train only and mirrored on test so both
# frames keep the same feature columns. (The previous version referenced an
# undefined `df`.)
constant_cols = [col for col in train.columns if train[col].nunique() == 1]
train = train.drop(columns=constant_cols)
test = test.drop(columns=constant_cols, errors='ignore')

In [None]:
# Look for duplicated features on a scratch copy, because the check
# label-encodes every categorical column. factorize() encodes NaN as -1, so two
# columns with NaN at the same positions are still recognised as duplicates.
train1 = train.copy()
print(train1.shape)

# Categorical columns: replace each value by its integer code.
object_cols = train1.select_dtypes(include=[object]).columns
for f in object_cols:
    codes, _uniques = train1[f].factorize()
    train1[f] = codes

# Identical columns become identical rows after transposing, which
# drop_duplicates() can remove.
transposed = train1.T
train1 = transposed.drop_duplicates().T
train1.shape, train.shape

### Determine Types

In [None]:
nunique = train.nunique(dropna=False)
nunique

In [None]:
# Build a normalized histogram of those values
plt.figure(figsize=(14,6))
_ = plt.hist(nunique.astype(float)/train.shape[0], bins=100)

In [None]:
mask = (nunique.astype(float)/train.shape[0] > 0.7)
train.loc[:10, mask]

In [None]:
mask = (nunique.astype(float)/train.shape[0] < 0.8) & (nunique.astype(float)/train.shape[0] > 0.4)
train.loc[:10, mask]

In [None]:
# It is useful to look at individual features that seem suspicious. `ticket`
# sits in the 0.4-0.8 unique-ratio band selected in the previous cell.
train['ticket'].value_counts()

In [None]:
# Split the feature names by dtype: object columns are treated as categorical,
# everything else as numerical.
cat_cols = train.select_dtypes(include=['object']).columns.tolist()
num_cols = train.select_dtypes(exclude=['object']).columns.tolist()

In [None]:
cat_cols

In [None]:
# Missing entries are real NaN values (see train.info() above), so replacing
# the *string* 'NaN' matched nothing. Build the -999 sentinel version with
# fillna on a separate frame: later cells (cabin flag, age/embarked imputation)
# rely on `train` still containing NaN.
train_filled = train.fillna(-999)

In [None]:
# For every pair of numeric features, compute how often one is greater than
# the other and show the result as a cross table.
# Use at most the first 42 numeric features.
feats = num_cols[0:42]

# Draw the 'mean(feat1 > feat2)' matrix.
gt_matrix(feats, sz=16)

We could have found that every second feature is greater than the previous one: not just the second, but, say, feature i+1 is always greater than feature i. This can happen when the features are counters over different periods of time. For example, the first feature is how many events happened in the first month, the second feature is how many events happened in the first two months, and so on, i.e. cumulative values. That is why one feature is always greater than the other. What we can extract from this kind of matrix is that these features form a group, and we can generate new features from it, for example the difference between two consecutive features. That is how we would recover the number of events in each month, going from cumulative values back to per-period values. Linear models and neural networks could learn this themselves, but tree-based algorithms could not.

In [None]:
# Distribution of the ticket fare, using 5 histogram bins.
sns.distplot(a=train['fare'], bins=5)

If we notice consecutive peaks, and by running value_counts we see that there are a lot of counts on 12, 24, 36, ..., we could create a feature modulo 12. There is something in there: the top values are 12, 24, 36, 60 and so on, so they can all be divided by 12. What can we do with that? We can generate features such as the value of this variable modulo 12, or the integer division of this variable by 12. This could really help. In another competition you could build such a variable and see the same pattern again. What happened there is that the organizers actually had quantized data, i.e. data that in our case could only be divided by 12 (12, 24 and so on), but they probably wanted to obfuscate the data and added some noise. That is why, if you plot a histogram, you still see the spikes but you also see something in between the spikes. Again, these features worked quite well in that competition: you could dequantize the values, and it could really help.

### Check if dataset is shuffled

In [1]:
# If the rows are shuffled, a rolling mean of the target plotted in file order
# should fluctuate around the global mean without any trend.
fig, ax = plt.subplots(figsize=(15, 6))
rolling_survival = train.survived.rolling(window=5, min_periods=1).mean()
ax.plot(train.index, rolling_survival, label='Rolling Mean')
ax.axhline(y=train.survived.mean(), color='r', linestyle='--', label='Mean')
ax.set_xlabel('Index', fontsize='x-large')
ax.set_ylabel('Survived', fontsize='x-large')
ax.set_xlim([-10, 900])
ax.set_title('Check if dataset is shuffled', fontsize='xx-large')
ax.legend(loc='best', framealpha=.9, fontsize='large')
# The old cell also carried pasted leftovers (`fig.gca()`, `numpy.arange`
# ticks, `plt.scatter(x, y)`) that referenced undefined names; only the grid
# is kept.
ax.grid(True)
plt.show()

NameError: name 'plt' is not defined

In [None]:
# Disabled experiment, kept as a bare string literal so it never executes.
# NOTE(review): it would not run as written -- the columns were lower-cased
# earlier in the notebook (so `train.Ticket` no longer exists), and `x[0]`
# looks like a one-character string being compared against the integers
# 1, 2, 3 -- confirm before re-enabling.
'''
# tickets starting with
train.Ticket.apply(lambda x: x[0]).value_counts()[0:5]
for i in [1,2,3]:
    train['ticket_on_'+str(i)] = train.Ticket.apply(lambda x: 1 if x[0]==i else 0)
'''

In [None]:
# Pairwise scatter plots of the numeric features. The top-level
# `pd.scatter_matrix` alias no longer exists; the function lives in
# `pd.plotting`.
axes = pd.plotting.scatter_matrix(train, figsize=[15, 15])

# Enlarge and rotate the axis labels so they stay readable on the grid.
for ax in axes.ravel():
    ax.set_xlabel(ax.get_xlabel(), fontsize=20, rotation=90)
    ax.set_ylabel(ax.get_ylabel(), fontsize=20, rotation=0)

In [None]:
# Correlate numeric columns only: `train` still holds text columns (name, sex,
# ticket, ...), which recent pandas refuses to silently drop inside corr().
numeric_corr = train.select_dtypes(include=[np.number]).corr()
sns.heatmap(numeric_corr, linewidths=.5, cmap="YlGnBu")

In [None]:
# Sorted mean of every numeric feature. The means are computed once, on
# numeric columns only, and the ticks follow the actual number of features
# instead of a hard-coded 6.
feature_means = train.select_dtypes(include=[np.number]).mean().sort_values()
ax = feature_means.plot(style='.')
ax.set_xticks(range(len(feature_means)))
ax.set_xticklabels(feature_means.index, rotation=90)

In [None]:
train.select_dtypes(exclude=[object]).columns

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
train.head()

In [None]:
# Class balance of the target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='survived', data=train, ax=ax)
ax.set_title("Count of Survival")

In [None]:
# Split the predictors (everything except the target) into text-typed and
# numeric column names, as input for the correlation analysis below.
features = train.drop(columns='survived')
is_text = (features.dtypes == object).values
cat = features.columns[is_text]
num = features.columns[~is_text]

In [None]:
# Pearson correlation between the numeric features.
corr_df = train[num]
cor = corr_df.corr(method='pearson')

fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Correlation Plot")
# An all-False mask hides nothing. The builtin `bool` replaces `np.bool`,
# which was removed from NumPy.
sns.heatmap(cor, mask=np.zeros_like(cor, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [None]:
# Chi-square test of independence between the target and each categorical
# feature (sex, embarked, pclass, in that order).
csq, csq2, csq3 = (
    chi2_contingency(pd.crosstab(train['survived'], train[col]))
    for col in ('sex', 'embarked', 'pclass')
)
for result in (csq, csq2, csq3):
    print("P-value: ", result[1])

# The p-values for sex, embarked and pclass are very low, so we can reject the
# null hypothesis that these features are independent of the target variable.

#### Visualization

In [None]:
# Female passengers have a higher probability of survival than male passengers.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='survived', hue='sex', data=train, ax=ax)
ax.set_ylim(0, 500)
ax.set_title("Impact of Sex on Survived")

In [None]:
# The survived / not-survived ratio is similar for ports S and Q, while
# passengers who embarked at C have higher chances of survival.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='survived', hue='embarked', data=train, ax=ax)
ax.set_ylim(0, 500)
ax.set_title("Impact of Embarked on Survived")

In [None]:
# Passengers from pclass 3 have lower chances of survival, while passengers
# from pclass 1 have higher chances of survival.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='survived', hue='pclass', data=train, ax=ax)
ax.set_ylim(0, 400)
ax.set_title("Impact of Pclass on Survived")

In [None]:
# The typical fare of passengers who survived is higher than that of those
# who did not.
fig, ax = plt.subplots(nrows=1, figsize=(8, 6))
sns.boxplot(x='survived', y='fare', data=train, ax=ax)
ax.set_ylim(0, 300)
ax.set_title("Survived vs Fare")

In [None]:
# Treat the passenger class as a label rather than a number, then look at the
# levels of the two low-cardinality categoricals.
train['pclass'] = train['pclass'].astype('str')
for col in ('pclass', 'sex'):
    print(train[col].value_counts())
# Both pclass and sex need to be one-hot encoded (done further below):
# train = pd.get_dummies(train, columns =['pclass', 'sex'])

In [None]:
# Fill missing ports with the most frequent port of the *training* set.
# Assigning the result back (instead of `inplace=True` on an attribute-accessed
# column) is the form that reliably updates the frame, including under pandas
# copy-on-write.
most_common_port = train['embarked'].value_counts().idxmax()
train['embarked'] = train['embarked'].fillna(most_common_port)
test['embarked'] = test['embarked'].fillna(most_common_port)

In [None]:
train.head()

In [None]:
# Normalise passenger names to lower case in both frames.
train['name'] = train['name'].apply(str.lower)
test['name'] = test['name'].apply(str.lower)
train.head()

In [None]:
# Extract the title (the first run of letters directly followed by a dot) from
# the lower-cased name, store it in a column and plot its counts.
# The pattern is a raw string: '\.' in a normal string literal is an invalid
# escape sequence. Names without a match yield NaN instead of raising.
title_pattern = r'([a-z]+)\.'
train['title'] = train['name'].str.extract(title_pattern, expand=False)
test['title'] = test['name'].str.extract(title_pattern, expand=False)
sns.countplot(x='title', data=train);
plt.xticks(rotation=45);

In [None]:
# Merge spelling variants into the main titles and collapse everything rare
# into 'others'. The French abbreviation of Mademoiselle is 'mlle' (the old
# key 'mille' never matched, so those passengers ended up in 'others').
title_aliases = {'mlle': 'miss', 'mme': 'mrs', 'ms': 'miss'}
common_titles = ['mr', 'mrs', 'miss', 'master', 'dr']
for frame in (train, test):
    cleaned = frame['title'].replace(title_aliases)
    frame['title'] = cleaned.where(cleaned.isin(common_titles), 'others')
train.head(10)

In [None]:
# One-hot encode the low-cardinality categoricals.
dummy_cols = ['pclass', 'sex', 'embarked', 'title']
train = pd.get_dummies(train, columns=dummy_cols)
test = pd.get_dummies(test, columns=dummy_cols)
# The two frames are encoded independently, so a category missing from one of
# them would give the model differently shaped inputs. Force test to expose
# exactly the training feature columns, in the same order; dummy columns that
# test lacks are filled with 0.
test = test.reindex(columns=train.columns.drop('survived'), fill_value=0)

In [None]:
# The raw name is no longer needed once the title has been extracted.
for frame in (train, test):
    frame.drop(columns=['name'], inplace=True)

In [None]:
train.head()

In [None]:
# Replace the sparse cabin column with a 0/1 "cabin is missing" indicator.
for frame in (train, test):
    frame['no_cabin'] = frame['cabin'].isnull().astype(int)
    frame.drop(columns=['cabin'], inplace=True)

In [None]:
train.head()

In [None]:
# Integer code per distinct ticket string, computed separately within each
# frame. The raw `ticket` column is kept for now.
for frame in (train, test):
    frame['ticket2'] = frame['ticket'].astype('category').cat.codes

In [None]:
train.head()

In [None]:
# Impute missing ages with the median age of the *training* set. The result
# is assigned back rather than using `inplace=True` on an attribute-accessed
# column, which does not reliably update the frame (e.g. under pandas
# copy-on-write).
age_median = train['age'].median()
train['age'] = train['age'].fillna(age_median)
test['age'] = test['age'].fillna(age_median)

In [None]:
def _add_ticket_and_family_features(frame):
    '''Return ``frame`` with ``ticket3`` and ``family_mem`` added and the raw ticket columns removed.

    ``ticket3`` is the share of rows of ``frame`` that carry the same ticket
    code (``ticket2``); ``family_mem`` is ``sibsp + parch``.
    '''
    ticket_share = frame['ticket2'].map(frame['ticket2'].value_counts(normalize=True))
    out = frame.drop(columns=['ticket', 'ticket2'])
    out['ticket3'] = ticket_share
    # Vectorised sum instead of a row-wise apply.
    out['family_mem'] = out['sibsp'] + out['parch']
    return out


# Same feature engineering for both frames; each frame uses its own ticket
# frequencies, as before.
train = _add_ticket_and_family_features(train)
test = _add_ticket_and_family_features(test)

In [None]:
train.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a third of the labelled data for validation.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='survived'),
    train.survived,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling on the training split only, then apply it to both splits.
# Both splits become NumPy arrays from here on.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train.shape,train.shape

In [None]:
# `sklearn.grid_search` was removed from scikit-learn; GridSearchCV lives in
# `sklearn.model_selection`, which this notebook already imports from elsewhere.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

dep = np.arange(1, 9)
param_grid = {'max_depth': dep, 'criterion': ['gini', 'entropy']}

# Seeded so the tuned parameters are reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated grid search over depth and split criterion.
clf_cv = GridSearchCV(clf, param_grid=param_grid, cv=5)
clf_cv.fit(X_train, y_train)

# Print the tuned parameters and scores.
print("Tuned Decision Tree Parameters: {}".format(clf_cv.best_params_))
print("Best score is {}".format(clf_cv.best_score_))
print("Test score is {}".format(clf_cv.score(X_test, y_test)))

In [None]:
test.info()

In [None]:
test1 = test.copy() # before becoming an array due to the scaler

In [None]:
# Refit the decision-tree search on ALL the labelled data to build a submission.
# This uses its own names (X_full, y_full, full_scaler): overwriting X_train /
# y_train / scaler here would make every later "test score" on X_test an
# evaluation on rows the model was trained on.
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

X_full = train.drop(columns='survived')
y_full = train['survived']
full_scaler = StandardScaler()
X_full_scaled = full_scaler.fit_transform(X_full)

# Fill the missing test fare(s) on a copy, assigning the result back instead
# of using a chained in-place fillna.
test_features = test.copy()
test_features['fare'] = test_features['fare'].fillna(test_features['fare'].median())
test1 = full_scaler.transform(test_features)

# Same search space as the validation run above.
clf = DecisionTreeClassifier(random_state=42)
clf_cv = GridSearchCV(clf, param_grid=param_grid, cv=5)
clf_cv.fit(X_full_scaled, y_full)

predictions = clf_cv.predict(test1)
submission = pd.DataFrame({'PassengerId': test.index,
                           'Survived': predictions})
submission.to_csv("submission.csv", index=False)
submission.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rfc = RandomForestClassifier(random_state=42)
# 'auto' is not listed for max_features: for classifiers it was an alias of
# 'sqrt', and recent scikit-learn releases reject it. The search space has its
# own name so the decision-tree `param_grid` is not clobbered.
rf_param_distributions = {
    'n_estimators': [100, 150, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 4, 5, 6, 7, 8],
}

# Seeded so the 5 sampled candidates are the same on every run.
rfc_cv = RandomizedSearchCV(estimator=rfc, param_distributions=rf_param_distributions,
                            cv=5, n_iter=5, random_state=42)
rfc_cv.fit(X_train, y_train)

# Print the tuned parameters and scores.
print("Tuned Random Forest Parameters: {}".format(rfc_cv.best_params_))
print("Best score is {}".format(rfc_cv.best_score_))
print("Test score is {}".format(rfc_cv.score(X_test, y_test)))

In [None]:
from xgboost.sklearn import XGBClassifier
import scipy.stats as st
from sklearn.model_selection import RandomizedSearchCV

# Sampling distributions for the search.
one_to_left = st.beta(10, 1)          # values in (0, 1), concentrated near 1
from_zero_positive = st.expon(0, 50)  # non-negative values with scale 50

params = {
    "n_estimators": st.randint(50, 200),
    "max_depth": st.randint(3, 15),
    "learning_rate": st.uniform(0.05, 0.4),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

xgb = XGBClassifier()

# Seeded so the 15 sampled candidates are the same on every run.
gs = RandomizedSearchCV(xgb, params, cv=5, n_iter=15, scoring='accuracy',
                        n_jobs=-1, random_state=42)
gs.fit(X_train, y_train)

# Print the tuned parameters and scores.
print("Tuned XGBoost Parameters: {}".format(gs.best_params_))
print("Best score is {}".format(gs.best_score_))
print("Test score is {}".format(gs.score(X_test, y_test)))

In [None]:
from xgboost.sklearn import XGBClassifier
import scipy.stats as st

# Baseline: XGBoost with its default hyper-parameters.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
# fit() returns the estimator itself; later cells refer to it as `gs`.
gs = xgb

y_pred = gs.predict(X_test)

# Accuracy on the held-out split.
print("Test score is {}".format(gs.score(X_test,y_test)))

In [None]:
y_pred

In [None]:
df = pd.DataFrame(y_test)

In [None]:
# Put the predictions next to the true labels and give the target column a
# clearer name.
df['predicted'] = y_pred
df.rename(columns={'survived': 'true'}, inplace=True)

In [None]:
df['is_different'] = np.abs(df.predicted - df.true)

In [None]:
mistakes = df.query('is_different==1').index

In [None]:
correct = df.query('is_different==0').index

In [None]:
#X_test.loc[correct].describe()

In [None]:
# X_test became a NumPy array when it was scaled, so it cannot be indexed by
# passenger id. Look the misclassified passengers up in the (unscaled) feature
# frame instead.
train.drop(columns='survived').loc[mistakes].describe()
#len(mistakes)

In [None]:
# X_test is a NumPy array after scaling and has no column names; take the
# feature names from the frame the split was built from (same column order).
feature_names = train.drop(columns='survived').columns
importances = pd.DataFrame({'feature': feature_names})

In [None]:
importances['importance'] = pd.Series(gs.feature_importances_)

In [None]:
importances.sort_values('importance',ascending=False).set_index('feature').plot(kind='bar',figsize=(15,8))

In [None]:
# Predict on the *processed* competition test set. Re-reading the raw CSV would
# hand the model unencoded, unscaled columns, and `df.predicted` holds the
# validation-split predictions, not predictions for the test passengers.
test_features = test.copy()
test_features['fare'] = test_features['fare'].fillna(test_features['fare'].median())
# `scaler` is the scaler that produced the X_train the model was fitted on.
pred = gs.predict(scaler.transform(test_features))
submission = pd.DataFrame({'PassengerId': test.index,
                           'Survived': pred})
submission.to_csv("submission.csv", index=False)

In [None]:
# This script shows you how to make a submission using a few
# useful Python libraries.
# It gets a public leaderboard score of 0.76077.
# Maybe you can tweak it and do better...?

import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the data
# Read from the same `data/` directory as the rest of this notebook (the
# original script pointed at '../input/').
train_df = pd.read_csv('data/train.csv', header=0)
test_df = pd.read_csv('data/test.csv', header=0)

# We'll impute missing values using the median for numeric columns and the most
# common value for string columns.
# This is based on some nice code by 'sveitser' at http://stackoverflow.com/a/25562948
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    '''Column-wise imputer for DataFrames.

    ``fit`` stores one fill value per column in ``self.fill``: the most
    frequent value for object-dtype columns and the median for all other
    columns. ``transform`` fills missing entries with those values.
    '''

    def fit(self, X, y=None):
        def fill_value(column):
            if column.dtype == np.dtype('O'):
                return column.value_counts().index[0]
            return column.median()

        values = [fill_value(X[c]) for c in X]
        self.fill = pd.Series(values, index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

feature_columns_to_use = ['Pclass','Sex','Age','Fare','Parch']
nonnumeric_columns = ['Sex']

# Join the features from train and test together before imputing missing values,
# in case their distribution is slightly different.
# pd.concat replaces DataFrame.append, which was removed from pandas.
big_X = pd.concat([train_df[feature_columns_to_use], test_df[feature_columns_to_use]])
big_X_imputed = DataFrameImputer().fit_transform(big_X)

# XGBoost doesn't (yet) handle categorical features automatically, so we need to change
# them to columns of integer values.
# See http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing for more
# details and options
le = LabelEncoder()
for feature in nonnumeric_columns:
    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])

# Prepare the inputs for the model: the first len(train_df) rows are the
# training rows, the rest are the test rows. `.values` replaces the removed
# `.as_matrix()`.
n_train = train_df.shape[0]
train_X = big_X_imputed.iloc[:n_train].values
test_X = big_X_imputed.iloc[n_train:].values
train_y = train_df['Survived']

# You can experiment with many other options here, using the same .fit() and .predict()
# methods; see http://scikit-learn.org
# This example uses the current build of XGBoost, from https://github.com/dmlc/xgboost
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(train_X, train_y)
predictions = gbm.predict(test_X)

# Kaggle needs the submission to have a certain format;
# see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# for an example of what it's supposed to look like.
submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
                            'Survived': predictions })
submission.to_csv("submission.csv", index=False)