In [1]:
import pandas
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

## Loading datasets

In [2]:
data_folder = "./data/"
train_file = "train.csv"
test_file = "test.csv"

In [3]:
train_data = pandas.read_csv(data_folder+train_file)
test_data = pandas.read_csv(data_folder+test_file)
all_data = pandas.concat([train_data, test_data]) # The entire data: train + test.

In [4]:
all_data['Title'] = all_data['Name']
# Cleaning name and extracting Title
for name_string in all_data['Name']:
    all_data['Title'] = all_data['Name'].str.extract('([A-Za-z]+)\.', expand=True)

# Replacing rare titles with more common ones
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
all_data.replace({'Title': mapping}, inplace=True)
titles = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev']
for title in titles:
    # age_to_impute = all_data.groupby('Title')['Age'].median()[titles.index(title)]
    age_to_impute = all_data.groupby('Title')['Age'].median().iloc[titles.index(title)]
    all_data.loc[(all_data['Age'].isnull()) & (all_data['Title'] == title), 'Age'] = age_to_impute
    
# Substituting Age values in train_data and test_data:
train_data['Age'] = all_data['Age'][:891]
test_data['Age'] = all_data['Age'][891:]

# Dropping Title feature
all_data.drop('Title', axis = 1, inplace = True)

 - **Adding Family_Size**
 
That's just Parch + SibSp.

In [5]:
all_data['Family_Size'] = all_data['Parch'] + all_data['SibSp']

# Substituting Age values in train_data and test_data:
train_data['Family_Size'] = all_data['Family_Size'][:891]
test_data['Family_Size'] = all_data['Family_Size'][891:]

 - **Adding Family_Survival**
 
 This feature is from [S.Xu's kernel](https://www.kaggle.com/shunjiangxu/blood-is-thicker-than-water-friendship-forever), he groups families and people with the same tickets togerher and researches the info. I've cleaned the code a bit but it still does the same, I left it as is. For comments see the original kernel.

In [6]:
all_data['Last_Name'] = all_data['Name'].apply(lambda x: str.split(x, ",")[0])
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].mean())

DEFAULT_SURVIVAL_VALUE = 0.5
all_data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

for grp, grp_df in all_data[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 0

print("Number of passengers with family survival information:", 
      all_data.loc[all_data['Family_Survival']!=0.5].shape[0])

Number of passengers with family survival information: 420


In [7]:
for _, grp_df in all_data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(all_data[all_data['Family_Survival']!=0.5].shape[0]))

# # Family_Survival in train_data and test_data:
train_data['Family_Survival'] = all_data['Family_Survival'][:891]
test_data['Family_Survival'] = all_data['Family_Survival'][891:]

Number of passenger with family/group survival information: 546


In [8]:
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].median())
# Making Bins
all_data['FareBin'] = pandas.qcut(all_data['Fare'], 5)

label = LabelEncoder()
all_data['FareBin_Code'] = label.fit_transform(all_data['FareBin'])

train_data['FareBin_Code'] = all_data['FareBin_Code'][:891]
test_data['FareBin_Code'] = all_data['FareBin_Code'][891:]

train_data.drop(['Fare'], axis=1,inplace=True)
test_data.drop(['Fare'], axis=1,inplace=True)

In [9]:
all_data['AgeBin'] = pandas.qcut(all_data['Age'], 4)

label = LabelEncoder()
all_data['AgeBin_Code'] = label.fit_transform(all_data['AgeBin'])

train_data['AgeBin_Code'] = all_data['AgeBin_Code'][:891]
test_data['AgeBin_Code'] = all_data['AgeBin_Code'][891:]

train_data = train_data.drop(labels=['Age'], axis=1)
test_data = test_data.drop(labels=['Age'], axis=1)

In [10]:
train_data['Sex'].replace(['male','female'],[0,1],inplace=True)
test_data['Sex'].replace(['male','female'],[0,1],inplace=True)

train_data.drop(['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin',
               'Embarked'], axis = 1, inplace = True)
test_data.drop(['Name','PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin',
              'Embarked'], axis = 1, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Sex'].replace(['male','female'],[0,1],inplace=True)
  train_data['Sex'].replace(['male','female'],[0,1],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Sex'].replace(['male','female'],[0,1],inplace=True)
  test_data['Sex'].replace(['male','fem

In [11]:
train_data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Family_Size,Family_Survival,FareBin_Code,AgeBin_Code
0,0,3,0,1,0.5,0,0
1,1,1,1,1,0.5,4,3
2,1,3,1,0,0.5,1,1


# Training

 - **Creating X and y**

In [12]:
xs_train = train_data.drop('Survived', axis=1)
ys_train = train_data['Survived']
xs_test = test_data.copy()

In [13]:
xs_train.columns

Index(['Pclass', 'Sex', 'Family_Size', 'Family_Survival', 'FareBin_Code',
       'AgeBin_Code'],
      dtype='object')

In [14]:
xs_train.dtypes

Pclass               int64
Sex                  int64
Family_Size          int64
Family_Survival    float64
FareBin_Code         int64
AgeBin_Code          int64
dtype: object

 - **Scaling features**

In [15]:
std_scaler = StandardScaler()
xs_train = std_scaler.fit_transform(xs_train)
xs_test = std_scaler.transform(xs_test)

 - **Grid Search CV**
 
 Here I use KNN.

In [16]:
n_neighbors = [6,7,8,9,10,11,12,14,16,18,20,22]
algorithm = ['auto']
weights = ['uniform', 'distance']
leaf_size = list(range(1,50,5))
hyperparams = {'algorithm': algorithm, 'weights': weights, 'leaf_size': leaf_size, 
               'n_neighbors': n_neighbors}
gd=GridSearchCV(estimator = KNeighborsClassifier(), param_grid = hyperparams, verbose=True, 
                cv=10, scoring = "roc_auc")
gd.fit(xs_train, ys_train)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits
0.8790514387573211
KNeighborsClassifier(leaf_size=16, n_neighbors=18)




In case you get a different result here (result may vary), what I got was:

> KNeighborsClassifier(algorithm='auto', leaf_size=26, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=18, p=2, weights='uniform')

This gave 0.884103388207 ROC_AUC score (not accuracy score!). I had a ton of models with roc_auc around 0.93-0.94 but when tested, they mostly showed lower results. Doesn't mean they are worse though.

 - **Using a model found by grid searching**

In [17]:
gd.best_estimator_.fit(xs_train, ys_train)
y_pred = gd.best_estimator_.predict(xs_test)

When I submitted the result, the model I've specified above yielded [0.82775] public score.

- **Using another K**

This guy comes from empirical messing around with amount of neighbors in KNN. It's the same as the above one, but with another n:

In [18]:
knn = KNeighborsClassifier(algorithm='auto', leaf_size=26, metric='minkowski', 
                           metric_params=None, n_jobs=1, n_neighbors=6, p=2, 
                           weights='uniform')
knn.fit(xs_train, ys_train)
y_pred = knn.predict(xs_test)

Being a fan of simple models there's no way I couldn't try playing with n_neighbors lowering it (the lower it is --> the less complex the model is, though too simple model is bad news too).

- **Making submission**

In [19]:
temp = pandas.DataFrame(pandas.read_csv("./data/test.csv")['PassengerId'])
temp['Survived'] = y_pred
temp.to_csv("./submission.csv", index = False)