In [305]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### **Data visualization**

Import all the necessary libraries and increase the default plot size:

In [306]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Global plot styling: white grid, larger fonts and a wide default figure size.
sns.set_theme(
    style="whitegrid",
    font_scale=1.5,
    rc={'figure.figsize': (15, 8)},
)

In [307]:
# Load the train and test splits of the Titanic dataset from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv('../input/titanic/train.csv'),
    pd.read_csv('../input/titanic/test.csv'),
)

Let's look at what the Titanic dataset contains:

In [308]:
# Preview the first rows of the training frame.
train_data.head()

In [309]:
# Summary statistics of the training frame.
train_data.describe()

Check missing values in train and test datasets:

In [310]:
# Number of missing values per column in the train set.
train_data.isna().sum()

In [311]:
# Number of missing values per column in the test set.
test_data.isna().sum()

As we can see, the Age, Cabin and Embarked columns in the train dataset and the Age, Cabin and Fare columns in the test dataset have missing values.


In [312]:
# Plot the distribution of every non-object column plus three categorical ones,
# one histogram per subplot.
columns_to_plot = train_data.select_dtypes(exclude='object').columns.tolist() + ['Cabin', 'Embarked', 'Sex']
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(15, 30))

# Pair each subplot with one column. zip() stops at the shorter sequence, so a
# mismatch between the grid size and the number of columns cannot raise IndexError.
for axis, column in zip(axes.flat, columns_to_plot):
    # Pass the frame and the column name explicitly: the previous call handed
    # only the column-name string to histplot, so no column of train_data was
    # ever plotted.
    sns.histplot(data=train_data, x=column, ax=axis, kde=False)
    axis.set_title(column)

fig.tight_layout()

**Let's display the correlation matrix:**

In [313]:
# Correlate numeric columns only: at this point train_data still holds text
# columns (Name, Sex, Ticket, ...), which cannot be correlated and make
# DataFrame.corr() raise on recent pandas versions.
corr_matrix = train_data.select_dtypes(include='number').corr()
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(corr_matrix, annot=True);

# Data processing

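Let's engineer a few extra features (ticket length, family size, an "is alone" flag and title indicators extracted from the name) for both the train and the test datasets: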

In [314]:
# Keep both frames in one list so every transformation is applied to train and test alike.
data = [train_data, test_data]

# Indicator column name -> substring looked up in the passenger name.
title_markers = {'Mr': 'Mr.', 'Mrs': 'Mrs.', 'Miss': 'Miss.', 'Master': 'Master'}

for frame in data:
    # Length of the ticket string.
    frame['TicketLength'] = frame['Ticket'].map(lambda ticket: len(str(ticket)))
    # Number of relatives aboard (siblings/spouses + parents/children).
    frame['FamilySize'] = frame['SibSp'] + frame['Parch']
    # 1 when the passenger has no relatives aboard, 0 otherwise.
    frame['IsAlone'] = (~frame['FamilySize'].gt(0)).astype('int64')
    # One 0/1 indicator per title found in the name.
    for column, marker in title_markers.items():
        frame[column] = frame['Name'].map(lambda name, marker=marker: int(marker in name))
    # The raw text columns are no longer needed once the features are extracted.
    frame.drop(columns=['Name', 'Ticket'], inplace=True)

data[0].head()

In [315]:
# Encode the 'Sex' column as integers.
# The encoder is fitted on the train set only and then reused for the test set,
# so both sets are guaranteed to share the same label -> integer mapping
# (refitting on the test set could silently produce a different mapping).
sex_encoder = LabelEncoder()
data[0]['Sex'] = sex_encoder.fit_transform(data[0]['Sex'])
data[1]['Sex'] = sex_encoder.transform(data[1]['Sex'])
data[0].head()

#### **FILLING MISSING VALUES**

First, let's fill the *'Embarked'* column of the train dataset (now `data[0]`) with its mode and the *'Fare'* column of the test dataset (now `data[1]`) with its mean.

In [316]:
# Fill the missing values by assigning the result back to the column.
# Calling fillna(..., inplace=True) on a column accessed from the frame is
# chained assignment: it emits a FutureWarning on pandas 2.x and has no effect
# on the frame once copy-on-write is enabled.
data[0]['Embarked'] = data[0]['Embarked'].fillna(data[0]['Embarked'].mode().iloc[0])
data[1]['Fare'] = data[1]['Fare'].fillna(data[1]['Fare'].mean())

# One-hot encode the port of embarkation in both frames.
data[0] = pd.get_dummies(data[0], columns=['Embarked'])
data[1] = pd.get_dummies(data[1], columns=['Embarked'])

data[0].head()

# Imputing NAN values

In [317]:
# Remove the columns that are not used as model features from both frames.
unused_columns = ['Cabin', 'PassengerId']
for frame in data:
    frame.drop(columns=unused_columns, inplace=True)

In [318]:
# Preview the train frame after dropping the unused columns.
data[0].head()

In [319]:
# Impute the missing 'Age' values with a 3-nearest-neighbours imputer.
# - 'Survived' is excluded from the imputer input so the target does not leak
#   into a feature.
# - The imputer is fitted on the train set only and reused for the test set.
# - The 'Age' column is located by name rather than by a hard-coded position.
feature_cols = data[0].columns.drop('Survived')
age_idx = feature_cols.get_loc('Age')

knn_imput = KNNImputer(n_neighbors=3)
data[0]['Age'] = knn_imput.fit_transform(data[0][feature_cols])[:, age_idx]
# Selecting by feature_cols also puts the test columns in the same order as train.
data[1]['Age'] = knn_imput.transform(data[1][feature_cols])[:, age_idx]

In [320]:
# Separate the features from the target. X_test is a copy so that scaling does
# not modify data[1] behind the reader's back.
X_train, y_train = data[0].drop('Survived', axis=1), data[0].Survived
X_test = data[1].copy()

# Standardise the continuous features. The scaler is fitted on the train set
# only and then applied unchanged to the test set; refitting it on the test set
# would put the two sets on different scales.
scaler = StandardScaler()
features = ['Age', 'Fare', 'TicketLength']
X_train[features] = scaler.fit_transform(X_train[features])
X_test[features] = scaler.transform(X_test[features])

In [321]:
# Hold out 20% of the training data for validation.
# random_state makes the split (and therefore every downstream number)
# reproducible; stratify keeps the share of survivors equal in both parts.
# NOTE: this cell rebinds X_train / y_train, so re-running it on its own
# shrinks the training set again - re-run the previous cell first.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, stratify=y_train
)

In [358]:
import lightgbm as lgbm
import warnings
warnings.filterwarnings('ignore')

# Wrap the train / validation splits in LightGBM datasets. The validation set
# references the train set so that both share the same feature binning.
train_set = lgbm.Dataset(X_train, y_train)
valid_set = lgbm.Dataset(X_valid, y_valid, reference=train_set)

params = {
        'boosting_type':'gbdt',
        # 'Survived' is a 0/1 label, so train a binary classifier (the model
        # then predicts probabilities in [0, 1]) instead of a regressor.
        'objective': 'binary',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        # NOTE(review): 'subsample' is an alias of 'bagging_fraction', so these
        # two entries conflict - decide which value is intended and drop the other.
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        # Log-loss matches the binary objective (RMSE belonged to the regression setup).
        'metric': 'binary_logloss',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        # NOTE(review): this also treats the zeros of the 0/1 indicator
        # features as missing values - confirm that this is intended.
        'zero_as_missing': True,
        'seed':0,
    }

# Early stopping and periodic logging are passed as callbacks: the
# early_stopping_rounds / verbose_eval keyword arguments were removed from
# lgbm.train() in LightGBM 4.0.
modelL = lgbm.train(
    params,
    train_set=train_set,
    num_boost_round=1000,
    valid_sets=[valid_set],
    callbacks=[
        lgbm.early_stopping(stopping_rounds=50),
        lgbm.log_evaluation(period=10),
    ],
)



In [355]:
# Turn the model scores into hard 0/1 labels with a 0.5 threshold.
# Thresholding (rather than rounding) guarantees that only 0 or 1 is produced,
# even if a score falls outside the [0, 1] range.
valid_pred = (modelL.predict(X_valid) > 0.5).astype(int)
# Report the validation accuracy explicitly: a bare expression that is not the
# last line of a cell is evaluated and silently discarded.
print(f'Validation accuracy: {accuracy_score(y_valid, valid_pred):.4f}')

test_pred = (modelL.predict(X_test) > 0.5).astype(int)

# Build the submission frame. The ids are taken from the loaded test file
# instead of a hard-coded range, so they always match the predicted rows.
pred_res = pd.DataFrame({
    'PassengerId': test_data['PassengerId'].to_numpy(),
    'Survived': test_pred,
})
pred_res.to_csv('res.csv', index=False)


In [356]:
# Display the submission frame that was written to res.csv.
pred_res