In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Default directory that fetch_titanic_data reads CSV files from.
TITANIC_PATH = '/kaggle/input/titanic'


def fetch_titanic_data(file, titanic_data_path=TITANIC_PATH):
    """Load one CSV file from the Titanic data directory into a DataFrame.

    Parameters
    ----------
    file : str
        Name of the CSV file inside ``titanic_data_path``.
    titanic_data_path : str, optional
        Directory containing the file; defaults to ``TITANIC_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(titanic_data_path, file))


In [None]:
# Read the train and test CSV files from the default Titanic data directory.
train_data, test_data = (
    fetch_titanic_data(csv_name) for csv_name in ("train.csv", "test.csv")
)

### Visualizing data

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
train_data.info()

The `Age`, `Embarked`, and `Cabin` columns contain null values. `Cabin` is ignored because it consists largely of nulls.

In [None]:
# Descriptive statistics for the training data.
train_data.describe()

In [None]:
# Frequency of each value in the target and in three discrete feature columns.
tuple(
    train_data[column].value_counts()
    for column in ("Survived", "Pclass", "Sex", "Embarked")
)

### Creating labels

In [None]:
# Separate the target column from the training features; the test frame is used as-is.
train_y = train_data['Survived']
train_X = train_data.drop(columns=['Survived'])
test_X = test_data

### check for cardinality and unique categories in test data

In [None]:
# Categorical features are the columns pandas stored with the generic object dtype.
s = (train_X.dtypes == 'object')
categorical_cols = list(s[s].index)

# Columns whose set of values is identical in train and test, so an encoder
# fitted on train would never meet an unseen category in test.
good_label_cols = [col for col in categorical_cols if
                   set(train_X[col]) == set(test_X[col])]

# Remaining categorical columns: train and test disagree on the set of values.
# Built by filtering categorical_cols rather than by set difference, so the
# result follows the DataFrame column order and is stable across kernel restarts.
bad_label_cols = [col for col in categorical_cols if col not in good_label_cols]

good_label_cols, bad_label_cols

For most of the categorical columns, the set of values in the train data differs from the set in the test data, so either split may contain categories that the other has never seen.

In [None]:
# Number of distinct values in each categorical column, keyed by column name.
object_nunique = [train_X[col].nunique() for col in categorical_cols]
d = dict(zip(categorical_cols, object_nunique))

# Show the counts in ascending order. The sorted list must be the cell's last
# expression to be displayed; previously it was computed and discarded, and the
# unsorted dict was shown instead.
sorted(d.items(), key=lambda item: item[1])

In [None]:
# Categorical columns with fewer than 15 distinct values: the one-hot encoding candidates.
low_cardinality_cols = list(
    filter(lambda col: train_X[col].nunique() < 15, categorical_cols)
)
low_cardinality_cols

In [None]:
# Categorical columns that are not low-cardinality. Filtering categorical_cols
# (instead of taking a set difference) keeps the DataFrame column order, so the
# displayed list is the same on every run.
high_cardinality_cols = [col for col in categorical_cols
                         if col not in low_cardinality_cols]

high_cardinality_cols

In [None]:
# Collect the int64/float64 feature columns, then exclude 'PassengerId'.
numerical_cols = [
    cname for cname, col_dtype in train_X.dtypes.items()
    if col_dtype in ['int64', 'float64']
]
numerical_cols.remove('PassengerId')
numerical_cols

In [None]:
# Feature columns kept for modelling: all categorical columns followed by the numeric ones.
cols = [*categorical_cols, *numerical_cols]

### null/missing values handling and pipeline creation 

#### numeric pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric preprocessing: replace missing values with the column median.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

#### category pipeline

In [None]:
# Categorical preprocessing: fill gaps with the most frequent value, then one-hot encode.
# - `fill_value` is only read when strategy='constant', so the former
#   fill_value='missing' had no effect; it is dropped to avoid implying otherwise.
# - OneHotEncoder's `sparse` argument was renamed `sparse_output` in
#   scikit-learn 1.2 and later removed. Try the new name first and fall back to
#   the old one so the cell runs on both older and newer releases.
try:
    _onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    _onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

cat_pipeline = Pipeline(steps = [
                       ("imputer", SimpleImputer(strategy='most_frequent')),
                       ("onehot", _onehot),])

Join the categorical and numerical pipelines into a single preprocessor.

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its own pipeline.
# Only the low-cardinality categorical columns are one-hot encoded, matching the
# selection made when low_cardinality_cols / high_cardinality_cols were built.
# Previously every categorical column was passed here, so the high-cardinality
# columns were encoded as well instead of being dropped.
# Columns not listed in any transformer are discarded, because ColumnTransformer
# defaults to remainder='drop'; extra columns in the input frame are harmless.
preprocessor = ColumnTransformer([
    ("num", num_pipeline, numerical_cols),
    ("cat", cat_pipeline, low_cardinality_cols),
])

In [None]:
# Restrict the training features to the selected columns.
train_X_prepared = train_X.loc[:, cols]

### creating model 

In [None]:
from sklearn.ensemble import RandomForestClassifier


from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Fixed seed: the forest picks a random feature subset at every split
# (max_features='sqrt'), so without a seed each run trains a different model
# and the submission cannot be reproduced.
RANDOM_STATE = 42

forest_clf = RandomForestClassifier(
    n_estimators=1200,
    max_features='sqrt',
    max_depth=60,
    bootstrap=False,
    random_state=RANDOM_STATE,
)

# Full model: preprocessing followed by the classifier, fitted and applied as one unit.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', forest_clf)])

In [None]:
# Fit the preprocessing steps and the classifier on the training data.
pipeline.fit(X=train_X_prepared, y=train_y)

In [None]:
# Predict labels for the test data with the fitted pipeline.
predictions = pipeline.predict(X=test_X)

In [None]:
# Pair each test PassengerId with its predicted label and write the submission file.
output = test_X[["PassengerId"]].assign(Survived=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
output