In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and 3.8 removed the old names, so use whichever spelling this install ships.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)

import seaborn as sns

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **LOAD AND CHECK DATA**

In [None]:
# Read the Titanic train and test CSVs from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")
# Keep a separate reference to the test set's PassengerId column
# (presumably needed later to build a submission file — confirm).
test_PassengerId= test_df["PassengerId"]

In [None]:
train_df.columns

In [None]:
train_df.head()

In [None]:
train_df.describe()

# **VARIABLE DESCRIPTION**

1. PassengerId : ID NUMBER OF EACH PASSENGER
1. Survived : PASSENGER SURVIVED (1) OR DIED (0)
1. Pclass : PASSENGER CLASS
1. Name : NAME OF PASSENGER
1. Sex : GENDER OF PASSENGER
1. Age : AGE OF PASSENGER
1. SibSp : NUMBER OF SIBLINGS / SPOUSES
1. Parch : NUMBER OF PARENTS / CHILDREN
1. Ticket : TICKET NUMBER
1. Fare : AMOUNT OF MONEY SPENT ON TICKET
1. Cabin : CABIN CATEGORY
1. Embarked : PORT WHERE PASSENGER EMBARKED (C = CHERBOURG , Q = QUEENSTOWN , S = SOUTHAMPTON)

In [None]:
train_df.info()

> # Univariate Variable Analysis
* Categorical Variable : Survived, Sex, Pclass, Embarked, Cabin, Name, Ticket, SibSp and Parch
* Numerical Variable : PassengerId, Fare, Age

In [None]:
def bar_plot(variable):
    """
    Draw a bar chart of how often each value of a ``train_df`` column occurs.

    Parameters
    ----------
    variable : str
        Column name in the notebook-global ``train_df`` (e.g. "Sex").

    Side effects
    ------------
    Shows a 9x3 inch figure and prints the value counts underneath it.
    """
    counts = train_df[variable].value_counts()
    categories = counts.index

    plt.figure(figsize=(9, 3))
    plt.bar(categories, counts)
    # Force one tick per category, labelled with the category value itself.
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()

    # Echo the exact counts so the numbers behind the bars are visible.
    print("{} : \n{}".format(variable, counts))

In [None]:
# Categorical columns to summarise: one bar chart plus printed counts per column.
category1 =["Survived","Sex","Pclass","Embarked","SibSp","Parch"]
for c in category1:
    bar_plot(c)

In [None]:
# These columns are only printed as value counts, not plotted
# (presumably because they have too many distinct values for a readable bar chart).
category2 = ["Cabin","Name","Ticket"]
for c in category2:
    print("{} \n".format(train_df[c].value_counts()))

> # Numerical Variable

In [None]:
def plot_hist(variable):
    """
    Plot a 10-bin histogram of a numeric ``train_df`` column.

    Parameters
    ----------
    variable : str
        Name of a numeric column in the notebook-global ``train_df``
        (e.g. "Age").

    Notes
    -----
    Missing values are dropped before plotting. Columns such as "Age"
    contain NaN, and how NaN is treated when the bin range is auto-detected
    depends on the installed numpy/matplotlib versions, so the histogram is
    built from finite observations only.
    """
    # Only observed values take part in the binning.
    values = train_df[variable].dropna()

    plt.figure(figsize =(9,3))
    plt.hist(values,bins=10)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title("{} distribution with hist".format(variable))
    plt.show()

In [None]:
# Numeric columns to inspect: one histogram per column via plot_hist.
numericVar = ["Fare","Age","PassengerId"]
for c in numericVar:
    plot_hist(c)

# Basic Data Analysis

* Pclass - Survived 
* Sex - Survived
* SibSp - Survived
* Parch - Survived

In [None]:
# Mean of Survived (0/1) per passenger class, highest first
(
    train_df.groupby("Pclass", as_index=False)["Survived"]
    .mean()
    .sort_values(by="Survived", ascending=False)
)

In [None]:
# Mean of Survived (0/1) per sex, highest first
(
    train_df.groupby("Sex", as_index=False)["Survived"]
    .mean()
    .sort_values(by="Survived", ascending=False)
)

In [None]:
# Mean of Survived (0/1) per number of siblings/spouses aboard, highest first
(
    train_df.groupby("SibSp", as_index=False)["Survived"]
    .mean()
    .sort_values(by="Survived", ascending=False)
)

In [None]:
# Mean of Survived (0/1) per number of parents/children aboard, highest first
(
    train_df.groupby("Parch", as_index=False)["Survived"]
    .mean()
    .sort_values(by="Survived", ascending=False)
)

# **Outlier Detection**

In [None]:
def detect_outliers(df,features):
    """
    Return the index labels of rows that are IQR outliers in more than two columns.

    For each column in ``features`` a value counts as an outlier when it lies
    below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``. Quartiles are
    computed over the non-missing values only; rows whose value is NaN are
    never flagged for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to inspect.
    features : iterable of str
        Names of numeric columns in ``df``.

    Returns
    -------
    list
        Index labels flagged in more than two of the inspected columns,
        in the order they were first encountered.
    """
    outlier_counts = Counter()

    for c in features:
        # nanpercentile instead of percentile: a single NaN in the column
        # would otherwise make both quartiles NaN, so every comparison below
        # is False and the column silently contributes no outliers.
        Q1, Q3 = np.nanpercentile(df[c], [25, 75])

        # Distance beyond the quartiles at which a value counts as an outlier.
        outlier_step = (Q3 - Q1) * 1.5

        is_outlier = (df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)

        # Tally one hit per flagged row for this column.
        outlier_counts.update(df[is_outlier].index)

    # Keep only rows flagged in more than two columns.
    return [i for i, v in outlier_counts.items() if v > 2]

In [None]:
train_df.loc[detect_outliers(train_df,["Age","SibSp","Parch","Fare"])]

In [None]:
#drop outliers 
#train_df = train_df.drop(detect_outliers(train_df,["Age","SibSp","Parch","Fare"]),axis = 0).reset_index(drop = True)

# **Missing Value**

* Find Missing Value
* Fill Missing Value

In [None]:
# Remember the number of training rows before the test rows are appended.
train_df_len = len(train_df)
# Stack test_df under train_df and renumber the index from 0.
# NOTE: from here on train_df holds train + test rows, and this cell is not
# idempotent: re-running it appends test_df again and overwrites
# train_df_len with the combined length. Restart and run from the top instead.
train_df = pd.concat([train_df,test_df],axis = 0).reset_index(drop= True)

In [None]:
train_df.head()

> # Find Missing Value

In [None]:
train_df.columns[train_df.isnull().any()]

In [None]:
train_df.isnull().sum()

> # Fill Missing Value

* Embarked has 2 missing values
* Fare has only 1 missing value

In [None]:
train_df[train_df["Embarked"].isnull()]

In [None]:
# Box plot of Fare grouped by Embarked
# (presumably used to choose a fill value for the missing Embarked entries — confirm).
train_df.boxplot(column="Fare",by ="Embarked")
plt.show()

In [None]:
# Fill every missing Embarked value with "C".
# NOTE(review): the choice of "C" is presumably based on the Fare-by-Embarked box plot — confirm.
train_df["Embarked"]=train_df["Embarked"].fillna("C")
# Re-run the null filter; it should now display an empty frame.
train_df[train_df["Embarked"].isnull()]

In [None]:
train_df[train_df["Fare"].isnull()]

In [None]:
np.mean(train_df[train_df["Pclass"]==3]["Fare"])

In [None]:
# Fill missing Fare values with the mean Fare of the Pclass == 3 rows.
# NOTE(review): train_df already contains the test rows at this point, so the
# mean is computed over train and test together.
train_df["Fare"] =train_df["Fare"].fillna(np.mean(train_df[train_df["Pclass"]==3]["Fare"]))

In [None]:
train_df[train_df["Fare"].isnull()]