In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns #graphs
import matplotlib.pyplot as plt
plt.rc("font", size=14)

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:


#---------------------------------------------------------------------------------------------------------------------------

def count_categorical(data,output=True):
    """List the object-dtype columns of ``data`` and optionally print a summary.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    output : bool, default True
        When truthy, print how many object-dtype columns were found and, for
        each of them, the number of distinct labels and the number and
        percentage of missing values.

    Returns
    -------
    list
        Labels of the columns whose dtype equals ``'object'``, in column order.
    """
    # cycle through data.columns and keep the ones stored with 'object' dtype
    categorical = [col for col in data.columns if data[col].dtype == 'object']

    if output:
        # the length of the list is the number of categorical variables
        print('There are {} categorical variables\n'.format(len(categorical)))

        for col in categorical:
            missing = data[col].isnull()
            # unique() keeps the missing marker, so it is counted as a label too
            n_labels = len(data[col].unique())
            # f-string formatting (rather than `col + "..."`) so that
            # non-string column labels do not raise TypeError
            print(f"{col} has {n_labels} distinct labels, with "
                  f"{missing.sum()} missing values representing "
                  f"{round(missing.mean() * 100, 2)}%")

    return categorical
    
#---------------------------------------------------------------------------------------------------------------------------
    
def count_numerical(data,output=True):
    """List the non-object-dtype columns of ``data`` and optionally print a summary.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    output : bool, default True
        When truthy, print how many such columns were found and, for each of
        them, the number and percentage of missing values.

    Returns
    -------
    list
        Labels of every column whose dtype is not ``'object'``, in column
        order. Nothing narrower than "not object" is checked.
    """
    # cycle through data.columns and keep the ones NOT stored with 'object' dtype
    numerical = [col for col in data.columns if data[col].dtype != 'object']

    if output:
        # the length of the list is the number of numerical variables
        print('There are {} numerical variables\n'.format(len(numerical)))

        # missing-value report per column
        for col in numerical:
            missing = data[col].isnull()
            # f-string formatting (rather than `col + "..."`) so that integer
            # column labels, the default for array-built frames, do not raise
            print(f"{col} has {missing.sum()} missing values representing "
                  f"{round(missing.mean() * 100, 2)}%")

    return numerical
    
    
#---------------------------------------------------------------------------------------------------------------------------
    
    
def boxplot_hist_numerical_data(data,numerical):
    """Show, for each column named in ``numerical``, a boxplot next to a histogram.

    One two-panel figure is drawn and shown per column: the boxplot on the
    left, the histogram (x-axis labelled with the column name) on the right.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    numerical : sequence
        Column labels to plot, one figure each.
    """
    for column in numerical:
        # left panel: boxplot of the column
        plt.subplot(1, 2, 1)
        data.boxplot(column=column)

        # right panel: histogram of the same column
        plt.subplot(1, 2, 2)
        histogram_axes = data[column].hist()
        histogram_axes.set_xlabel(column)

        plt.tight_layout()
        plt.show()
        
#---------------------------------------------------------------------------------------------------------------------------


def heatmap(data,title):
    """Draw an annotated correlation heatmap of the numeric columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to correlate. Only numeric and boolean columns are used.
    title : str
        Title placed on the figure.
    """
    # DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, while
    # older versions silently skipped them. Selecting the columns explicitly
    # gives the same result on both.
    numeric_columns = data.select_dtypes(include=['number', 'bool'])
    correlation = numeric_columns.corr()

    plt.figure(figsize=(16,12))
    plt.title(title)
    ax = sns.heatmap(correlation, square=True, annot=True, fmt='.2f', linecolor='white')
    # rotate the tick labels so long column names stay readable
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=30)
    plt.show()
#---------------------------------------------------------------------------------------------------------------------------

def correlation_graphs(data,columns): #takes a bit
    """Show a seaborn pair plot (scatter panels, histogram diagonal) of ``data[columns]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    columns : list
        Labels of the columns to include in the pair plot.
    """
    selected = data[columns]
    sns.pairplot(selected, kind='scatter', diag_kind='hist', palette='Rainbow')
    plt.show()
    
    
#---------------------------------------------------------------------------------------------------------------------------

def absolute_frequency(data,categorical):
    """Print the value counts of every column listed in ``categorical``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    categorical : iterable
        Labels of the columns to report; each gets a dashed header line
        followed by ``value_counts()`` of that column.
    """
    for var in categorical:
        # format() instead of `"---" + var + "---"` so that non-string column
        # labels do not raise TypeError
        print("-------------------------------{}-------------------------------".format(var))
        print(data[var].value_counts())
    
#---------------------------------------------------------------------------------------------------------------------------

def relative_frequency(data,categorical):
    """Print, for every column in ``categorical``, each value's share of all rows.

    The counts come from ``value_counts()`` and are divided by the total number
    of rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    categorical : iterable
        Labels of the columns to report.
    """
    # `np.float` was only an alias of the builtin and was removed in
    # NumPy 1.24, so use the builtin `float` directly.
    n_rows = float(data.shape[0])
    for var in categorical:
        # format() instead of string concatenation so that non-string column
        # labels do not raise TypeError
        print("-------------------------------{}-------------------------------".format(var))
        print(data[var].value_counts() / n_rows)

In [None]:
class DistributionScaler(BaseEstimator, TransformerMixin):   # transformer that rescales, normalizes or standardizes features
    """Scikit-learn style wrapper that picks a scaler/transformer by name.

    Parameters
    ----------
    strategy : str or None, default None
        Which transformation to apply. Accepted names are
        ``"StandardScaler"``, ``"MinMaxScaler"``, ``"MaxAbsScaler"``,
        ``"RobustScaler"``, ``"PowerTransformer_Yeo_Johnson"``,
        ``"PowerTransformer_Box_Cox"``, ``"QuantileTransformer_Normal"``,
        ``"QuantileTransformer_Uniform"`` and ``"Normalizer"``.
        ``None`` means "no scaling": ``transform`` passes the data through.

    Attributes
    ----------
    estimator : object or None
        The underlying transformer fitted by ``fit``; ``None`` before fitting
        and whenever ``strategy`` is ``None``.
    """

    def __init__(self, strategy = None): # no *args or **kargs
        self.strategy = strategy
        self.estimator = None

    def _make_estimator(self):
        """Return a new, unfitted transformer for ``self.strategy`` (``None`` for no scaling)."""
        if self.strategy is None:
            return None
        factories = {
            "StandardScaler": StandardScaler,
            "MinMaxScaler": MinMaxScaler,
            "MaxAbsScaler": MaxAbsScaler,
            "RobustScaler": lambda: RobustScaler(quantile_range=(25, 75)),
            "PowerTransformer_Yeo_Johnson": lambda: PowerTransformer(method='yeo-johnson'),
            "PowerTransformer_Box_Cox": lambda: PowerTransformer(method='box-cox'),
            "QuantileTransformer_Normal": lambda: QuantileTransformer(output_distribution='normal'),
            "QuantileTransformer_Uniform": lambda: QuantileTransformer(output_distribution='uniform'),
            "Normalizer": Normalizer,
        }
        try:
            factory = factories[self.strategy]
        except (KeyError, TypeError):
            # A misspelled name used to disable scaling silently; fail loudly instead.
            raise ValueError(
                "Unknown strategy {!r}; expected None or one of {}".format(
                    self.strategy, sorted(factories))
            ) from None
        return factory()

    def fit(self, X, y=None):
        """Fit the transformer selected by ``strategy`` on ``X``.

        Parameters
        ----------
        X : array-like
            Data handed to the underlying transformer's ``fit``.
        y : ignored
            Accepted so the transformer can be used where ``fit(X, y)`` is called.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``strategy`` is neither ``None`` nor one of the accepted names.
        """
        # Always rebuild, so a refit after changing `strategy` never keeps a stale estimator.
        estimator = self._make_estimator()
        self.estimator = estimator.fit(X) if estimator is not None else None
        return self

    def transform(self, X):
        """Apply the fitted transformer, or pass ``X`` through when there is none.

        Returns
        -------
        The underlying transformer's output when one has been fitted.
        Otherwise ``X`` itself, converted with ``to_numpy()`` if it is a
        pandas DataFrame.
        """
        if self.estimator is not None:
            return self.estimator.transform(X)
        if isinstance(X, pd.DataFrame):
            return X.to_numpy()
        return X
#-----------------------------------------------------------------------------------------------------------   
#-----------------------------------------------------------------------------------------------------------   


class OutlierApproacher(BaseEstimator, TransformerMixin): 
    """Row filter that removes observations containing outlying values.

    ``fit`` learns per-column statistics; ``transform`` drops every row of its
    input that has at least one value outside the learned limits and returns
    the remaining rows as a NumPy array. Only ``X`` is filtered: a target
    vector held separately by the caller is not touched.

    Parameters
    ----------
    strategy : {"ZScore", "IQR"} or None, default None
        ``"ZScore"`` drops rows where any column's absolute z-score (computed
        with the mean and population standard deviation seen in ``fit``) is 3
        or more. ``"IQR"`` drops rows where any column lies more than
        1.5 * IQR below the first or above the third quartile seen in ``fit``.
        Any other value leaves the rows untouched.

    Attributes
    ----------
    q1, q3, iqr : pandas.Series or None
        Per-column quartiles and their difference, set by ``fit`` for "IQR".
    mean_, std_ : pandas.Series
        Per-column mean and standard deviation, set by ``fit`` for "ZScore".
    zscore : pandas.DataFrame or None
        Absolute z-scores of the data passed to ``fit``, set for "ZScore".

    Notes
    -----
    The columns are assumed to be numeric. Missing values compare as "not an
    outlier" in both strategies, so they never cause a row to be dropped.
    """

    # Cut-offs applied by transform.
    _ZSCORE_LIMIT = 3
    _IQR_FACTOR = 1.5

    def __init__(self, strategy = None):#,threshold=3): # add treatment later for removal or imputation, add default/custom threshold = 1.5 for IQR and threshold = 3 for ZScore
        self.strategy = strategy
        #self.threshold = threshold
        self.iqr = None
        self.q3 = None
        self.q1 = None
        self.zscore = None

    @staticmethod
    def _as_frame(X):
        """Return ``X`` unchanged if it is a DataFrame, otherwise wrap it in one."""
        return X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    def _abs_zscore(self, frame):
        """Absolute z-scores of ``frame`` using the statistics learned in ``fit``."""
        return ((frame - self.mean_) / self.std_).abs()

    def _require_fitted(self, *attributes):
        """Raise NotFittedError unless every named attribute has been learned."""
        if any(getattr(self, name, None) is None for name in attributes):
            from sklearn.exceptions import NotFittedError  # only needed on the error path
            raise NotFittedError(
                "OutlierApproacher(strategy={!r}) must be fitted before transform".format(
                    self.strategy)
            )

    def fit(self, X, y=None):
        """Learn the per-column statistics required by ``strategy`` from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Training data; non-DataFrame input is wrapped in a DataFrame.
        y : ignored
            Accepted so the transformer can be used where ``fit(X, y)`` is called.

        Returns
        -------
        self
        """
        if self.strategy == "ZScore":
            frame = self._as_frame(X)
            self.mean_ = frame.mean()
            # ddof=0: population standard deviation
            self.std_ = frame.std(ddof=0)
            self.zscore = self._abs_zscore(frame)
        elif self.strategy == "IQR":
            # Quartiles come from the data being fitted, not from a notebook global.
            frame = self._as_frame(X)
            self.q1 = frame.quantile(0.25)
            self.q3 = frame.quantile(0.75)
            self.iqr = self.q3 - self.q1
        return self

    def transform(self, X):
        """Return the rows of ``X`` that contain no outlier, as a NumPy array.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``strategy`` is "ZScore" or "IQR" and ``fit`` has not learned
            the corresponding statistics yet.
        """
        if self.strategy == "ZScore":
            self._require_fitted("mean_", "std_")
            frame = self._as_frame(X)
            # Scores are computed on the data being transformed, so the mask
            # always matches its rows.
            is_outlier = self._abs_zscore(frame) >= self._ZSCORE_LIMIT
            return frame[~is_outlier.any(axis=1)].to_numpy()
        if self.strategy == "IQR":
            self._require_fitted("q1", "q3", "iqr")
            frame = self._as_frame(X)
            lower = self.q1 - self._IQR_FACTOR * self.iqr
            upper = self.q3 + self._IQR_FACTOR * self.iqr
            is_outlier = (frame < lower) | (frame > upper)
            return frame[~is_outlier.any(axis=1)].to_numpy()
        # No strategy selected: pass the data through as an array.
        return X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)

In [None]:
# Load the Titanic training and test splits from the Kaggle input directory.
data_train, data_test = map(
    pd.read_csv,
    ('../input/titanic/train.csv', '../input/titanic/test.csv'),
)

In [None]:
# Size of the training frame as (rows, columns).
data_train.shape

In [None]:
# Column labels available in the training frame.
data_train.columns

In [None]:
# Preview the first rows of the training frame.
data_train.head()

In [None]:
# Per-column dtype and non-null count summary.
data_train.info()

In [None]:
# Collect the non-object columns; with the default output=True this also prints their missing-value report.
numerical = count_numerical(data_train)

In [None]:
# Collect the object-dtype columns; with the default output=True this also prints label and missing-value counts.
categorical = count_categorical(data_train)

In [None]:
# Print the value counts of every categorical column.
absolute_frequency(data_train,categorical)

In [None]:
# Print the same counts divided by the total number of rows.
relative_frequency(data_train,categorical)

In [None]:
# One boxplot + histogram figure per column listed in `numerical`.
boxplot_hist_numerical_data(data_train,numerical)

In [None]:
# Correlation heatmap of the training frame. The title now names the Titanic
# data loaded above instead of the leftover 'Wines'.
heatmap(data_train,'Titanic')

In [None]:
# Build the reduced frame in a single step from the raw training data, so the
# cell can be re-run safely: drop the "Name" column, add "Family Ties" as
# SibSp + Parch, then drop the two source columns.
data_train_cut = (
    data_train
    .drop(columns=["Name"])
    .assign(**{"Family Ties": lambda df: df["SibSp"] + df["Parch"]})
    .drop(columns=["SibSp", "Parch"])
)

In [None]:
# Raise the row display limit, then show the training frame ordered by ticket.
pd.set_option("display.max_rows", 4000)
data_train.sort_values(by="Ticket")

In [None]:
# Raise the row display limit, then show the training frame ordered by name.
pd.set_option("display.max_rows", 4000)
data_train.sort_values(by="Name")

In [None]:
# Raise the row display limit, then show the training frame with the highest fares first.
pd.set_option("display.max_rows", 4000)
data_train.sort_values(by="Fare", ascending=False)

In [None]:
# Raise the row display limit, then show the training frame ordered by cabin.
pd.set_option("display.max_rows", 4000)
data_train.sort_values(by="Cabin")

In [None]:
# Count how often each fare occurs within every cabin.
fare_by_cabin = data_train.groupby(["Cabin"])["Fare"]
fare_by_cabin.value_counts()