# Importing Libraries and Loading Data

In [2]:
# from preamble import *
%matplotlib inline

import numpy as np 
import scipy as sp 
import pandas as pd
import matplotlib as mpl
import matplotlib.cm as cm 
import matplotlib.pyplot as plt
 
#from pandas.tools.plotting import scatter_matrix
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set(style="whitegrid")
import warnings
warnings.filterwarnings('ignore')
import string
import math
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import sklearn


from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


#importing from sklearn
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve,GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as sm
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#import xgboost as xgb
from sklearn.metrics import roc_curve, auc
import scikitplot as skplt #conda install -c conda-forge scikit-plot
from sklearn.metrics import accuracy_score 
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

In [None]:
train = pd.read_csv('data/titanic_train.csv')
test = pd.read_csv("data/titanic_test.csv")
test_labels = pd.read_csv("data/titanic_test_labels.csv")



median_fare = test.groupby(['Pclass', 'Parch']).Fare.median()[3][0]
# Filling the missing value in Fare with the median Fare of 3rd class passenger who has Parch 0.
test['Fare'] = test['Fare'].fillna(median_fare)
train['Embarked'] = train['Embarked'].fillna('S')
train['Age']  = train.groupby(['Pclass', 'SibSp'])['Age'].apply(lambda x: x.fillna(x.median()))

# remember to fill in here, this one really very special case pertaining to this project

train['Age'] = train.Age.fillna(11)
test['Age'] = test.groupby(['Pclass', 'SibSp'])['Age'].apply(lambda x: x.fillna(x.median()))

train['Cabin'] = train['Cabin'].fillna('M').astype(str).apply(lambda cabin: cabin[0])
idx = train[train['Cabin'] == 'T'].index
train.loc[idx, 'Cabin'] = 'A'

test['Cabin'] = test['Cabin'].fillna('M').astype(str).apply(lambda cabin: cabin[0])

# Create function that take name and separates it into title, family name and deletes all puntuation from name column:
def name_sep(data):
    families=[]
    titles = []
    new_name = []
    #for each row in dataset:
    for i in range(len(data)):
        name = data.iloc[i]
        # extract name inside brakets into name_bracket:
        if '(' in name:
            name_no_bracket = name.split('(')[0] 
        else:
            name_no_bracket = name
            
        family = name_no_bracket.split(",")[0]
        title = name_no_bracket.split(",")[1].strip().split(" ")[0]
        
        #remove punctuations accept brackets:
        for c in string.punctuation:
            name = name.replace(c,"").strip()
            family = family.replace(c,"").strip()
            title = title.replace(c,"").strip()
            
        families.append(family)
        titles.append(title)
        new_name.append(name)
            
    return families, titles, new_name 


train['Surname'], train['Title'], train['Newname']  = name_sep(train.Name)
test['Surname'], test['Title'], test['Newname'] = name_sep(test.Name)
train['Title'] = train['Title'].replace(['Ms', 'Mlle'],'Miss')
train['Title'] = train['Title'].replace(['Mme'],'Mrs')
train['Title'] = train['Title'].replace(['Dr','Rev','the','Jonkheer','Lady','Sir', 'Don'],'Nobles')
train['Title'] = train['Title'].replace(['Major','Col', 'Capt'],'Navy')
train.Title.value_counts()



test['Title'] = test['Title'].replace(['Ms','Dona'],'Miss')
test['Title'] = test['Title'].replace(['Dr','Rev'],'Nobles')
test['Title'] = test['Title'].replace(['Col'],'Navy')
test.Title.value_counts()



train_categorical_features = ['Pclass', 'Sex','Title','Cabin', 'Embarked']

# No need to use sklearn's encoders
# pandas has a pandas.get_dummies() function that takes in a series
#     and returns a HOT encoded dataframe of that series
#     use the add_prefix() method of dataframe to add the feature name in front of the category name
#     then join the dataframe sideways (similar to pd.concat([train, dummies], axis=1))
for feature in train_categorical_features:
    dummies = pd.get_dummies(train[feature]).add_prefix(feature + '_')
    train = train.join(dummies)
    
    
    
test_categorical_features = ['Pclass', 'Sex','Title', 'Cabin', 'Embarked']

# No need to use sklearn's encoders
# pandas has a pandas.get_dummies() function that takes in a series
#     and returns a HOT encoded dataframe of that series
#     use the add_prefix() method of dataframe to add the feature name in front of the category name
#     then join the dataframe sideways (similar to pd.concat([train, dummies], axis=1))
for feature in test_categorical_features:
    dummies = pd.get_dummies(test[feature]).add_prefix(feature + '_')
    test = test.join(dummies)
    
    
    
drop_column = ['Pclass','Name','Sex','Cabin', 'Embarked','Surname','Title','Newname', 'Ticket', 'PassengerId']
train.drop(drop_column, axis=1, inplace = True)

drop_column = ['Pclass','Name','Sex','Cabin', 'Embarked','Surname','Title','Newname', 'Ticket', 'PassengerId']
test.drop(drop_column, axis=1, inplace = True)

# Cross Validation


Before we start reading, do read this [link on why in practice, researchers like to first split the given data into train-validation-test set instead of just performing cross validation via train-test](https://stats.stackexchange.com/questions/152907/how-do-you-use-the-test-dataset-after-cross-validation)

## Introduction to the idea of Cross Validation with Train Test Split

[Scikt learn Cross Validation Document](https://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics)


Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict anything useful on yet-unseen data. This situation is called **overfitting**. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set **X_test, y_test**. Note that the word “experiment” is not intended to denote academic use only, because even in commercial settings machine learning usually starts out experimentally. Here is a flowchart of typical cross validation workflow in model training. The best parameters can be determined by **[grid search techniques](https://scikit-learn.org/stable/modules/grid_search.html).**

<img src="files/images/cv.png" width="400">

Before performing out modelling to our test data, we can split the training data in two using **`train_test_split`** for training and validating your model. Then, when you have a good performance on the validation part of the training data and are happy with your model, you can apply the model to the test data and submit your predicted survival values for the test. For beginners who do not know what this step is, I will provide a brief yet intuitive explanation. But do try and fiddle around with the 4 new variables I created, namely, x_train, y_train, x_test, y_test. 


So the idea is we split our training data **X** into 4 datasets, x_train means the training set without the outcome values (no survived column), x_train will serve as the training set. Now y_train set is the set of survived values corresponding to the x_train set. Similarly, we have x_test which is also extracted from the training set (PAY SPECIAL ATTENTION THAT x_test IS NOT REFERRING TO THE TEST SET given to us), but serving the purpose as test set here. And y_test set is the set of survived values corresponding to the x_test set.

        x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                            random_state=0, shuffle = True)

Now `train_size = 0.8`, `test_size = 0.2` just means we split the data set into 80 percent of training data, and 20 percent test data.



`Random_state` is basically used for reproducing your problem the same every time it is run. If you do not use a random state in **`train_test_split`**, every time you make the split you might get a different set of train and test data points and will not help you in debugging in case you get an issue. Note, it does not matter which integer value you use for random state, as long as you set it to one integer value and keep to it throughout.


We can then build a model, say **logistic regression** and call **`fit`** on the our training set x_train (which is 80% of our X) and evaluate this fitted set of parameters on our x_test. 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#1 In this tutorial, we will used the titanic data set.
X = train.drop("Survived", axis=1)
y = train["Survived"]

#2 split data and labels into a training and a test set. Note the parameters we can put inside, we will explain
#  the argument shuffle later on in the chapter.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0, shuffle = True)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

#3 instantiate a model logistic regression and fit it to the training set, note you can pass in many tuning parameters
#  in the LogisticRegression() but for now we leave it in the basic form
logreg = LogisticRegression().fit(x_train, y_train)

#4 evaluate the model on the test set
print("Test set score: {:.8f}".format(logreg.score(x_test, y_test)))

Remember, the reason we split our data into training and test sets is that we are interested in measuring how well our model generalizes to new, previously unseen data. We are not interested in how well our model fit the training set, but rather in how well it can make predictions for data that was not observed during training. So we got a decent score of 82% on the supposedly test set.

When evaluating different settings (“hyperparameters”) for estimators, such as the ridge setting that must be manually set for a Logistic Regression, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.

However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.

A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called **k-fold CV**, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:

## Cross Validation: K-fold Cross Validation

The general procedure for **K-fold Cross Validation** is as follows.

1. Shuffle the dataset X randomly - provided you have **`shuffle = True`** in your parameter. It should be on by default.


2. Split the dataset into **k** groups of approximately equal size depending on the **k** value and the size of the dataset (if k = 10, but X.shape = 999, then you cannot divide equally); **k** is a number you can choose, convention is **k** = 5 or 10.


3. A sequence of **models** will be trained **k** number of times. Say **k = 5**, then for each **unique group**, the first model is trained using the first fold as the test set, and the remaining folds (2–5) are used as the training set. The model is built using the data in folds 2–5, and then the accuracy is evaluated on fold 1. Then another model is built, this time using fold 2 as the test set and the data in folds 1, 3, 4, and 5 as the training set. This process is repeated using folds 3, 4, and 5 as test sets. For each of these five splits of the data into training and test sets, we compute the accuracy (or the desired performance metrics).  In the end, we have collected five accuracy values. The process is illustrated in the Figures below.

In [None]:
# Usually, the first fifth of the data is the first fold, the second fifth of the data is the second fold, and so on.
mglearn.plots.plot_cross_validation()

<img src="files/images/cv2.png" width="400">

## Computing Cross Validated Performance Metrics

Cross-validation is implemented in scikit-learn using the **`cross_val_score`** function from the `model_selection` module. The parameters of the **`cross_val_score`** function are the **model** we want to evaluate, the **training data X**, and the **ground-truth labels y**. Let’s evaluate LogisticRegression on the titanic dataset:

We will be using the **`sklearn.model_selection.cross_val_score()`** here and the [sklearn website documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html#sklearn.model_selection.cross_val_score) is here.

**`Default class: sklearn.model_selection.cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=nan)`** 

**Returns:** Array of scores of the estimator for each run of the cross validation.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

#1 Default: class sklearn.model_selection.KFold(n_splits=5, shuffle=False, random_state=None)
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

#2 normalize it by using our StandardScaler(), will mention more on this later.
X = StandardScaler().fit_transform(X)

#3 instantiate a model logistic regression and fit it to the training set.
logreg = LogisticRegression()


#4 Here, cross_val_score performed 10-fold cross-validation and therefore returns 10 scores, the scoring
#  system is based on the accuracy metric.

print("Accuracy Scores: " + format(cross_val_score(logreg,X,y,cv=k_fold,scoring="accuracy")))
print(" ") 

#5 A common way to summarize the cross-validation accuracy is to compute the mean of the 10 folds.

print("Mean Accuracy Score: ", cross_val_score(logreg,X,y,cv=k_fold,scoring="accuracy").mean())
print(" ")

#6 Knowing the mean score may not be enough, we can have a 95% confidence interval of the score estimate given by:
#  A variance value of 0.0007 score seems quite reasonably small and tells us the 10 scores are closely tied together
#  and don't fluctuate much, which is good news.
print("Variance of Accuracy:", cross_val_score(logreg,X,y,cv=k_fold,scoring="accuracy").std()**2)

After we choose our way to train the model and get the result and outcomes, it is natural for us to consider the performances of the model! Now the very first step we want to check is the accuracy of the model! The formula for accuracy is simply given by 


$$\text{Accuracy} = \frac{\text{Number of correctly classified cases}}{\text{Number of all cases}}$$


<br>

And as seen above, we first showed all our accuracy scores for each fold in the K-Fold CV validation, and the values are pretty close to each other with a standard deviation of 0.02733 - which in my opinion is a very acceptable range (one can further do some testing to show that the SD is justifiable low).

<br>

So our final accuracy score for Logistic Regression is 82.82 percent. But is that good enough? No. Let me give you one more problem. 

<br>

For example, in our Titanic Problem, we want to classify data into survive or dead. And to make the example simple, we have 1000 data set in which we achieved an 80% accuracy, that is we predicted 800/1000 correctly. This 80% accuracy seems reasonable, but however it can still be extremely bad if we have many False Positives or False Negatives. In our case, let's say we have exactly 500 people who died and 500 people who lived - Our algorithm predicts 500/500 correctly for those people who survived, however only 300/500 correctly for those who died. Yes, the total accuracy does add up to 80 percent and it looks good on paper, but we realised that this algorithm is extremely incapable of prediction for people who died, and that can be a big problem because in reality, we may have say, 900 people who died and 100 who survived, one can imagine how disastrous it can be if we applied the algorithm and it will give you a much lower total accuracy rate!

<br>

**We shall explore some popular metrics to further assess the performance of the model - An extension to evaluate how accurate the model performs.**

By default, the score computed at each CV iteration is the score method of the estimator. It is possible to change this by using the scoring parameter in **`cross_val_score`**. For example, if you want to check the `precision, recall or f1 score` you can pass it in as follows.

In [None]:
from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

#1 Default: class sklearn.model_selection.KFold(n_splits=5, shuffle=False, random_state=None)
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

#2 normalize it by using our StandardScaler(), will mention more on this later.
X = StandardScaler().fit_transform(X)

#3 instantiate a model logistic regression and fit it to the training set.
logreg = LogisticRegression()


#4 Here, cross_val_score performed 10-fold cross-validation and therefore returns 10 scores,
#  the scoring metric here demonstrates f1, recall and precision respectively.

print("f1 Scores: " + format(cross_val_score(logreg,X,y,cv=k_fold,scoring="f1")))
print("recall Scores: " + format(cross_val_score(logreg,X,y,cv=k_fold,scoring="recall")))
print("precision Scores: " + format(cross_val_score(logreg,X,y,cv=k_fold,scoring="precision")))
print(" ") 

#5 A common way to summarize the cross-validation accuracy is to compute the mean of the 10 folds in their
#  respective scoring metrics.

print("Mean f1 Score: ", cross_val_score(logreg,X,y,cv=k_fold,scoring="f1").mean())
print("Mean recall Score: ", cross_val_score(logreg,X,y,cv=k_fold,scoring="recall").mean())
print("Mean precision Score: ", cross_val_score(logreg,X,y,cv=k_fold,scoring="precision").mean())
print(" ")

#6 Knowing the mean score may not be enough, we can have a 95% confidence interval of the score estimate for
#  the respective scoring metrics given by:

print("Variance of f1:", cross_val_score(logreg,X,y,cv=k_fold,scoring="f1").std()**2)
print("Variance of recall:", cross_val_score(logreg,X,y,cv=k_fold,scoring="recall").std()**2)
print("Variance of precision:", cross_val_score(logreg,X,y,cv=k_fold,scoring="precision").std()**2)

## The other Cross Validate function

There is a second function you can use for cross-validation, called **`cross_validate`**. 

The **`cross_validate`** function differs from **`cross_val_score`** in two ways:

- It allows specifying multiple metrics for evaluation.

- It returns a dict containing fit-times, score-times (and optionally training scores as well as fitted estimators) in addition to the test score.

The class is listed as follows: [sklearn website documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html)

**`sklearn.model_selection.cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score=False, return_estimator=False, error_score=nan)`**

Notice that the main difference here is we can specify our **scoring metric** to be more than just 1 metric. This is useful as we can evaluate a few metric at once.

In [None]:
from sklearn.model_selection import cross_validate

scoring_metrics = ['accuracy', 'f1', 'recall']


#1 Default: class sklearn.model_selection.KFold(n_splits=5, shuffle=False, random_state=None)
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

#2 normalize it by using our StandardScaler(), will mention more on this later.
X = StandardScaler().fit_transform(X)

#3 instantiate a model logistic regression and fit it to the training set.
logreg = LogisticRegression()


#4 Here, cross_val_score performed 10-fold cross-validation and therefore returns 10 scores, the scoring
#  system is based on the accuracy, f1 and recall metric.

print("Different Metric Scores: " + format(cross_validate(logreg, X, y,
                                                          cv=k_fold, scoring = scoring_metrics, return_train_score=False)))
print(" ") 



Using pandas, we can nicely display these results and compute summaries:

In [None]:
scores = cross_validate(logreg, X, y, cv=k_fold, scoring = scoring_metrics, return_train_score=False)
df = pd.DataFrame(scores)
display(df)
print("Mean times and scores:\n{}".format(df.mean()))

There are several benefits to using cross-validation instead of a single split into a training and a test set. First, remember that `train_test_split` performs a random split of the data. Imagine that we are “lucky” when randomly splitting the data, and all examples that are hard to classify end up in the training set. In that case, the test set will only contain “easy” examples, and our test set accuracy will be unrealistically high. Conversely, if we are “unlucky,” we might have randomly put all the hard-to-classify examples in the test set and consequently obtain an unrealistically low score. However, when using cross-validation, each example will be in the test set exactly once: each example is in one of the folds, and each fold is the test set once. Therefore, the model needs to generalize well to all of the samples in the dataset for all of the cross-validation scores (and their mean) to be high.

Having multiple splits of the data also provides some information about how sensitive our model is to the selection of the training dataset. For the titanic dataset, we saw accuracies between 80% and 100%. This is quite a range, and it provides us with an idea about how the model might perform in the worst case and best case scenarios when applied to new data.

Another benefit of cross-validation as compared to using a single split of the data is that we use our data more effectively. When using `train_test_split`, we usually use 75% of the data for training and 25% of the data for evaluation. When using five-fold cross-validation, in each iteration we can use four-fifths of the data (80%) to fit the model. When using 10-fold cross-validation, we can use nine-tenths of the data (90%) to fit the model. More data will usually result in more accurate models.

The main disadvantage of cross-validation is increased computational cost. As we are now training k models instead of a single model, cross-validation will be roughly k times slower than doing a single split of the data.

## Split() Super Important

In [None]:
for fold, (train_idx, val_idx) in enumerate(k_fold.split(X, y)):
    print(fold, len(train_idx), len(val_idx))
    #df.loc[val_idx, 'kfold'] = fold

In [None]:
# continuing from the titanic example
loop = 1
for train, validation in k_fold.split(X):
    print("\n \nLoop {}:\n \n train:{}\n \n validation:{}".format(loop,train, validation))
    loop = loop + 1
    
    

The idea is simple, each loop of `k_fold.split(X)` generates the `train and validation`'s folds **indices index**: each fold is constituted by 2 arrays/lists, the first list is the training set, while the second list is the test/validation set. This is extremely useful later on each section 5.

# When to use Different Cross Validation Methods/Iterators

There are more than one cross validation methods. **K-fold** is merely one of them, but if there are so many cross validation strategies, which one should we use? This is important and a general rule of thumb is as follows.

I believe one should also read this [stackexchange link to gain some intuition](https://stats.stackexchange.com/questions/103459/how-do-i-know-which-method-of-cross-validation-is-best).

## When your data is independent i.i.d

Assuming that some data is Independent and Identically Distributed (i.i.d.) is making the assumption that all samples stem from the same generative process and that the generative process is assumed to have no memory of past generated samples.
In Laymen terms, i.i.d just says that each and every training rows you have must be independent.

The following cross-validators can be used in such cases.

**Note:** While i.i.d. data is a common assumption in machine learning theory, it rarely holds in practice. If one knows that the samples have been generated using a time-dependent process, it’s safer to use a [time-series aware cross-validation scheme](https://scikit-learn.org/stable/modules/cross_validation.html#timeseries-cv). Similarly if we know that the generative process has a group structure (samples from collected from different subjects, experiments, measurement devices) it safer to use [group-wise cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html#group-cv).

 I had a rough time trying to understand what does it mean for a dataset to be i.i.d.
 
 I refer you to some great links/articles [here](https://stats.stackexchange.com/questions/213464/on-the-importance-of-the-i-i-d-assumption-in-statistical-learning) and [here](https://www.ijcai.org/Proceedings/07/Papers/121.pdf) and [here](https://www.statisticshowto.datasciencecentral.com/assumption-of-independence/).

The data set is independent/dependent can be illustrated **naively** with examples below. A more detailed and curated version will appear in **Section 2.3**.

*If the occurrence of one event does not affect the occurrence or non-occurrence of the other, we say the events are independent.*

Let the selection of a red ball be event A, and the selection of a black ball be event B. 

[Example](http://www.math.usu.edu/cfairbourn/SCORM/IVC/Chance_sco/Chance_print.html)

**Independent:** A box contains two red balls and four green balls. We randomly select two balls with replacement from the box. These six balls constitute a population. "With replacement" means that once we select the first ball we put it back in the box before we select the second ball. Drawing with replacement makes the draws independent of each other, since the color of the first ball drawn does not affect the color of the second ball. **Event A and Event B are independent since the occurrence of Event A does not affect the occurrence of Event B**.

**Non-Independent:** A box contains two red balls and four green balls. Suppose this time we randomly select two balls without replacement from the box. "Without replacement" means that once we select the first ball we do not put it back in the box before we select the second ball. Drawing without replacement makes the draws dependent events, since the color of the first ball drawn does affect the color of the second ball. **Event A and Event B are dependent since the occurrence of Event A does affect the occurrence of Event B and vice versa**.

### First method: K-Fold

**K-fold** has been explained in section 2, remember we can pass **K-fold** as the **cv** parameter in **cross_val_score**.

### Second method: Repeated K-Fold

[Repeated K-Fold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedKFold.html#sklearn.model_selection.RepeatedKFold) is documented in sklearn here. In repeated cross-validation the data is randomly split into **k** partitions **j** times. The performance of the model can thereby be averaged over several runs, but this is rarely desirable in practice. 

We illustrate it with the same titanic data set.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold

#1 Default: class sklearn.model_selection.RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
repeated_k_fold = RepeatedKFold(n_splits=10, n_repeats = 10, random_state=0)

#2 normalize it by using our StandardScaler(), will mention more on this later.
X = StandardScaler().fit_transform(X)

#3 instantiate a model logistic regression and fit it to the training set.
logreg = LogisticRegression()


#4 Here, cross_val_score performed 10-fold cross-validation and therefore returns 10 scores, the scoring
#  system is based on the accuracy metric.

print("Accuracy Scores: " + format(cross_val_score(logreg,X,y,cv = repeated_k_fold,scoring="accuracy")))
print(" ") 

#5 A common way to summarize the cross-validation accuracy is to compute the mean of the 10 folds.

print("Mean Accuracy Score: ", cross_val_score(logreg,X,y, cv=repeated_k_fold,scoring="accuracy").mean())
print(" ")

#6 Knowing the mean score may not be enough, we can have a 95% confidence interval of the score estimate given by:
#  A variance value of 0.0007 score seems quite reasonably small and tells us the 10 scores are closely tied together
#  and don't fluctuate much, which is good news.
print("Variance:", cross_val_score(logreg,X,y, cv=repeated_k_fold,scoring="accuracy").std()**2)

### Third Method: Leave One Out (LOO) and Leave P out (LPO)

We will not discuss this method because according to a general rule from literatures, authors and empirical evidence, a 5- or 10- fold cross validation should be preferred to LOO.

### Fourth Method: Random permutations cross-validation a.k.a. Shuffle & Split¶

The **ShuffleSplit** iterator will generate a user defined number of independent train / test dataset splits. Samples are first shuffled and then split into a pair of train and test sets.

It is possible to control the randomness for reproducibility of the results by explicitly seeding the random_state pseudo random number generator.

Have not used myself yet - to be updated.

## When your Data is highly imbalanced

Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use stratified sampling as implemented in StratifiedKFold and StratifiedShuffleSplit to ensure that relative class frequencies is approximately preserved in each train and validation fold.


**Stratification preserves the same target distribution over different folds (read 6.2.1.1's example).**

### First Method: Stratified K-Fold

#### Balanced number of classes but with an ordered sequence

The first example will showcase a dataset with a balanced number of classes but with an ordered sequence.

Let us deviate a while from titanic and take a look at the iris data set, we print out the Iris labels, the target class is **0, 1 and 2** with **50 occurences each in sequence**. So indeed its target class is balanced because each class 0, 1 and 2 has an equal number of occurences. But the problem here is the sequence.

Why, one may ask. As you can see, the first third of the data is the class 0, the second third is the class 1, and the last third is the class 2. Imagine doing **three-fold cross-validation** on this dataset. The first fold would be only class 0, so in the first split of the data, the test set would be only class 0, and the training set would be only classes 1 and 2. As the classes in training and test sets would be different for all three splits, the three-fold cross-validation accuracy would be zero on this dataset. That is not very helpful, as we can do much better than 0% accuracy on iris. 

Some people might be confused, why would the accuracy of our classifier be 0%? Let us dissect this intuition with an example classifier **Logistic Regression**. The formula for logistic regression is basically something like the following, I am being pedantic here, so don't mind the details: $$P(Y = \text{class i}~|~ X = \text{inputs}) = \beta_0 + \beta_1x_1 + \beta_2x_2 + ...$$

We look at our first fold, our training set **only have classes 1 and 2**, and if we define our class 1 to be variable $x_1$, class 2 to be variable $x_2$ and class 3 to be variable $x_3$, then one should realise in a intuitive manner that the **Logistic Regression** model can only spit out something like $$P(Y = \text{class i=1,2}~|~ X = \text{inputs}) = \beta_0 + \beta_1x_1 + \beta_2x_2$$ 

There is no $x_3$ in our logistic model's equation, simply because there isn't any class 3 in our training set. Therefore the logic is easy, your test set has class 3 labels, but our model can **only ever predict** class 1 and 2. And in this extreme scenario, the **test set** in the first split only consists of class 3 labels, consequently, our classifier never predicts anything correctly.


In [None]:
from sklearn.datasets import load_iris
iris = load_iris()
print("Iris labels:\n{}".format(iris.target))
unique, counts = np.unique(iris.target, return_counts=True)
print("Iris class labels counts:\n", dict(zip(unique, counts)))

In [None]:
mglearn.plots.plot_stratified_cross_validation()

Let us see what the **K-Fold** will yield us, with **shuffle = False** at first! Because *secret, secret*, once you turn on the **shuffle = True**, it actually solves this particular scenario.

So in this code below, we **ONLY SPECIFY** the number of folds to be 3 and validate that our intuition was correct.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


#1 Default: class sklearn.model_selection.KFold(n_splits=5, shuffle = False, random_state=None)
k_fold = KFold(n_splits = 3,  shuffle = False, random_state=0)


#2 instantiate a model logistic regression and fit it to the training set.
logreg = LogisticRegression()


#3 Here, cross_val_score performed 10-fold cross-validation and therefore returns 10 scores, the scoring
#  system is based on the accuracy metric.

print("Accuracy Scores: " + format(cross_val_score(logreg, iris.data, iris.target, cv = k_fold, scoring="accuracy")))
print(" ") 

#4 A common way to summarize the cross-validation accuracy is to compute the mean of the 10 folds.

print("Mean Accuracy Score: ", cross_val_score(logreg, iris.data, iris.target, cv=k_fold,scoring="accuracy").mean())
print(" ")

#5 Knowing the mean score may not be enough, we can have a 95% confidence interval of the score estimate given by:
#  A variance value of 0.0007 score seems quite reasonably small and tells us the 10 scores are closely tied together
#  and don't fluctuate much, which is good news.
print("Standard Deviation:", cross_val_score(logreg, iris.data, iris.target, cv=k_fold,scoring="accuracy").std()**2)

Indeed, the accuracy for **3 folds** is 0. But if we increase our folds, the accuracy will change drastically for the better. Remember: each fold corresponds to one of the classes in the iris dataset, and so nothing can be learned. Another way to resolve this problem is to shuffle the data instead of stratifying the folds, to remove the ordering of the samples by label. We can do that by setting the shuffle parameter of KFold to True. If we shuffle the data, we also need to fix the random_state to get a reproducible shuffling. Otherwise, each run of cross_val_score would yield a different result, as each time a different split would be used (this might not be a problem, but can be surprising). Shuffling the data before splitting it yields a much better result.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


#1 Default: class sklearn.model_selection.KFold(n_splits=5, shuffle = False, random_state=None)
k_fold = KFold(n_splits = 3,  shuffle = True, random_state=0)


#2 instantiate a model logistic regression and fit it to the training set.
logreg = LogisticRegression()


#3 Here, cross_val_score performed 10-fold cross-validation and therefore returns 10 scores, the scoring
#  system is based on the accuracy metric.

print("Accuracy Scores: " + format(cross_val_score(logreg, iris.data, iris.target, cv = k_fold, scoring="accuracy")))
print(" ") 

#4 A common way to summarize the cross-validation accuracy is to compute the mean of the 10 folds.

print("Mean Accuracy Score: ", cross_val_score(logreg, iris.data, iris.target, cv=k_fold,scoring="accuracy").mean())
print(" ")

#5 Knowing the mean score may not be enough, we can have a 95% confidence interval of the score estimate given by:
#  A variance value of 0.0007 score seems quite reasonably small and tells us the 10 scores are closely tied together
#  and don't fluctuate much, which is good news.
print("Variance:", cross_val_score(logreg, iris.data, iris.target, cv=k_fold,scoring="accuracy").std()**2)

Note as mentioned, the above problem is mitigated when you turn on **shuffle = True** in **K-Fold**, because when you shuffle the data prior to performing **K-Fold**, then the sequences will be randomly mixed up as well. Which brings us to the next point on why one must perform **Stratified K-Fold** over **K-Fold**.

#### Imbalanced Number of classes

This is a generalization of the above example, for example, if I have a dataset where 90% of the samples are of **class A** and only 10% of the samples are of **class B**, then this is an imbalanced number of classes. This will cause a problem because if you use the normal **K-Fold (with 10 folds)** method, then there is a very high possibility that at step 2 (refer to my general steps for k-fold), **one of the 10 folds (call it fold i)** contains **many many** samples with labels **on class A (due to the sheer amount in class A)**; this is a problem because it obstructs the **real performance of the classifier**. If your classifier is a baseline classifier which naively predicts all data to be of **class A**, then our **fold i as previously defined** will have a close to 100% accuracy rate on that fold! But does it mean that our classifier is good? Not really.

**[Stratified K-Fold's sklearn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold)**: **class sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=False, random_state=None)**

**The implementation is designed to:**

- Generate test sets such that all contain the same distribution of classes, or as close as possible.


- Be invariant to class label: relabelling y = ["Happy", "Sad"] to y = [1, 0] should not change the indices generated.


- Preserve order dependencies in the dataset ordering, when shuffle=False: all samples from class k in some test set were contiguous in y, or separated in y by samples from classes other than k.


- Generate test sets where the smallest and largest differ by at most one sample.

**StratifiedKFold** is a variation of **K-Fold** which returns stratified folds: each set contains approximately the same percentage of samples of each target class as the complete set.

Here is an example of stratified 5-fold cross-validation on a dataset with 50 samples from two unbalanced classes. We show the number of samples in each class and compare with KFold.

The general procedure for **Stratified K-fold Cross Validation** is as follows.

1. Shuffle the dataset X randomly - provided you have **shuffle = True** in your parameter. But note that shuffling the data in **Stratified K-Fold** will not change each fold's percentage on each class.


2. Find the total number m of training samples in the dataset (use X.shape). Find out the number of unique classes in the dataset X, denote this number by n. For each unique class i = 0 to i = n, count the number of occurrences of each class i, $i \in [0,n]$, denote the occurrences in each class by $o_i$. Finally, divide $o_i$ by the total number m to get $$\text{percentage of class i} = \dfrac{o_i}{m}$$



3. In each class i, split the dataset in class i into **k** groups of approximately equal size depending on the k value; consequently, define each k-fold to be a combination of each fold split according to the class i so that each fold will have $o_i$ percentage of each class.


4. A sequence of **models** will be trained **k** number of times. Say **k = 5**, then for each **unique group**, the first model is trained using the first fold as the test set, and the remaining folds (2–5) are used as the training set. The model is built using the data in folds 2–5, and then the accuracy is evaluated on fold 1. Then another model is built, this time using fold 2 as the test set and the data in folds 1, 3, 4, and 5 as the training set. This process is repeated using folds 3, 4, and 5 as test sets. For each of these five splits of the data into training and test sets, we compute the accuracy.  In the end, we have collected five accuracy values. The process is illustrated in the Figures below.

**Illustration**

The above definition is a bit difficult to chew on, so let me illustrate it with a picture and an example. 

**At step 2,** suppose we have a dataset X with 50 training samples, so m = 50. Then there are 2 classes in this dataset, class 0 and class 1, so n = 2. The occurrences of class 0 is 45/50 times while the occurrences of class 1 is only 5/50 times. We calculate the percentage of class 0 to be $45/50 = 90\%$ and the percentage of class 1 to be $5/50 = 5\%$.

**At step 3,** if we define k to be 5; then we split as follows: 

- class 0 -> 45/5 = 9; class 1 -> 5/5 = 1
- Fold 1 = 9 + 1 = 10
- Fold 2 = 9 + 1 = 10
- ...

As a result, in our first model, if we use fold 1 as test set, then we will be guaranteed to have 9 class 0s and 1 class 1; whereas our training set will guarantee to have 36 class 0s and 4 class 1s. This ensures an equal weightage in assigning the classes - 90% to class 0 and 10% to class 1 - which solves the problem.

One thing I like to add here is, with or without shuffle, we can see that in each fold, there will be 90% of class 0 and 10% of class 1, as illustrated below. On the contrary, if you use **K-Fold** method to do, you can see how imbalanced each fold is, **EVEN IF** you set **shuffle = True** for **K-Fold**, there is a good chance that the test set (or even the train set) consists of only the imbalanced class (class 0) re: 3rd fold in the code below.

<img src="files/images/cv3.png" width="300" height="300">

In [None]:
X_imbalanced, y_imbalanced = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5))

X_imbalanced.shape
y_imbalanced.shape

counter = 1
k_fold = KFold(n_splits=5, shuffle = False, random_state = 0)
for train, test in k_fold.split(X_imbalanced, y_imbalanced):    
    print('Fold {}: K-Fold shuffle off: train -  {}   |   test -  {}'.format(
         counter, np.bincount(y[train]), np.bincount(y[test])))
    counter = counter + 1

print("---"*40)


k_fold = KFold(n_splits=5, shuffle = True, random_state = 0)

counter = 1
for train, test in k_fold.split(X_imbalanced, y_imbalanced):
    print('Fold {}: K-Fold shuffle on: train -  {}   |   test -  {}'.format(
         counter, np.bincount(y[train]), np.bincount(y[test])))
    counter = counter + 1

print("---"*40)


strat_k_fold = StratifiedKFold(n_splits=5, shuffle = False, random_state = 0)

counter = 1
for train, test in strat_k_fold.split(X_imbalanced, y_imbalanced):
    print('Fold {}: Stratified K-Fold Shuffle off: train -  {}   |   test -  {}'.format(
        counter, np.bincount(y[train]), np.bincount(y[test])))
    counter = counter + 1
    
print("---"*40)

strat_k_fold = StratifiedKFold(n_splits=5, shuffle = True, random_state = 0)

counter = 1
for train, test in strat_k_fold.split(X_imbalanced, y_imbalanced):
    print('Fold {}: Stratified K-Fold Shuffle on: train -  {}   |   test -  {}'.format(
         counter, np.bincount(y[train]), np.bincount(y[test])))
    counter = counter + 1




We can apply our **Stratified K-Fold** to our titanic. This is because when I checked the class in the Titanic dataset, the class imbalance is kind of huge with 549 dead and 342 survived.

In [None]:
#1 In this tutorial, we will used the titanic data set, we check the unique classes of titanic.
y.value_counts()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

#1 Default: class sklearn.model_selection.KFold(n_splits=5, shuffle=False, random_state=None)
strat_k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

#2 normalize it by using our StandardScaler(), will mention more on this later.
X = StandardScaler().fit_transform(X)

#3 instantiate a model logistic regression and fit it to the training set.
logreg = LogisticRegression()


#4 Here, cross_val_score performed 10-fold cross-validation and therefore returns 10 scores, the scoring
#  system is based on the accuracy metric.

print("Accuracy Scores: " + format(cross_val_score(logreg,X,y,cv=strat_k_fold,scoring="accuracy")))
print(" ") 

#5 A common way to summarize the cross-validation accuracy is to compute the mean of the 10 folds.

print("Mean Accuracy Score: ", cross_val_score(logreg,X,y,cv=strat_k_fold,scoring="accuracy").mean())
print(" ")

#6 Knowing the mean score may not be enough, we can have a 95% confidence interval of the score estimate given by:
#  A variance value of 0.0007 score seems quite reasonably small and tells us the 10 scores are closely tied together
#  and don't fluctuate much, which is good news.
print("Variance of Accuracy:", cross_val_score(logreg,X,y,cv=strat_k_fold,scoring="accuracy").std()**2)

### Second Method: Repeated Stratified K-Fold

See Repeated K-Fold for reference

## Cross-validation iterators for grouped data

To understand this idea, we have to go back to section 6.1 and quote **Assuming that some data is Independent and Identically Distributed (i.i.d.) is making the assumption that all samples stem from the same generative process and that the generative process is assumed to have no memory of past generated samples.**



Now consider the generated dataset below. There are a total of 11 training samples (i.e. 11 rows).  In an almost universal paradigm for Diabetes prediction, the features will definitely include the Cholesterol Level and the Blood Sugar level. But take note that multiple patients appear in the same dataset. When we take samples from the same patients, our training samples (the 11 rows) are no longer considered independent. Under this paradigm, each patient has an unique underlying, anatomical structure that helps your model to "recognize" this patient's label when the model receives the patient's input features. These input features of the same patient may differ slightly (as you can see from the datase), but in general sense they are highly correlated. A person's internal structure and readings will not fluctuate in a short amount of time should give you some intuition why repeated samples of the same patient may adversely aid your model to overfit in a **cross validation** scheme should both of your train and test set contain the same person.

**Main idea of i.i.d**

The main idea of i.i.d in training samples (rows) is to guarantee that techniques such as cross-validation can indeed be used to infer a reliable measure of the model's capability of generalising well. This is because a stronger case of `i.i.d amongst each training samples` has an **if and only if** relationship with the [Exchangeability Theorem](https://en.wikipedia.org/wiki/Exchangeable_random_variables), where the latter guarantees a reliable measure of cross validation (no overfitting).


**Exchangeable Theorem**


In statistics, an exchangeable sequence of random variables (also sometimes interchangeable) is a sequence $X_1, X_2, X_3, ...$ (which may be finitely or infinitely long) whose joint probability distribution does not change when the positions in the sequence in which finitely many of them appear are altered. Thus, for example the sequences

$$X_{1},X_{2},X_{3},X_{4},X_{5},X_{6} {\text{ and }} X_{3},X_{6},X_{1},X_{5},X_{2},X_{4}$$
both have the same joint probability distribution.

To illustrate why **exchangeable Theorem is so important**, I will lay out the example clearly.

To make a model do its magic, we generally have to assume each training samples come from the same distribution, and any permutations within the samples should not alter the distribution. However, consider employing a simple logistic regression model on this `diabetes` dateset, the **Logistic Model takes in these 11 samples, in theory, if the training samples are i.i.d, then at each training sample, our model should not remember the previous training sample such that it affects his current decision to classify a person**. (Check the definition of i.i.d to further understand this sentence.) However, since our dataset samples has a dependency structure (same person), then you will meet problems when you perform K-Fold cross validation.


**Scenario 1: No shuffling**

Consider 5 folds cross validation, our split will look like 

- Fold 1 = $[John, John]$
- Fold 2 = $[John, John]$
- Fold 3 = $[John, May]$
- Fold 4 = $[May, May]$
- Fold 5 = $[Kim, Kim]$



If we do not shuffle the data, the folds look like the above, so let us say we use Fold 1 to 4 as the training set, our model will take into account the inherent dependency structure and when it is presented with Fold 5 = $[Kim, Kim]$, it is seeing an entirely new set of data and hence it really puts our model to the test (pun intended) on whether it can accurately predict unseen data while taking **the dependency structure in**. For example it predicts one out of two correctly in Fold 5, yielding a 50% accuracy. (of course in real datasets there will be much much more data points in Fold 5.) Although a low accuracy, at least it did not give us a false sense of security.

<br>

**Scenario 2: Shuffling**

If we shuffle our data beforehand, the problem comes. Consider the scenario where our fold looks like this.

- Fold 1 = $[John, Kim]$
- Fold 2 = $[John, May]$
- Fold 3 = $[John, May]$
- Fold 4 = $[Kim, May]$
- Fold 5 = $[John, May]$

If we take Fold 1 to 4 as the training set, and asked to predict Fold 5, our model may not take into account the inherent dependency structure, and happily ingest the first 4 folds and happily predict Fold 5 with 100% accuracy simply because it has seen how John and May behave. Although each of John and May sample have slightly different values, but each sample of the same individual are so **correlated** such that the model has already find out the **underlying structure of John and May, whatever it may be**, and simply cause the model to memorize their labels and when it sees the same person in the test set, it can predict well. But one fatal point is, this often leads to over optimistic accuracy scores in our cross validation process and hence overfits a lot.

In [None]:
# synthetic dataset

d = {'Name': ['John','John','John','John','John', 'May','May','May','Kim','Kim'],
     'Cholesterol Level (mg/dL)': [150.1, 150.3,150.1,152.2,150.09,112.01,112.9,113,180,180.2],
     'Blood Sugar (mg/dL)': [120.2,120.3,120.1,120.1,121.2,99.9,99.8,99.8,150.1,150.2],
     'Diabetes': [1, 1, 1, 1, 1, 0,0,0,1,1,]
     }

diabetes = pd.DataFrame(data=d)
diabetes.set_index('Name', inplace = True)
diabetes

# X = diabetes[['Cholesterol Level (mg/dL)', 'Blood Sugar (mg/dL)']]
# y = [1,1,1,1,1,0,0,0,1,1]

In [None]:
# plot scatter plot to show that if you group the 11 data points into 3 categrories (John, May and Kim)
# you can see that these 3 categories actually form a cluster by themselves, indicating
# correlation within each group/category.

groups = diabetes.groupby('Name')
x = 'Cholesterol Level (mg/dL)'
y = 'Diabetes'


#Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    ax.plot(group[x], group[y], marker='o', linestyle='', ms=12, label=name)
ax.legend()

plt.show()


groups = diabetes.groupby('Name')
x = 'Blood Sugar (mg/dL)'
y = 'Diabetes'


#Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    ax.plot(group[x], group[y], marker='o', linestyle='', ms=12, label=name)
ax.legend()

plt.show()

**Another example by the book:** If your datasets have training rows that are highly correlated to their groups, then the normal **K-Fold** cross validation strategy will pose an overly optimistic result for your local cv score. This is because if your training set fold (say fold 2-5) has group 1's data (say there are 100 Mary's samples, 80 of which appeared in this fold) and on the same test set fold you also have the remaining 20 Mary's samples, albeit different samples, but belonging to the same group a.k.a Mary! This will make the classifier have an extremely easy job to predict this test set because it is used to **the group identifier: Mary", in other words, we are possibly and potentially leaking future information to the test set!**. Now the classifier will have an easier time predicting because it sees a lot of Mary's examples already, and under this paradigm, correlations clearly exist among both the features and the labels of candidates that refer to the same
underlying structure, blood results, patient, etc. 

So the classifier might actually not **generalize well to unseen data** because it has similar groups. But if we use a normal **cross validation**, we might be lured into a false sense of security that the classifier is performing well simply because our test set may contain similar groups as our training set.

[Third Example: Here is an example taken from a github](https://github.com/ogrisel/notebooks/blob/master/Non%20IID%20cross-validation.ipynb)

In [None]:
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data, digits.target

The digits dataset of scikit-learn is the test set of the UCI optdigits dataset. Apparently consecutive samples are more likely to stem from the same writer on this dataset. Hence the samples are not independent and identically distributed (iid) as different writing styles grouped togethers effectively introduce a dependency. Unfortunately the exact per-sample authorship metadata has not be kept in the optdigits dataset.

This is highlighted by the fact that shuffling the data significantly affects the test score estimated by K-Fold cross-validation. Let us build a model with non-optimal parameters to highlight the impact of dependent samples:

In [None]:
from sklearn.svm  import SVC

model = SVC(C=10, gamma=0.005)



def print_cv_score_summary(model, X, y, cv):
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)
    print("mean: {:3f}, stdev: {:3f}".format(
        np.mean(scores), np.std(scores)))
    
    
#1 KFold does not shuffle the data by default hence takes the dependency structure of the dataset
# into account for small number of folds such as k=5.

cv = KFold(n_splits=5, shuffle=False, random_state=0)
print_cv_score_summary(model, X, y, cv)


#2 If we shuffle the data, the estimated test score is much higher as we hide the dependency
#  structure to the model hence we cannot detect the overfitting caused by the author writing styles


cv = KFold(n_splits=5,  shuffle=True, random_state=0)
print_cv_score_summary(model, X, y, cv)


#3 There is almost 7% discrepancy between the estimated score probably caused by the dependency
#between samples. Those shuffled KFold cv scores are in-line with equivalent ShuffleSplit:

from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits = 5, test_size=0.2, random_state=0)
print_cv_score_summary(model, X, y, cv)


#4 Note that StratifiedKFold sorts the samples by classes prior to computing the folds hence
#  breaks the dependency too (at least in scikit-learn 0.14)

cv =  KFold(n_splits = 5,  shuffle = True, random_state=0)
print_cv_score_summary(model, X, y, cv)

### Method 1: Group K-Fold

The syntax is **class sklearn.model_selection.GroupKFold(n_splits=5)**.

K-fold iterator variant with non-overlapping groups. The same group will not appear in two different folds (the number of distinct groups has to be at least equal to the number of folds). The folds are approximately balanced in the sense that the number of distinct groups is approximately the same in each fold.

The following is an example of using a synthetic dataset with a grouping given by the groups array. The dataset consists of 12 data points, and for each of the data points, groups specifies which group (think patient) the point belongs to. The groups specify that there are four groups, and the first three samples belong to the first group, the next four samples belong to the second group, and so on:

In [3]:
from sklearn.model_selection import GroupKFold
from sklearn.datasets import make_blobs
# create synthetic dataset
X, y = make_blobs(n_samples=12, random_state=0)
# assume the first three samples belong to the same group,
# then the next four, etc.
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
scores = cross_val_score(logreg, X, y, groups, cv=GroupKFold(n_splits=3))
print("Cross-validation scores:\n{}".format(scores))

NameError: name 'logreg' is not defined

The samples don’t need to be ordered by group; we just did this for illustration purposes. The splits that are calculated based on these labels are visualized in Figure below. As you can see, for each split, each group is either entirely in the training set or entirely in the test set.

This gives us a **non-overlapping grouping system** in which the same group will not appear in both the training fold and the corresponding test fold.

In [None]:
mglearn.plots.plot_group_kfold()

#### Apply Group K-Fold on our example

In [None]:
diabetes2 = diabetes.append(diabetes)
y = diabetes2['Diabetes']
groups = [0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2]
scores = cross_val_score(KNeighborsClassifier(), diabetes2, y, groups, cv=GroupKFold(n_splits=3), scoring = "accuracy")
scores


print("Accuracy Scores: " + format(cross_val_score(KNeighborsClassifier(),diabetes2,y,groups,cv=GroupKFold(n_splits=3),scoring="accuracy")))
print(" ") 

#5 A common way to summarize the cross-validation accuracy is to compute the mean of the 10 folds.

print("Mean Accuracy Score: ", cross_val_score(KNeighborsClassifier(),diabetes2,y,groups,cv=GroupKFold(n_splits=3),scoring="accuracy").mean())
print(" ")

#6 Knowing the mean score may not be enough, we can have a 95% confidence interval of the score estimate given by:
#  A variance value of 0.0007 score seems quite reasonably small and tells us the 10 scores are closely tied together
#  and don't fluctuate much, which is good news.
print("Variance of Accuracy:", cross_val_score(KNeighborsClassifier(),diabetes2,y,groups,cv=GroupKFold(n_splits=3),scoring="accuracy").std()**2)

In [None]:
diabetes2 = diabetes.append(diabetes)
y = diabetes2['Diabetes']
groups = list(diabetes2.index)
scores = cross_val_score(KNeighborsClassifier(), diabetes2, y, groups, cv=GroupKFold(n_splits=3), scoring = "accuracy")
scores


print("Accuracy Scores: " + format(cross_val_score(KNeighborsClassifier(),diabetes2,y,groups,cv=GroupKFold(n_splits=3),scoring="accuracy")))
print(" ") 

#5 A common way to summarize the cross-validation accuracy is to compute the mean of the 10 folds.

print("Mean Accuracy Score: ", cross_val_score(KNeighborsClassifier(),diabetes2,y,groups,cv=GroupKFold(n_splits=3),scoring="accuracy").mean())
print(" ")

#6 Knowing the mean score may not be enough, we can have a 95% confidence interval of the score estimate given by:
#  A variance value of 0.0007 score seems quite reasonably small and tells us the 10 scores are closely tied together
#  and don't fluctuate much, which is good news.
print("Variance of Accuracy:", cross_val_score(KNeighborsClassifier(),diabetes2,y,groups,cv=GroupKFold(n_splits=3),scoring="accuracy").std()**2)

## Stratified GroupKFold

In [5]:
from sklearn.model_selection import 

# Feature Selection: Caution on the wrong way of using CV

There is one major caveat [read standford example pdf here](https://web.stanford.edu/class/stats202/content/lec11-cond.pdf) and [here](https://web.stanford.edu/class/stats202/content/lab11) to be read when I approach this topic.

# Pipeline with CV

Just as it is important to test a predictor on data held-out from training, preprocessing (such as standardization, feature selection, etc.) and similar data transformations similarly should be learnt from a training set and applied to held-out data for prediction. Here I will not go into details how to use pipeline, we will dedicate a whole chapter to that.

**Ask ivan, what is perplexing is this result below is the same as my above result, it should be different?**

In [None]:
train = pd.read_csv('data/titanic_train.csv')
test = pd.read_csv("data/titanic_test.csv")
test_labels = pd.read_csv("data/titanic_test_labels.csv")



median_fare = test.groupby(['Pclass', 'Parch']).Fare.median()[3][0]
# Filling the missing value in Fare with the median Fare of 3rd class passenger who has Parch 0.
test['Fare'] = test['Fare'].fillna(median_fare)
train['Embarked'] = train['Embarked'].fillna('S')
train['Age']  = train.groupby(['Pclass', 'SibSp'])['Age'].apply(lambda x: x.fillna(x.median()))

# remember to fill in here, this one really very special case pertaining to this project

train['Age'] = train.Age.fillna(11)
test['Age'] = test.groupby(['Pclass', 'SibSp'])['Age'].apply(lambda x: x.fillna(x.median()))

train['Cabin'] = train['Cabin'].fillna('M').astype(str).apply(lambda cabin: cabin[0])
idx = train[train['Cabin'] == 'T'].index
train.loc[idx, 'Cabin'] = 'A'

test['Cabin'] = test['Cabin'].fillna('M').astype(str).apply(lambda cabin: cabin[0])

# Create function that take name and separates it into title, family name and deletes all puntuation from name column:
def name_sep(data):
    families=[]
    titles = []
    new_name = []
    #for each row in dataset:
    for i in range(len(data)):
        name = data.iloc[i]
        # extract name inside brakets into name_bracket:
        if '(' in name:
            name_no_bracket = name.split('(')[0] 
        else:
            name_no_bracket = name
            
        family = name_no_bracket.split(",")[0]
        title = name_no_bracket.split(",")[1].strip().split(" ")[0]
        
        #remove punctuations accept brackets:
        for c in string.punctuation:
            name = name.replace(c,"").strip()
            family = family.replace(c,"").strip()
            title = title.replace(c,"").strip()
            
        families.append(family)
        titles.append(title)
        new_name.append(name)
            
    return families, titles, new_name 


train['Surname'], train['Title'], train['Newname']  = name_sep(train.Name)
test['Surname'], test['Title'], test['Newname'] = name_sep(test.Name)
train['Title'] = train['Title'].replace(['Ms', 'Mlle'],'Miss')
train['Title'] = train['Title'].replace(['Mme'],'Mrs')
train['Title'] = train['Title'].replace(['Dr','Rev','the','Jonkheer','Lady','Sir', 'Don'],'Nobles')
train['Title'] = train['Title'].replace(['Major','Col', 'Capt'],'Navy')
train.Title.value_counts()



test['Title'] = test['Title'].replace(['Ms','Dona'],'Miss')
test['Title'] = test['Title'].replace(['Dr','Rev'],'Nobles')
test['Title'] = test['Title'].replace(['Col'],'Navy')
test.Title.value_counts()



train_categorical_features = ['Pclass', 'Sex','Title','Cabin', 'Embarked']

# No need to use sklearn's encoders
# pandas has a pandas.get_dummies() function that takes in a series
#     and returns a HOT encoded dataframe of that series
#     use the add_prefix() method of dataframe to add the feature name in front of the category name
#     then join the dataframe sideways (similar to pd.concat([train, dummies], axis=1))
for feature in train_categorical_features:
    dummies = pd.get_dummies(train[feature]).add_prefix(feature + '_')
    train = train.join(dummies)
    
    
    
test_categorical_features = ['Pclass', 'Sex','Title', 'Cabin', 'Embarked']

# No need to use sklearn's encoders
# pandas has a pandas.get_dummies() function that takes in a series
#     and returns a HOT encoded dataframe of that series
#     use the add_prefix() method of dataframe to add the feature name in front of the category name
#     then join the dataframe sideways (similar to pd.concat([train, dummies], axis=1))
for feature in test_categorical_features:
    dummies = pd.get_dummies(test[feature]).add_prefix(feature + '_')
    test = test.join(dummies)
    
    
    
drop_column = ['Pclass','Name','Sex','Cabin', 'Embarked','Surname','Title','Newname', 'Ticket', 'PassengerId']
train.drop(drop_column, axis=1, inplace = True)

drop_column = ['Pclass','Name','Sex','Cabin', 'Embarked','Surname','Title','Newname', 'Ticket', 'PassengerId']
test.drop(drop_column, axis=1, inplace = True)

#1 In this tutorial, we will used the titanic data set.
X = train.drop("Survived", axis=1)
y = train["Survived"]

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
clf = make_pipeline(preprocessing.StandardScaler(), LogisticRegression())
print("Accuracy Scores: " , cross_val_score(logreg,X,y,cv=k_fold,scoring="f1"))


# Out of Fold - The usual way by Kagglers

To be very frank, I was extremely confused with the difference between `Out of Fold predictions` and the normal `K-Fold Cross Validation`, aren't they the same thing? The differences are subtle, but as [Braquino (Fellow Kaggler)](https://www.kaggle.com/braquino) put it, it simply means:**in a cross validation environment, when you split the train and validation dataset in each fold, oof is the validation piece, the part of the dataset that was Out of the model in that fold.**

## Out of Fold Part I - Evaluation

The most common use for `out of fold` predictions is to estimate the **performance** of the model.

That is, predictions on data that were not used to train the model can be made and evaluated using a `scoring metric` such as `rmse` or `accuracy`. This metric provides an estimate of the performance of the model when used to make predictions on new data, such as when the model will be used in practice to make predictions.

Generally, predictions made on data not used to train a model provide insight into how the model will generalize to new situations. As such, scores that evaluate these predictions are referred to as the generalized performance of a machine learning model.

There are two main approaches that these predictions can use to estimate the performance of the model.

The first is to `score` the model on the predictions made during each **fold**, then calculate the average of those scores. For example, if we are evaluating a classification model, then classification accuracy can be calculated on each group of out-of-fold predictions, then the mean accuracy can be reported. This approach is illustrated in earlier sections like **section 1.3**.

### Approach 1: Estimate performance as the mean score estimated on each group of out-of-fold predictions.

In this example here, we used the `mean accuracy score` but in reality, there are many different metrics we can use.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

#1 Default: class sklearn.model_selection.KFold(n_splits=5, shuffle=False, random_state=None)
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

#2 normalize it by using our StandardScaler(), will mention more on this later.
X = StandardScaler().fit_transform(X)

#3 instantiate a model logistic regression and fit it to the training set.
logreg = LogisticRegression()


#4 Here, cross_val_score performed 10-fold cross-validation and therefore returns 10 scores, the scoring
#  system is based on the accuracy metric.

print("Accuracy Scores: " + format(cross_val_score(logreg,X,y,cv=k_fold,scoring="accuracy")))
print(" ") 

#5 A common way to summarize the cross-validation accuracy is to compute the mean of the 10 folds.

print("Mean Accuracy Score: ", cross_val_score(logreg,X,y,cv=k_fold,scoring="accuracy").mean())
print(" ")

#6 Knowing the mean score may not be enough, we can have a 95% confidence interval of the score estimate given by:
#  A variance value of 0.0007 score seems quite reasonably small and tells us the 10 scores are closely tied together
#  and don't fluctuate much, which is good news.
print("Variance of Accuracy:", cross_val_score(logreg,X,y,cv=k_fold,scoring="accuracy").std()**2)

Here we go through how `K-Fold cross_validation_score ` works, a recap of how to use `KFold.split(X)`. This code belows illustrates fully how the `K-Fold cross_validation_score ` is generated via `KFold`. I have commented out each line of code.

In [None]:
scores = list()
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
X = StandardScaler().fit_transform(X)

# enumerate splits
for train_ix, test_ix in k_fold.split(X):
    # for each split, define the training fold to be [train_X, train_y]
    # define the test fold to be [test_X, text_y]
    # in each split, the training and test fold cam be obtained by passing in the 
    # indices indexes train_ix and test_ix
    train_X, test_X = X[train_ix], X[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]
    
    # fit model: fit the model on our training set, for example, in the first loop, 
    # we have our first split, and we fit on that train_X and train_y and get the 
    # necessary parameters for our selected model.
    model = LogisticRegression()
    model.fit(train_X, train_y)
    

    # Predict class labels in test_X
    y_pred = model.predict(test_X)
    
    # 
    acc = accuracy_score(test_y, y_pred)
    
    # store score: Store the scores in our scores variable.
    scores.append(acc)
    print('Accuracy Score ', acc)
    
# summarize model performance with mean and std. Note that this gives us exactly
# what cross_val_score(logreg,X,y,cv=k_fold,scoring="accuracy") does earlier on.
# this implementation is just to facilitate understanding of OOF evaluation next.

mean_score, std_score = np.mean(scores), np.std(scores)
print('Mean: {}.9f, Standard Deviation: {}.3f'.format(mean_score, std_score))

### Approach 2: Estimate performance using the aggregate of all out-of-fold predictions.

The second approach is to consider that each example appears just once in each test set. That is, each example in the training dataset has a single prediction made during the k-fold cross-validation process. As such, we can collect all predictions and compare them to their expected outcome and calculate a score directly across the entire training dataset.

Observe there is a slight difference.

In [None]:
data_y, data_y_pred = list(), list()
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
X = StandardScaler().fit_transform(X)
# enumerate splits
for train_ix, test_ix in k_fold.split(X):
# get data
    train_X, test_X = X[train_ix], X[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]
    # fit model
    model = LogisticRegression()
    model.fit(train_X, train_y)
    # make predictions
    y_pred = model.predict(test_X)
    # store
    data_y.extend(test_y)
    data_y_pred.extend(y_pred)
# evaluate the model
acc = accuracy_score(data_y, data_y_pred)
print('Accuracy: %.9f' % (acc))

## Out of Fold Part II - Emsembling

OOF prediction can be used as **features** in **ensembling method**. This knowledge and techniques require you to first understand how ensembling works, if not you will be as puzzled as me at first.

As said previously, this is the only most basic way of using `oof` as compared to normal method. The real reason why many `Kagglers` use `oof` is because of ensembling. Basically, it is a go to technique when Kagglers use ensemble methods. Below highlights the reason why. All thanks to [Jason Brownlee (click me)](https://machinelearningmastery.com/out-of-fold-predictions-in-machine-learning/).

https://www.kaggle.com/questions-and-answers/52606
https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard