# Cross-validation

## Cross-validation is a step in the process of building a machine learning model which helps us ensure that our models fit the data accurately and also ensures that we do not overfit



In [None]:
# Load the red wine quality data set into a DataFrame.
# The CSV path is relative, so the file must be in the current working directory.
import pandas as pd
df = pd.read_csv("winequality-red.csv")

In [None]:
# Preview the first 10 rows of the data set
df.head(n=10)

In [None]:
# We treat this data set as a classification problem.
# The mapping covers six quality scores (3 through 8), which are
# re-labelled as the consecutive class ids 0 through 5.
quality_mapping = {score: label for label, score in enumerate(range(3, 9))}

# Series.map looks every value of the column up in the dictionary; the
# mapped values are written back into the same column.
df.loc[:, "quality"] = df["quality"].map(quality_mapping)


In [None]:
# Check that the mapping was successful: the quality column should now hold the mapped labels
df.head(n=10)
# Optional sanity checks on the dataframe dimensions (left disabled):
#df.size
#df.shape

In [None]:
# Let's try out a Decision Tree model

# Split the data into training and test sets.
# sample(frac=1) returns every row in random order, i.e. it shuffles the
# dataframe. The shuffled rows keep their old index labels, so we reset
# the index (dropping the old one) to get a clean 0..n-1 index again.
df = df.sample(frac=1).reset_index(drop=True)

# number of rows used for training
n_train = 1000

# the first n_train rows are selected for training
df_train = df.iloc[:n_train]

# every remaining row is selected for testing/validation.
# Slicing from n_train (instead of a hard-coded tail(599)) keeps the two
# sets disjoint and exhaustive whatever the number of rows in the file;
# when the data has 1599 rows this selects the same 599 rows as before.
df_test = df.iloc[n_train:]


In [None]:
# import from scikit-learn
from sklearn import tree
from sklearn import metrics

# Decision tree classifier limited to a maximum depth of 3
clf = tree.DecisionTreeClassifier(max_depth=3)

# Feature columns the model is trained on
cols = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# Fit the tree on the selected features, with the mapped quality as target
clf.fit(df_train[cols], df_train["quality"])


In [None]:
# Measure the accuracy of the fitted model on the training and test sets

# predicted labels for the training rows and for the test rows
train_predictions = clf.predict(df_train[cols])
test_predictions = clf.predict(df_test[cols])

# accuracy of those predictions against the true (mapped) quality labels
train_accuracy = metrics.accuracy_score(df_train["quality"], train_predictions)
test_accuracy = metrics.accuracy_score(df_test["quality"], test_predictions)


In [None]:
# Accuracy of the model on the training set
print(train_accuracy)

In [None]:
# Accuracy of the model on the test set
print(test_accuracy)


## Running the same decision tree while toggling through different depth sizes


In [None]:
# import scikit-learn tree and metrics
from sklearn import tree
from sklearn import metrics

# import matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Global font size (20) for the tick labels of both plot axes
for tick_group in ("xtick", "ytick"):
    matplotlib.rc(tick_group, labelsize=20)

In [None]:
# IPython magic command: display matplotlib plots inline, inside the notebook
%matplotlib inline

In [None]:
# Accuracy histories for the training and test sets.
# Each list starts with a single 0.5 (50% accuracy) entry at index 0.
train_accuracies, test_accuracies = [0.5], [0.5]

In [None]:
# Columns/features used for training. They do not depend on the depth,
# so the list is built once, before the loop.
cols = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]

# The feature/target selections are also the same for every depth
train_features = df_train[cols]
test_features = df_test[cols]
train_target = df_train.quality
test_target = df_test.quality

# iterate over max_depth values 1 through 24, fitting one tree per value
for depth in range(1, 25):
    # initialize the model with the current depth limit
    clf = tree.DecisionTreeClassifier(max_depth=depth)

    # train the model on the features and the mapped quality from before
    clf.fit(train_features, train_target)

    # generate predictions on the training and test set
    train_predictions = clf.predict(train_features)
    test_predictions = clf.predict(test_features)

    # calculate the accuracy of the predictions on the training and test set
    train_accuracy = metrics.accuracy_score(train_target, train_predictions)
    test_accuracy = metrics.accuracy_score(test_target, test_predictions)

    # Append accuracies, so list position i holds the result for max_depth=i
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

In [None]:
# Report the accuracy of the final model on the training and test sets.
# `clf` is the tree left over from the last loop iteration (max_depth=24).
#
# NOTE: the loop has already appended these accuracies to
# train_accuracies / test_accuracies. Appending them again here added a
# duplicate point at the end of the plotted curves (and one more on every
# re-run of this cell), so the values are only reported, not appended.

# generate predictions on the training and test set
train_predictions = clf.predict(df_train[cols])
test_predictions = clf.predict(df_test[cols])

# calculate the accuracy of the predictions on the training and test set
train_accuracy = metrics.accuracy_score(df_train.quality, train_predictions)
test_accuracy = metrics.accuracy_score(df_test.quality, test_predictions)

print("final train accuracy:", train_accuracy)
print("final test accuracy:", test_accuracy)

In [None]:
# Plot the train and test accuracy curves on one figure using matplotlib and seaborn
plt.figure(figsize=(10,5))
sns.set_style("whitegrid")
# Each list is plotted against its own index, so x is the position in the list
plt.plot(train_accuracies, label="train accuracy")
plt.plot(test_accuracies, label = "test accuracy")
plt.legend(loc="upper left", prop={'size' : 15})
# one x tick every 5 units, from 0 to 25
plt.xticks(range(0, 26, 5))
plt.xlabel("max_depth", size=20)
plt.ylabel("accuracy", size=20)
plt.show()

# This is called Overfitting
- This model performs poorly on the test set
- Test loss increases as we keep improving training loss

## K-fold Cross-Validation   
### Divide the data into k different sets which are exclusive of each other

In [None]:
# import pandas and model_selection module of scikit-learn 
import pandas as pd
from sklearn import model_selection

# Only create the folds when this code is run directly, so the file can
# also be imported as a module later on
if __name__ == "__main__":
    # Read the training data from train.csv
    df = pd.read_csv("train.csv")

    # New column kfold, initialised to -1 for every row
    df["kfold"] = -1

    # Shuffle the rows and renumber the index from 0
    df = df.sample(frac=1).reset_index(drop=True)

    # 5-fold splitter from the model_selection module
    kf = model_selection.KFold(n_splits=5)

    # Each split yields (training indices, validation indices); the rows
    # at the validation indices are tagged with the number of their fold
    for fold, (_, valid_idx) in enumerate(kf.split(X=df)):
        df.loc[valid_idx, "kfold"] = fold

    # Write the data, including the kfold column, to a new CSV file
    df.to_csv("train_folds.csv", index=False)
