In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from sklearn.tree import DecisionTreeClassifier as DTC

In [6]:
# Performs k-fold cross validation on a given model for some X, y, k
def do_Kfold(model, X, y, k, scaler = None, randomState = 146):
    """Run shuffled k-fold cross validation and collect the per-fold scores.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        Refit in place on every fold, so after the call it holds the fit
        from the last fold.
    X : 2-D NumPy-style array or pandas DataFrame
        Feature rows.
    y : 1-D NumPy-style array or pandas Series
        Targets, aligned with ``X`` by position.
    k : int
        Number of folds (passed to ``KFold(n_splits=k)``).
    scaler : object exposing ``fit_transform`` / ``transform``, optional
        When given, it is fitted on the training rows of each fold only and
        then applied to that fold's held-out rows.
    randomState : int
        Seed used for shuffling the folds.

    Returns
    -------
    (train_scores, test_scores) : tuple of lists
        One ``model.score`` value per fold for the training rows and the
        held-out rows respectively.
    """
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=k, random_state = randomState, shuffle=True)
    train_scores = []
    test_scores = []

    # KFold yields positional indices. Plain [] on a pandas object is
    # label-based (and DataFrame[rows, :] is an error), so use .iloc there.
    X_is_pandas = hasattr(X, "iloc")
    y_is_pandas = hasattr(y, "iloc")

    # Test model on each split
    for idxTrain, idxTest in kf.split(X):
        if X_is_pandas:
            XTrain, XTest = X.iloc[idxTrain, :], X.iloc[idxTest, :]
        else:
            XTrain, XTest = X[idxTrain, :], X[idxTest, :]

        if y_is_pandas:
            yTrain, yTest = y.iloc[idxTrain], y.iloc[idxTest]
        else:
            yTrain, yTest = y[idxTrain], y[idxTest]

        # Apply scaler if one was supplied; fit on the training fold only so
        # nothing from the held-out rows influences the scaling parameters.
        if scaler is not None:
            XTrain = scaler.fit_transform(XTrain)
            XTest = scaler.transform(XTest)

        # Fit model
        model.fit(XTrain, yTrain)

        # Record scores for fitted model
        train_scores.append(model.score(XTrain, yTrain))
        test_scores.append(model.score(XTest, yTest))

    # Return scores for k-fold
    return train_scores, test_scores

In [7]:
# Load the sanitized, label-mapped capture CSV into a DataFrame and show its first rows
data = pd.read_csv(
    filepath_or_buffer="sanitized_data/sanitized_data_2024-04-17_10-31-46-218385_mapped.csv"
)
data.head(n=5)

Unnamed: 0,Website,Time,Length,Protocol,Info
0,0,5.869593,66,1,4
1,0,5.881211,66,1,1
2,0,5.887175,54,1,6
3,0,5.887292,712,0,12
4,0,5.902131,1490,0,8


In [8]:
# Separate data set using test/train split.
# Besides the 'Website' label, 'Unnamed: 0' is left out of the features: in the
# preview of `data` it simply counts rows (presumably the index written by an
# earlier to_csv -- confirm), so as a feature it would let the classifier key on
# row position instead of packet properties. errors='ignore' keeps this cell
# working if the CSV is later re-exported without that column.
XTrain, XTest, yTrain, yTest = tts(
    data.drop(columns = ['Website', 'Unnamed: 0'], errors = 'ignore').values,
    data['Website'].values,
    random_state = 201,
    shuffle = True,
    test_size = 0.20,
)