In [19]:
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [20]:
# Notebook-wide random seed; same value as the default `random_state` of k_cross.
random_state = 40

In [21]:
# Repeated k-fold cross-validation: k folds, repeated n times.
def k_cross(k, n, X, y, random_state=40, model_builder=None):
    """Evaluate a model with k-fold cross-validation repeated n times.

    For every train/validation split a fresh model is built, fitted on the
    training part and scored (F1 and recall) on the validation part.

    Parameters
    ----------
    k : int
        Number of folds, passed to ``RepeatedKFold`` as ``n_splits``.
    n : int
        Number of repetitions, passed to ``RepeatedKFold`` as ``n_repeats``.
    X, y : array-like
        Samples and targets. Both are indexed with the integer index arrays
        produced by the splitter, so they must support that kind of indexing
        (e.g. numpy arrays).
    random_state : int, default 40
        Seed handed to ``RepeatedKFold`` so the splits are reproducible.
    model_builder : callable, optional
        Zero-argument factory returning a new unfitted model exposing
        ``fit(X, y)`` and ``predict(X)``. When omitted, the notebook-level
        ``build_model`` function is used, so existing calls keep working.

    Returns
    -------
    f1_scores, recall_scores : tuple of lists
        One score per split (``k * n`` entries each), in split order. Scores
        are computed with the default settings of ``f1_score`` and
        ``recall_score``.
    """
    if model_builder is None:
        # Looked up at call time, so build_model may be (re)defined in a
        # later cell without re-running this one.
        model_builder = build_model

    f1_scores = []
    recall_scores = []
    # NOTE(review): RepeatedKFold does not stratify by label; with imbalanced
    # classes the per-fold scores can vary a lot.
    rkf = RepeatedKFold(n_splits=k, n_repeats=n, random_state=random_state)
    for train_index, val_index in rkf.split(X):
        X_fit, X_val = X[train_index], X[val_index]
        y_fit, y_val = y[train_index], y[val_index]

        model = model_builder()  # fresh, unfitted model for every split
        model.fit(X_fit, y_fit)  # might need to change this if it's NN

        y_pred = model.predict(X_val)
        f1_scores.append(f1_score(y_val, y_pred))
        recall_scores.append(recall_score(y_val, y_pred))

    return f1_scores, recall_scores

### Example with Logistic regression (transcript)

In [22]:
# Load the sample array and the matching target array that are later passed
# to k_cross as X and y.
X_train, y_train = (
    np.load("./text/X_train.npy"),
    np.load("./text/y_train.npy"),
)

In [23]:
from sklearn import linear_model

In [24]:
def build_model():
    """Return a new, unfitted logistic-regression model.

    Modify this function to try different models or parameters.
    """
    classifier = linear_model.LogisticRegression()
    return classifier

In [30]:
# 5-fold cross-validation, one repetition. The notebook-level seed is passed
# explicitly so that editing `random_state` actually changes the splits
# (previously it was defined but never used by this call).
f1, recall = k_cross(k=5, n=1, X=X_train, y=y_train, random_state=random_state)

In [31]:
# Report the per-fold scores, one metric per line.
for label, scores in (("F1-scores: ", f1), ("Recall scores: ", recall)):
    print(label, scores)

F1-scores:  [0.35294117647058826, 0.45454545454545453, 0.5714285714285713, 0.5263157894736842, 0.380952380952381]
Recall scores:  [0.6, 0.45454545454545453, 0.6, 0.45454545454545453, 0.3076923076923077]
